From 5c8f9a05336cf5cadbd57ad461621b386aadb762 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Thu, 30 Jan 2025 08:38:49 +0300 Subject: arm64: dts: rockchip: Fix broken tsadc pinctrl names for rk3588 The tsadc driver does not handle pinctrl "gpio" and "otpout". Let's use the correct pinctrl names "default" and "sleep". Additionally, Alexey Charkov's testing [1] has established that it is necessary for pinctrl state to reference the &tsadc_shut_org configuration rather than &tsadc_shut for the driver to function correctly. [1] https://lkml.org/lkml/2025/1/24/966 Fixes: 32641b8ab1a5 ("arm64: dts: rockchip: add rk3588 thermal sensor") Cc: stable@vger.kernel.org Reviewed-by: Dragan Simic Signed-off-by: Alexander Shiyan Link: https://lore.kernel.org/r/20250130053849.4902-1-eagle.alexander923@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-base.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index 8cfa30837ce7..978de506d434 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -2668,9 +2668,9 @@ rockchip,hw-tshut-temp = <120000>; rockchip,hw-tshut-mode = <0>; /* tshut mode 0:CRU 1:GPIO */ rockchip,hw-tshut-polarity = <0>; /* tshut polarity 0:LOW 1:HIGH */ - pinctrl-0 = <&tsadc_gpio_func>; - pinctrl-1 = <&tsadc_shut>; - pinctrl-names = "gpio", "otpout"; + pinctrl-0 = <&tsadc_shut_org>; + pinctrl-1 = <&tsadc_gpio_func>; + pinctrl-names = "default", "sleep"; #thermal-sensor-cells = <1>; status = "disabled"; }; -- cgit v1.2.3 From a6a7cba17c544fb95d5a29ab9d9ed4503029cb29 Mon Sep 17 00:00:00 2001 From: Tianling Shen Date: Sun, 19 Jan 2025 17:11:54 +0800 Subject: arm64: dts: rockchip: change eth phy mode to rgmii-id for orangepi r1 plus lts In general the delay should be added by the PHY instead of the MAC, and this improves network stability on some boards which seem to need different delay. 
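For context: the two phy-mode values differ in where the RGMII clock delays are inserted. With "rgmii" the MAC adds the delays, which is why the Rockchip-specific tx_delay/rx_delay properties are present, while with "rgmii-id" the PHY inserts both delays internally and the MAC-side properties are dropped. A condensed sketch of the two resulting &gmac2io board nodes, taken from the hunks below (not the complete nodes):

/* OrangePi R1 Plus (RTL8211E): MAC-side delays stay in the devicetree */
&gmac2io {
	phy-handle = <&rtl8211e>;
	phy-mode = "rgmii";
	tx_delay = <0x24>;
	rx_delay = <0x18>;
	status = "okay";
};

/* OrangePi R1 Plus LTS (YT8531C): the PHY adds both delays internally */
&gmac2io {
	phy-handle = <&yt8531c>;
	phy-mode = "rgmii-id";
	status = "okay";
};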
Fixes: 387b3bbac5ea ("arm64: dts: rockchip: Add Xunlong OrangePi R1 Plus LTS") Cc: stable@vger.kernel.org # 6.6+ Signed-off-by: Tianling Shen Link: https://lore.kernel.org/r/20250119091154.1110762-1-cnsztl@gmail.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts | 3 +-- arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts | 1 + arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts index 67c246ad8b8c..ec2ce894da1f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus-lts.dts @@ -17,8 +17,7 @@ &gmac2io { phy-handle = <&yt8531c>; - tx_delay = <0x19>; - rx_delay = <0x05>; + phy-mode = "rgmii-id"; status = "okay"; mdio { diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts index 324a8e951f7e..846b931e16d2 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dts @@ -15,6 +15,7 @@ &gmac2io { phy-handle = <&rtl8211e>; + phy-mode = "rgmii"; tx_delay = <0x24>; rx_delay = <0x18>; status = "okay"; diff --git a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi index 4f193704e5dc..09508e324a28 100644 --- a/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3328-orangepi-r1-plus.dtsi @@ -109,7 +109,6 @@ assigned-clocks = <&cru SCLK_MAC2IO>, <&cru SCLK_MAC2IO_EXT>; assigned-clock-parents = <&gmac_clk>, <&gmac_clk>; clock_in_out = "input"; - phy-mode = "rgmii"; phy-supply = <&vcc_io>; pinctrl-0 = <&rgmiim1_pins>; pinctrl-names = "default"; -- cgit v1.2.3 From 4eee627ea59304cdd66c5d4194ef13486a6c44fc Mon Sep 17 00:00:00 2001 From: Lukasz Czechowski Date: Tue, 21 Jan 2025 13:56:03 +0100 Subject: arm64: dts: rockchip: Move uart5 pin configuration to px30 ringneck SoM In the PX30-uQ7 (Ringneck) SoM, the hardware CTS and RTS pins for uart5 cannot be used for the UART CTS/RTS, because they are already allocated for different purposes. CTS pin is routed to SUS_S3# signal, while RTS pin is used internally and is not available on Q7 connector. Move definition of the pinctrl-0 property from px30-ringneck-haikou.dts to px30-ringneck.dtsi. This commit is a dependency to next commit in the patch series, that disables DMA for uart5. 
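The end result of this move, assembled from the hunks below into a condensed sketch (not the full files), is that the SoM dtsi owns the pin configuration while the carrier board only enables the port and supplies its GPIO-based RTS line:

/* px30-ringneck.dtsi (SoM): only TX/RX are routed, no hardware CTS/RTS */
&uart5 {
	pinctrl-0 = <&uart5_xfer>;
};

/* px30-ringneck-haikou.dts (carrier board) */
&uart5 {
	rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>;
	status = "okay";
};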
Cc: stable@vger.kernel.org Reviewed-by: Quentin Schulz Signed-off-by: Lukasz Czechowski Link: https://lore.kernel.org/r/20250121125604.3115235-2-lukasz.czechowski@thaumatec.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts | 1 - arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts index e4517f47d519..eb9470a00e54 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck-haikou.dts @@ -226,7 +226,6 @@ }; &uart5 { - pinctrl-0 = <&uart5_xfer>; rts-gpios = <&gpio0 RK_PB5 GPIO_ACTIVE_HIGH>; status = "okay"; }; diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi index ae050cc6cd05..2c87005c89bd 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi @@ -396,6 +396,10 @@ status = "okay"; }; +&uart5 { + pinctrl-0 = <&uart5_xfer>; +}; + /* Mule UCAN */ &usb_host0_ehci { status = "okay"; -- cgit v1.2.3 From 5ae4dca718eacd0a56173a687a3736eb7e627c77 Mon Sep 17 00:00:00 2001 From: Lukasz Czechowski Date: Tue, 21 Jan 2025 13:56:04 +0100 Subject: arm64: dts: rockchip: Disable DMA for uart5 on px30-ringneck MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UART controllers without flow control seem to behave unstable in case DMA is enabled. The issues were indicated in the message: https://lore.kernel.org/linux-arm-kernel/CAMdYzYpXtMocCtCpZLU_xuWmOp2Ja_v0Aj0e6YFNRA-yV7u14g@mail.gmail.com/ In case of PX30-uQ7 Ringneck SoM, it was noticed that after couple of hours of UART communication, the CPU stall was occurring, leading to the system becoming unresponsive. After disabling the DMA, extensive UART communication tests for up to two weeks were performed, and no issues were further observed. The flow control pins for uart5 are not available on PX30-uQ7 Ringneck, as configured by pinctrl-0, so the DMA nodes were removed on SoM dtsi. Cc: stable@vger.kernel.org Fixes: c484cf93f61b ("arm64: dts: rockchip: add PX30-µQ7 (Ringneck) SoM with Haikou baseboard") Reviewed-by: Quentin Schulz Signed-off-by: Lukasz Czechowski Link: https://lore.kernel.org/r/20250121125604.3115235-3-lukasz.czechowski@thaumatec.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi index 2c87005c89bd..e80412abec08 100644 --- a/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi +++ b/arch/arm64/boot/dts/rockchip/px30-ringneck.dtsi @@ -397,6 +397,8 @@ }; &uart5 { + /delete-property/ dmas; + /delete-property/ dma-names; pinctrl-0 = <&uart5_xfer>; }; -- cgit v1.2.3 From 2f9eb5262e63396a315c7da34a6c80c5d335df9f Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Thu, 16 Jan 2025 15:36:31 +0100 Subject: arm64: dts: rockchip: fix fixed-regulator renames on rk3399-gru devices rk3399-gru chromebooks have a regulator chains where one named regulator supplies multiple regulators pp900-usb pp900_pcie that supply the named peripherals. The dtsi used somewhat creative structure to describe that in creating the base node 3 times with different phandles and describing the EC dependency in a comment. 
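Concretely, the pattern looks roughly like this, abridged from the chromebook dtsi hunks below (the fully populated base regulator node lives elsewhere in the gru dtsi files):

/* EC turns on w/ pp900_usb_en */
pp900_usb: pp900-ap {
};

/* EC turns on w/ pp900_pcie_en */
pp900_pcie: pp900-ap {
};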
This didn't register in the recent regulator-node renaming, as the additional nodes were empty, so adapt the missing node names for now. Fixes: 5c96e6330197 ("arm64: dts: rockchip: adapt regulator nodenames to preferred form") Tested-by: Vicente Bergas Signed-off-by: Heiko Stuebner Link: https://lore.kernel.org/r/20250116143631.3650469-1-heiko@sntech.de --- .../boot/dts/rockchip/rk3399-gru-chromebook.dtsi | 8 ++++---- .../boot/dts/rockchip/rk3399-gru-scarlet.dtsi | 6 +++--- arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi index 988e6ca32fac..a9ea4b0daa04 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-chromebook.dtsi @@ -22,11 +22,11 @@ }; /* EC turns on w/ pp900_usb_en */ - pp900_usb: pp900-ap { + pp900_usb: regulator-pp900-ap { }; /* EC turns on w/ pp900_pcie_en */ - pp900_pcie: pp900-ap { + pp900_pcie: regulator-pp900-ap { }; pp3000: regulator-pp3000 { @@ -126,7 +126,7 @@ }; /* Always on; plain and simple */ - pp3000_ap: pp3000_emmc: pp3000 { + pp3000_ap: pp3000_emmc: regulator-pp3000 { }; pp1500_ap_io: regulator-pp1500-ap-io { @@ -160,7 +160,7 @@ }; /* EC turns on w/ pp3300_usb_en_l */ - pp3300_usb: pp3300 { + pp3300_usb: regulator-pp3300 { }; /* gpio is shared with pp1800_pcie and pinctrl is set there */ diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi index 19b23b438965..5e068377a0a2 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru-scarlet.dtsi @@ -92,7 +92,7 @@ }; /* EC turns on pp1800_s3_en */ - pp1800_s3: pp1800 { + pp1800_s3: regulator-pp1800 { }; /* pp3300 children, sorted by name */ @@ -109,11 +109,11 @@ }; /* EC turns on pp3300_s0_en */ - pp3300_s0: pp3300 { + pp3300_s0: regulator-pp3300 { }; /* EC turns on pp3300_s3_en */ - pp3300_s3: pp3300 { + pp3300_s3: regulator-pp3300 { }; /* diff --git a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi index 6d9e60b01225..7eca1da78cff 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi @@ -189,39 +189,39 @@ }; /* EC turns on w/ pp900_ddrpll_en */ - pp900_ddrpll: pp900-ap { + pp900_ddrpll: regulator-pp900-ap { }; /* EC turns on w/ pp900_pll_en */ - pp900_pll: pp900-ap { + pp900_pll: regulator-pp900-ap { }; /* EC turns on w/ pp900_pmu_en */ - pp900_pmu: pp900-ap { + pp900_pmu: regulator-pp900-ap { }; /* EC turns on w/ pp1800_s0_en_l */ - pp1800_ap_io: pp1800_emmc: pp1800_nfc: pp1800_s0: pp1800 { + pp1800_ap_io: pp1800_emmc: pp1800_nfc: pp1800_s0: regulator-pp1800 { }; /* EC turns on w/ pp1800_avdd_en_l */ - pp1800_avdd: pp1800 { + pp1800_avdd: regulator-pp1800 { }; /* EC turns on w/ pp1800_lid_en_l */ - pp1800_lid: pp1800_mic: pp1800 { + pp1800_lid: pp1800_mic: regulator-pp1800 { }; /* EC turns on w/ lpddr_pwr_en */ - pp1800_lpddr: pp1800 { + pp1800_lpddr: regulator-pp1800 { }; /* EC turns on w/ pp1800_pmu_en_l */ - pp1800_pmu: pp1800 { + pp1800_pmu: regulator-pp1800 { }; /* EC turns on w/ pp1800_usb_en_l */ - pp1800_usb: pp1800 { + pp1800_usb: regulator-pp1800 { }; pp3000_sd_slot: regulator-pp3000-sd-slot { @@ -259,11 +259,11 @@ }; /* EC turns on w/ pp3300_trackpad_en_l */ - pp3300_trackpad: pp3300-trackpad { + 
pp3300_trackpad: regulator-pp3300-trackpad { }; /* EC turns on w/ usb_a_en */ - pp5000_usb_a_vbus: pp5000 { + pp5000_usb_a_vbus: regulator-pp5000 { }; ap_rtc_clk: ap-rtc-clk { -- cgit v1.2.3 From a1d939055a22be06d8c12bf53afb258b9d38575f Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Mon, 13 Jan 2025 18:47:34 +0800 Subject: arm64: dts: rockchip: Fix lcdpwr_en pin for Cool Pi GenBook According to the schematic, the lcdpwr_en pin is GPIO0_C4, not GPIO1_C4. Fixes: 4a8c1161b843 ("arm64: dts: rockchip: Add support for rk3588 based Cool Pi CM5 GenBook") Signed-off-by: Andy Yan Link: https://lore.kernel.org/r/20250113104825.2390427-1-andyshrk@163.com Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts index 92f0ed83c990..bc6b43a77153 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts +++ b/arch/arm64/boot/dts/rockchip/rk3588-coolpi-cm5-genbook.dts @@ -113,7 +113,7 @@ compatible = "regulator-fixed"; regulator-name = "vcc3v3_lcd"; enable-active-high; - gpio = <&gpio1 RK_PC4 GPIO_ACTIVE_HIGH>; + gpio = <&gpio0 RK_PC4 GPIO_ACTIVE_HIGH>; pinctrl-names = "default"; pinctrl-0 = <&lcdpwr_en>; vin-supply = <&vcc3v3_sys>; @@ -241,7 +241,7 @@ &pinctrl { lcd { lcdpwr_en: lcdpwr-en { - rockchip,pins = <1 RK_PC4 RK_FUNC_GPIO &pcfg_pull_down>; + rockchip,pins = <0 RK_PC4 RK_FUNC_GPIO &pcfg_pull_down>; }; bl_en: bl-en { -- cgit v1.2.3 From 33ea120582a638b2f2e380a50686c2b1d7cce795 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:25 +0200 Subject: x86/mm/pat: cpa-test: fix length for CPA_ARRAY test The CPA_ARRAY test always uses len[1] as numpages argument to change_page_attr_set() although the addresses array is different each iteration of the test loop. Replace len[1] with len[i] to have numpages matching the addresses array. Fixes: ecc729f1f471 ("x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests") Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-2-rppt@kernel.org --- arch/x86/mm/pat/cpa-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/pat/cpa-test.c b/arch/x86/mm/pat/cpa-test.c index 3d2f7f0a6ed1..ad3c1feec990 100644 --- a/arch/x86/mm/pat/cpa-test.c +++ b/arch/x86/mm/pat/cpa-test.c @@ -183,7 +183,7 @@ static int pageattr_test(void) break; case 1: - err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1); break; case 2: -- cgit v1.2.3 From 4ee788eb0781ba082709c1ac1d5146ebcc40b967 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:26 +0200 Subject: x86/mm/pat: drop duplicate variable in cpa_flush() There is a 'struct cpa_data *data' parameter in cpa_flush() that is assigned to a local 'struct cpa_data *cpa' variable. Rename the parameter from 'data' to 'cpa' and drop declaration of the local 'cpa' variable. 
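In plain form, a minimal before/after sketch of the pattern this cleanup removes (function body elided):

/* before: the parameter is immediately shadowed by a local alias */
static void cpa_flush(struct cpa_data *data, int cache)
{
	struct cpa_data *cpa = data;
	/* ... uses cpa ... */
}

/* after: the parameter itself carries the conventional name */
static void cpa_flush(struct cpa_data *cpa, int cache)
{
	/* ... uses cpa ... */
}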
Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-3-rppt@kernel.org --- arch/x86/mm/pat/set_memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index ef4514d64c05..1f7698caa6f7 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -394,9 +394,8 @@ static void __cpa_flush_tlb(void *data) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } -static void cpa_flush(struct cpa_data *data, int cache) +static void cpa_flush(struct cpa_data *cpa, int cache) { - struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); -- cgit v1.2.3 From 41d88484c71cd4f659348da41b7b5b3dbd3be1f6 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sun, 26 Jan 2025 09:47:27 +0200 Subject: x86/mm/pat: restore large ROX pages after fragmentation Change of attributes of the pages may lead to fragmentation of direct mapping over time and performance degradation when these pages contain executable code. With current code it's one way road: kernel tries to avoid splitting large pages, but it doesn't restore them back even if page attributes got compatible again. Any change to the mapping may potentially allow to restore large page. Add a hook to cpa_flush() path that will check if the pages in the range that were just touched can be mapped at PMD level. If the collapse at the PMD level succeeded, also attempt to collapse PUD level. The collapse logic runs only when a set_memory_ method explicitly sets CPA_COLLAPSE flag, for now this is only enabled in set_memory_rox(). CPUs don't like[1] to have to have TLB entries of different size for the same memory, but looks like it's okay as long as these entries have matching attributes[2]. Therefore it's critical to flush TLB before any following changes to the mapping. Note that we already allow for multiple TLB entries of different sizes for the same memory now in split_large_page() path. It's not a new situation. set_memory_4k() provides a way to use 4k pages on purpose. Kernel must not remap such pages as large. Re-use one of software PTE bits to indicate such pages. [1] See Erratum 383 of AMD Family 10h Processors [2] https://lore.kernel.org/linux-mm/1da1b025-cabc-6f04-bde5-e50830d1ecf0@amd.com/ [rppt@kernel.org: * s/restore/collapse/ * update formatting per peterz * use 'struct ptdesc' instead of 'struct page' for list of page tables to be freed * try to collapse PMD first and if it succeeds move on to PUD as peterz suggested * flush TLB twice: for changes done in the original CPA call and after collapsing of large pages * update commit message ] Signed-off-by: "Kirill A. 
Shutemov" Co-developed-by: "Mike Rapoport (Microsoft)" Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-4-rppt@kernel.org --- arch/x86/include/asm/pgtable_types.h | 2 + arch/x86/mm/pat/set_memory.c | 217 ++++++++++++++++++++++++++++++++++- 2 files changed, 215 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4b804531b03c..c90e9c51edb7 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -33,6 +33,7 @@ #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ +#define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 #ifdef CONFIG_X86_64 @@ -64,6 +65,7 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1) diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 1f7698caa6f7..7bd0f62ba48f 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -73,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +#define CPA_COLLAPSE 16 /* try to collapse large pages */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { @@ -105,6 +106,18 @@ static void split_page_count(int level) direct_pages_count[level - 1] += PTRS_PER_PTE; } +static void collapse_page_count(int level) +{ + direct_pages_count[level]++; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); + } + direct_pages_count[level - 1] -= PTRS_PER_PTE; +} + void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", @@ -122,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m) } #else static inline void split_page_count(int level) { } +static inline void collapse_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS @@ -394,6 +408,40 @@ static void __cpa_flush_tlb(void *data) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); + +static void cpa_collapse_large_pages(struct cpa_data *cpa) +{ + unsigned long start, addr, end; + struct ptdesc *ptdesc, *tmp; + LIST_HEAD(pgtables); + int collapsed = 0; + int i; + + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + for (i = 0; i < cpa->numpages; i++) + collapsed += collapse_large_pages(__cpa_addr(cpa, i), + &pgtables); + } else { + addr = __cpa_addr(cpa, 0); + start = addr & PMD_MASK; + end = addr + PAGE_SIZE * cpa->numpages; + + for (addr = start; within(addr, start, end); addr += PMD_SIZE) + collapsed += collapse_large_pages(addr, &pgtables); + } + + if (!collapsed) + return; + + flush_tlb_all(); + + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { + 
list_del(&ptdesc->pt_list); + __free_page(ptdesc_page(ptdesc)); + } +} + static void cpa_flush(struct cpa_data *cpa, int cache) { unsigned int i; @@ -402,7 +450,7 @@ static void cpa_flush(struct cpa_data *cpa, int cache) if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + goto collapse_large_pages; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) @@ -411,7 +459,7 @@ static void cpa_flush(struct cpa_data *cpa, int cache) on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) - return; + goto collapse_large_pages; mb(); for (i = 0; i < cpa->numpages; i++) { @@ -427,6 +475,10 @@ static void cpa_flush(struct cpa_data *cpa, int cache) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); + +collapse_large_pages: + if (cpa->flags & CPA_COLLAPSE) + cpa_collapse_large_pages(cpa); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, @@ -1196,6 +1248,161 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } +static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, + struct list_head *pgtables) +{ + pmd_t _pmd, old_pmd; + pte_t *pte, first; + unsigned long pfn; + pgprot_t pgprot; + int i = 0; + + addr &= PMD_MASK; + pte = pte_offset_kernel(pmd, addr); + first = *pte; + pfn = pte_pfn(first); + + /* Make sure alignment is suitable */ + if (PFN_PHYS(pfn) & ~PMD_MASK) + return 0; + + /* The page is 4k intentionally */ + if (pte_flags(first) & _PAGE_KERNEL_4K) + return 0; + + /* Check that the rest of PTEs are compatible with the first one */ + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { + pte_t entry = *pte; + + if (!pte_present(entry)) + return 0; + if (pte_flags(entry) != pte_flags(first)) + return 0; + if (pte_pfn(entry) != pte_pfn(first) + i) + return 0; + } + + old_pmd = *pmd; + + /* Success: set up a large page */ + pgprot = pgprot_4k_2_large(pte_pgprot(first)); + pgprot_val(pgprot) |= _PAGE_PSE; + _pmd = pfn_pmd(pfn, pgprot); + set_pmd(pmd, _pmd); + + /* Queue the page table to be freed after TLB flush */ + list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); + + if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { + struct page *page; + + /* Update all PGD tables to use the same large page */ + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + /* Something is wrong if entries doesn't match */ + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) + continue; + set_pmd(pmd, _pmd); + } + } + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_2M); + + return 1; +} + +static int collapse_pud_page(pud_t *pud, unsigned long addr, + struct list_head *pgtables) +{ + unsigned long pfn; + pmd_t *pmd, first; + int i; + + if (!direct_gbpages) + return 0; + + addr &= PUD_MASK; + pmd = pmd_offset(pud, addr); + first = *pmd; + + /* + * To restore PUD page all PMD entries must be large and + * have suitable alignment + */ + pfn = pmd_pfn(first); + if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) + return 0; + + /* + * To restore PUD page, all following PMDs must be compatible with the + * first one. 
+ */ + for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { + pmd_t entry = *pmd; + + if (!pmd_present(entry) || !pmd_leaf(entry)) + return 0; + if (pmd_flags(entry) != pmd_flags(first)) + return 0; + if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) + return 0; + } + + /* Restore PUD page and queue page table to be freed after TLB flush */ + list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); + set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_1G); + + return 1; +} + +/* + * Collapse PMD and PUD pages in the kernel mapping around the address where + * possible. + * + * Caller must flush TLB and free page tables queued on the list before + * touching the new entries. CPU must not see TLB entries of different size + * with different attributes. + */ +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) +{ + int collapsed = 0; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + addr &= PMD_MASK; + + spin_lock(&pgd_lock); + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + goto out; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + goto out; + pud = pud_offset(p4d, addr); + if (!pud_present(*pud) || pud_leaf(*pud)) + goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) + goto out; + + collapsed = collapse_pmd_page(pmd, addr, pgtables); + if (collapsed) + collapsed += collapse_pud_page(pud, addr, pgtables); + +out: + spin_unlock(&pgd_lock); + return collapsed; +} + static bool try_to_free_pte_page(pte_t *pte) { int i; @@ -2119,7 +2326,8 @@ int set_memory_rox(unsigned long addr, int numpages) if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; - return change_page_attr_clear(&addr, numpages, clr, 0); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, + CPA_COLLAPSE, NULL); } int set_memory_rw(unsigned long addr, int numpages) @@ -2146,7 +2354,8 @@ int set_memory_p(unsigned long addr, int numpages) int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + return change_page_attr_set_clr(&addr, numpages, + __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL); } -- cgit v1.2.3 From 1d7e707af446134dd272ea8a89018c63cc17bb6a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:31 +0200 Subject: Revert "x86/module: prepare module loading for ROX allocations of text" The module code does not create a writable copy of the executable memory anymore so there is no need to handle it in module relocation and alternatives patching. This reverts commit 9bfc4824fd4836c16bb44f922bfaffba5da3e4f3. 
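The shape of the revert is easiest to see on one of the patching helpers; a condensed sketch of the pattern undone throughout the diff below (instruction decode and length handling elided): with the ROX copy gone, the helpers patch the final text address directly instead of first resolving a separate writable alias via module_writable_address().

/* before the revert: patch through a writable alias of the module text */
void apply_retpolines(s32 *start, s32 *end, struct module *mod)
{
	for (s32 *s = start; s < end; s++) {
		void *addr = (void *)s + *s;
		void *wr_addr = module_writable_address(mod, addr);

		/* ... decode instruction at wr_addr, build replacement bytes ... */
		text_poke_early(wr_addr, bytes, len);
	}
}

/* after the revert: the text itself is writable at patching time */
void apply_retpolines(s32 *start, s32 *end)
{
	for (s32 *s = start; s < end; s++) {
		void *addr = (void *)s + *s;

		/* ... decode instruction at addr, build replacement bytes ... */
		text_poke_early(addr, bytes, len);
	}
}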
Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-8-rppt@kernel.org --- arch/um/kernel/um_arch.c | 11 +-- arch/x86/entry/vdso/vma.c | 3 +- arch/x86/include/asm/alternative.h | 14 +-- arch/x86/kernel/alternative.c | 181 ++++++++++++++++--------------------- arch/x86/kernel/ftrace.c | 30 +++--- arch/x86/kernel/module.c | 45 +++------ 6 files changed, 117 insertions(+), 167 deletions(-) (limited to 'arch') diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797e..8be91974e786 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -440,25 +440,24 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void apply_seal_endbr(s32 *start, s32 *end) { } -void apply_retpolines(s32 *start, s32 *end, struct module *mod) +void apply_retpolines(s32 *start, s32 *end) { } -void apply_returns(s32 *start, s32 *end, struct module *mod) +void apply_returns(s32 *start, s32 *end) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) { } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 39e6efc1a9ca..bfc7cabf4017 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -48,8 +48,7 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len), - NULL); + image->alt_len)); return 0; } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e3903b731305..a2141665239b 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -87,16 +87,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; -struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, - struct module *mod); -extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); -extern void apply_returns(s32 *start, s32 *end, struct module *mod); -extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_retpolines(s32 *start, s32 *end); +extern void apply_returns(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi, struct module *mod); + s32 *start_cfi, s32 *end_cfi); + +struct module; struct callthunk_sites { s32 *call_start, *call_end; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c71b575bf229..8b66a555d2f0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,10 +392,8 @@ EXPORT_SYMBOL(BUG_func); * Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". 
*/ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, - struct module *mod) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) { - u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -405,14 +403,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, } if (a->instrlen != 6 || - wr_instr[0] != CALL_RIP_REL_OPCODE || - wr_instr[1] != CALL_RIP_REL_MODRM) { + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(wr_instr + 2); + disp = *(s32 *)(instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -450,8 +448,7 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end, - struct module *mod) + struct alt_instr *end) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -480,7 +477,6 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -494,11 +490,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); - wr_instr = module_writable_address(mod, instr); - replacement = (u8 *)&a->repl_offset + a->repl_offset; - wr_replacement = module_writable_address(mod, replacement); - BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -509,9 +501,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. */ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, wr_instr, a->instrlen); + memcpy(insn_buff, instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(wr_instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); continue; } @@ -521,12 +513,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, wr_replacement, a->replacementlen); + memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a, - mod); + insn_buff_sz = alt_replace_call(instr, insn_buff, a); if (insn_buff_sz < 0) continue; } @@ -536,11 +527,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(wr_instr, insn_buff, insn_buff_sz); + text_poke_early(instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -731,20 +722,18 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. 
*/ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -772,9 +761,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } @@ -810,8 +799,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; @@ -820,13 +808,12 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, wr_addr); + ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -846,35 +833,32 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end, len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(wr_addr, bytes, len); + text_poke_early(addr, bytes, len); } } } #else -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, - struct module *mod) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end, - struct module *mod) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr); +static void poison_cfi(void *addr); -static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) +static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); - if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) return; if (!is_endbr(endbr)) { @@ -889,7 +873,7 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(wr_addr, &poison, 4); + text_poke_early(addr, &poison, 4); } /* @@ -898,23 +882,22 @@ static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) * Seal the functions 
for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, wr_addr, true); + poison_endbr(addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16, wr_addr - 16); + poison_cfi(addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1136,7 +1119,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_disable_callers(s32 *start, s32 *end) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1148,23 +1131,20 @@ static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); - + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, jmp, 2); + text_poke_early(addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_enable_callers(s32 *start, s32 *end) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. @@ -1174,115 +1154,106 @@ static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(wr_addr, mov, 2); + text_poke_early(addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(wr_addr + 1, &hash, 4); + text_poke_early(addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_preamble(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(wr_addr); + hash = decode_preamble_hash(addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); } return 0; } 
-static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) +static void cfi_rewrite_endbr(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr + 16, wr_addr + 16, false); + poison_endbr(addr+16, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rand_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(wr_addr + 2, &hash, 4); + text_poke_early(addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) +static int cfi_rewrite_callers(s32 *start, s32 *end) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; - void *wr_addr; u32 hash; addr -= fineibt_caller_size; - wr_addr = module_writable_address(mod, addr); - hash = decode_caller_hash(wr_addr); + hash = decode_caller_hash(addr); if (hash) { - text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1291,9 +1262,8 @@ static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { - bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1311,7 +1281,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. 
*/ - ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_disable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1322,11 +1292,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi, mod); + ret = cfi_rand_preamble(start_cfi, end_cfi); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rand_callers(start_retpoline, end_retpoline); if (ret) goto err; } @@ -1338,7 +1308,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); + ret = cfi_enable_callers(start_retpoline, end_retpoline); if (ret) goto err; @@ -1348,17 +1318,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); + ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi, mod); + cfi_rewrite_endbr(start_cfi, end_cfi); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1377,7 +1347,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr, void *wr_addr) +static void poison_cfi(void *addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1389,8 +1359,8 @@ static void poison_cfi(void *addr, void *wr_addr) * ud2 * 1: nop */ - poison_endbr(addr, wr_addr, false); - poison_hash(wr_addr + fineibt_preamble_hash); + poison_endbr(addr, false); + poison_hash(addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1399,7 +1369,7 @@ static void poison_cfi(void *addr, void *wr_addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(wr_addr + 1); + poison_hash(addr + 1); break; default: @@ -1410,21 +1380,22 @@ static void poison_cfi(void *addr, void *wr_addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi, bool builtin) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr, void *wr_addr) { } +static void poison_cfi(void *addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, struct module *mod) + s32 *start_cfi, s32 *end_cfi) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, mod); + start_cfi, end_cfi, + /* .builtin = */ false); } #ifdef CONFIG_SMP @@ -1721,16 +1692,16 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, NULL); + __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. 
*/ - apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); - apply_returns(__return_sites, __return_sites_end, NULL); + apply_retpolines(__retpoline_sites, __retpoline_sites_end); + apply_returns(__return_sites, __return_sites_end); - apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); + apply_alternatives(__alt_instructions, __alt_instructions_end); /* * Now all calls are established. Apply the call thunks if @@ -1741,7 +1712,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 166bc0ea3bdf..cace6e8d7cc7 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,13 +118,10 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) { + if (ftrace_poke_late) text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - } else { - mutex_lock(&text_mutex); - text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); - mutex_unlock(&text_mutex); - } + else + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; } @@ -321,7 +318,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - void *ret; + int ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -352,15 +349,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = text_poke_copy(trampoline, (void *)start_offset, size); - if (WARN_ON(!ret)) + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); + if (WARN_ON(ret < 0)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - text_poke_copy(ip, retq, sizeof(retq)); + memcpy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -368,7 +365,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - if (!text_poke_copy(ip, x86_nops[2], 2)) + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); + if (ret < 0) goto fail; } @@ -381,7 +379,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - text_poke_copy(ptr, &ops, sizeof(unsigned long)); + *ptr = (unsigned long)ops; op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -397,7 +395,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -407,9 +405,9 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. 
*/ dest = ftrace_ops_get_func(ops); - text_poke_copy_locked(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE, false); + memcpy(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8984abd91c00..837450b6e882 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -146,21 +146,18 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - void *wr_loc = module_writable_address(me, loc); - - if (memcmp(wr_loc, &zero, size)) { + if (memcmp(loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(wr_loc, &val, size); + write(loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -227,7 +224,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, + const Elf_Shdr *s, *alt = NULL, *locks = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -236,6 +233,8 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -266,20 +265,20 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size, me); + apply_retpolines(rseg, rseg + retpolines->sh_size); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size, me); + apply_returns(rseg, rseg + returns->sh_size); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size, me); + apply_alternatives(aseg, aseg + alt->sh_size); } if (calls || alt) { struct callthunk_sites cs = {}; @@ -298,28 +297,8 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } - - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - - return 0; -} - -int module_post_finalize(const Elf_Ehdr *hdr, - const Elf_Shdr *sechdrs, - struct module *me) -{ - const Elf_Shdr *s, *locks = NULL; - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; - } - if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -329,6 
+308,10 @@ int module_post_finalize(const Elf_Ehdr *hdr, text, text_end); } + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } -- cgit v1.2.3 From 64f6a4e10c05ed527f0f24b7954964255e0d3535 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sun, 26 Jan 2025 09:47:33 +0200 Subject: x86: re-enable EXECMEM_ROX support after rework of execmem ROX caches Signed-off-by: "Mike Rapoport (Microsoft)" Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250126074733.1384926-10-rppt@kernel.org --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 87198d957e2f..6df7779ed6da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -85,6 +85,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_EXECMEM_ROX if X86_64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL -- cgit v1.2.3 From 3ef938c3503563bfc2ac15083557f880d29c2e64 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 3 Jan 2025 19:39:38 +0100 Subject: x86/mm: Fix flush_tlb_range() when used for zapping normal PMDs On the following path, flush_tlb_range() can be used for zapping normal PMD entries (PMD entries that point to page tables) together with the PTE entries in the pointed-to page table: collapse_pte_mapped_thp pmdp_collapse_flush flush_tlb_range The arm64 version of flush_tlb_range() has a comment describing that it can be used for page table removal, and does not use any last-level invalidation optimizations. Fix the X86 version by making it behave the same way. Currently, X86 only uses this information for the following two purposes, which I think means the issue doesn't have much impact: - In native_flush_tlb_multi() for checking if lazy TLB CPUs need to be IPI'd to avoid issues with speculative page table walks. - In Hyper-V TLB paravirtualization, again for lazy TLB stuff. The patch "x86/mm: only invalidate final translations with INVLPGB" which is currently under review (see ) would probably be making the impact of this a lot worse. Fixes: 016c4d92cd16 ("x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range") Signed-off-by: Jann Horn Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250103-x86-collapse-flush-fix-v1-1-3c521856cfa6@google.com --- arch/x86/include/asm/tlbflush.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 02fc2aa06e9e..3da645139748 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -242,7 +242,7 @@ void flush_tlb_multi(const struct cpumask *cpumask, flush_tlb_mm_range((vma)->vm_mm, start, end, \ ((vma)->vm_flags & VM_HUGETLB) \ ? huge_page_shift(hstate_vma(vma)) \ - : PAGE_SHIFT, false) + : PAGE_SHIFT, true) extern void flush_tlb_all(void); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, -- cgit v1.2.3 From 1e66d6cf888fd206a89b8c476b1b28b63faf7fd6 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Mon, 7 Oct 2024 09:57:01 -0700 Subject: x86/cpu: Fix #define name for Intel CPU model 0x5A This CPU was mistakenly given the name INTEL_ATOM_AIRMONT_MID. But it uses a Silvermont core, not Airmont. 
Change #define name to INTEL_ATOM_SILVERMONT_MID2 Reported-by: Christian Ludloff Signed-off-by: Tony Luck Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20241007165701.19693-1-tony.luck%40intel.com --- arch/x86/events/intel/core.c | 2 +- arch/x86/include/asm/intel-family.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/tsc_msr.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7601196d1d18..89880540ab43 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6622,7 +6622,7 @@ __init int intel_pmu_init(void) case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_SILVERMONT_MID: case INTEL_ATOM_AIRMONT: - case INTEL_ATOM_AIRMONT_MID: + case INTEL_ATOM_SILVERMONT_MID2: memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 6d7b04ffc5fd..8359113e3e58 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -149,9 +149,9 @@ #define INTEL_ATOM_SILVERMONT IFM(6, 0x37) /* Bay Trail, Valleyview */ #define INTEL_ATOM_SILVERMONT_D IFM(6, 0x4D) /* Avaton, Rangely */ #define INTEL_ATOM_SILVERMONT_MID IFM(6, 0x4A) /* Merriefield */ +#define INTEL_ATOM_SILVERMONT_MID2 IFM(6, 0x5A) /* Anniedale */ #define INTEL_ATOM_AIRMONT IFM(6, 0x4C) /* Cherry Trail, Braswell */ -#define INTEL_ATOM_AIRMONT_MID IFM(6, 0x5A) /* Moorefield */ #define INTEL_ATOM_AIRMONT_NP IFM(6, 0x75) /* Lightning Mountain */ #define INTEL_ATOM_GOLDMONT IFM(6, 0x5C) /* Apollo Lake */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7cce91b19fb2..76598a93a8fa 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1164,7 +1164,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(INTEL_CORE_YONAH, NO_SSB), - VULNWL_INTEL(INTEL_ATOM_AIRMONT_MID, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), + VULNWL_INTEL(INTEL_ATOM_SILVERMONT_MID2,NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | MSBDS_ONLY), VULNWL_INTEL(INTEL_ATOM_AIRMONT_NP, NO_SSB | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(INTEL_ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_MMIO), diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index deeb02825670..48e6cc1cb017 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -152,7 +152,7 @@ static const struct x86_cpu_id tsc_msr_cpu_ids[] = { X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &freq_desc_byt), X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, &freq_desc_tng), X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &freq_desc_cht), - X86_MATCH_VFM(INTEL_ATOM_AIRMONT_MID, &freq_desc_ann), + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID2, &freq_desc_ann), X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &freq_desc_lgm), {} }; -- cgit v1.2.3 From a7dd183f0b3848c056bbeed78ef5d5c52fe94d83 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:08 +0200 Subject: x86/smp: Allow calling mwait_play_dead with an arbitrary hint Introduce a helper function to allow offlined CPUs to enter idle states with a specific MWAIT hint. The new helper will be used in subsequent patches by the acpi_idle and intel_idle drivers. No functional change intended. Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Reviewed-by: Gautham R. 
Shenoy Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-2-artem.bityutskiy%40linux.intel.com --- arch/x86/include/asm/smp.h | 3 ++ arch/x86/kernel/smpboot.c | 88 +++++++++++++++++++++++++--------------------- 2 files changed, 50 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ca073f40698f..80f8bfd83fc7 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); +void mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); @@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) { return (struct cpumask *)cpumask_of(0); } + +static inline void mwait_play_dead(unsigned int eax_hint) { } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c10850ae6f09..8aad14e43f54 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1258,47 +1258,9 @@ void play_dead_common(void) local_irq_disable(); } -/* - * We need to flush the caches before going to sleep, lest we have - * dirty data in our caches when we come back up. - */ -static inline void mwait_play_dead(void) +void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } /* Set up state for the kexec() hack below */ md->status = CPUDEAD_MWAIT_WAIT; @@ -1319,7 +1281,7 @@ static inline void mwait_play_dead(void) mb(); __monitor(md, 0, 0); mb(); - __mwait(eax, 0); + __mwait(eax_hint, 0); if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) { /* @@ -1341,6 +1303,50 @@ static inline void mwait_play_dead(void) } } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead_cpuid_hint(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + return; + if (!this_cpu_has(X86_FEATURE_MWAIT)) + return; + if (!this_cpu_has(X86_FEATURE_CLFLUSH)) + return; + + eax = CPUID_LEAF_MWAIT; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. 
+ */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + mwait_play_dead(eax); +} + /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). @@ -1391,7 +1397,7 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead(); + mwait_play_dead_cpuid_hint(); if (cpuidle_play_dead()) hlt_play_dead(); } -- cgit v1.2.3 From 541ddf31e30022b8e6f44b3a943964e8f0989d15 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:09 +0200 Subject: ACPI/processor_idle: Add FFH state handling Recent Intel platforms will depend on the idle driver to pass the correct hint for playing dead via mwait_play_dead_with_hint(). Expand the existing enter_dead interface with handling for FFH states and pass the MWAIT hint to the mwait_play_dead code. Suggested-by: Gautham R. Shenoy Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-3-artem.bityutskiy%40linux.intel.com --- arch/x86/kernel/acpi/cstate.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5854f0b8f0f1..5bdb65516969 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Initialize bm_flags based on the CPU cache properties @@ -205,6 +206,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); +void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +{ + unsigned int cpu = smp_processor_id(); + struct cstate_entry *percpu_entry; + + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); + mwait_play_dead(percpu_entry->states[cx->index].eax); +} + void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); -- cgit v1.2.3 From 96040f7273e2bc0be1871ad9ed4da7b504da9410 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Wed, 5 Feb 2025 17:52:11 +0200 Subject: x86/smp: Eliminate mwait_play_dead_cpuid_hint() Currently, mwait_play_dead_cpuid_hint() looks up the MWAIT hint of the deepest idle state by inspecting CPUID leaf 0x5 with the assumption that, if the number of sub-states for a given major C-state is nonzero, those sub-states are always represented by consecutive numbers starting from 0. This assumption is not based on the documented platform behavior and in fact it is not met on recent Intel platforms. For example, Intel's Sierra Forest report two C-states with two substates each in cpuid leaf 0x5: Name* target cstate target subcstate (mwait hint) =========================================================== C1 0x00 0x00 C1E 0x00 0x01 -- 0x10 ---- C6S 0x20 0x22 C6P 0x20 0x23 -- 0x30 ---- /* No more (sub)states all the way down to the end. */ =========================================================== * Names of the cstates are not included in the CPUID leaf 0x5, they are taken from the product specific documentation. 
Notice that hints 0x20 and 0x21 are not defined for C-state 0x20 (C6), so the existing MWAIT hint lookup in mwait_play_dead_cpuid_hint() based on the CPUID leaf 0x5 contents does not work in this case. Instead of using MWAIT hint lookup that is not guaranteed to work, make native_play_dead() rely on the idle driver for the given platform to put CPUs going offline into appropriate idle state and, if that fails, fall back to hlt_play_dead(). Accordingly, drop mwait_play_dead_cpuid_hint() altogether and make native_play_dead() call cpuidle_play_dead() instead of it unconditionally with the assumption that it will not return if it is successful. Still, in case cpuidle_play_dead() fails, call hlt_play_dead() at the end. Signed-off-by: Patryk Wlazlyn Signed-off-by: Artem Bityutskiy Signed-off-by: Dave Hansen Reviewed-by: Gautham R. Shenoy Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/all/20250205155211.329780-5-artem.bityutskiy%40linux.intel.com --- arch/x86/kernel/smpboot.c | 54 ++++++----------------------------------------- 1 file changed, 7 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8aad14e43f54..5746084bafe4 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1258,6 +1258,10 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); @@ -1303,50 +1307,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint) } } -/* - * We need to flush the caches before going to sleep, lest we have - * dirty data in our caches when we come back up. - */ -static inline void mwait_play_dead_cpuid_hint(void) -{ - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } - - mwait_play_dead(eax); -} - /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). @@ -1397,9 +1357,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead_cpuid_hint(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 63887c9f02030afd042c125052ad60680f7c21b2 Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Tue, 4 Feb 2025 18:33:50 +0100 Subject: x86: Compare physical instead of virtual PGD addresses This is a preparatory patch for when pointers have tags in their upper address bits. But it's a harmless change on its own. 
The mm->pgd virtual address may be tagged because it came out of the allocator at some point. The __va(read_cr3_pa()) address will never be tagged (the tag bits are all 1's). A direct pointer value comparison would fail if one is tagged and the other is not. To fix this, just compare the physical addresses which are never affected by tagging. [ dhansen: subject and changelog munging ] Signed-off-by: Maciej Wieczor-Retman Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/fde443d0e67f76a51e7ab4e96647705840f53ddb.1738686764.git.maciej.wieczor-retman%40intel.com --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6cf881a942bb..ffc25b348041 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1325,7 +1325,7 @@ bool nmi_uaccess_okay(void) if (loaded_mm != current_mm) return false; - VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa()); return true; } -- cgit v1.2.3 From c7691aec5e991cec9c5c5fdab08c24856a1fc56f Mon Sep 17 00:00:00 2001 From: Vaishnav Achath Date: Wed, 5 Feb 2025 11:52:29 +0530 Subject: arm64: defconfig: Enable TISCI Interrupt Router and Aggregator Enable TISCI Interrupt Router and Interrupt Aggregator drivers. These IPs are found in all TI K3 SoCs like J721E, AM62X and are required for core functionality like DMA and GPIO interrupts, which is necessary during boot, thus make them built-in. bloat-o-meter summary on vmlinux: add/remove: 460/1 grow/shrink: 4/0 up/down: 162483/-8 (162475) ... Total: Before=31615984, After=31778459, chg +0.51% These configs were previously selected for ARCH_K3 in respective Kconfigs till commit b8b26ae398c4 ("irqchip/ti-sci-inta : Add module build support") and commit 2d95ffaecbc2 ("irqchip/ti-sci-intr: Add module build support") dropped them, and a few driver configs (TI_K3_UDMA, TI_K3_RINGACC) dependent on these also got disabled due to this. While re-enabling the TI_SCI_INT_*_IRQCHIP configs, these configs with missing dependencies (which are already part of arm64 defconfig) also get re-enabled which explains the slightly larger size increase from the bloat-o-meter summary.
Fixes: 2d95ffaecbc2 ("irqchip/ti-sci-intr: Add module build support") Fixes: b8b26ae398c4 ("irqchip/ti-sci-inta : Add module build support") Signed-off-by: Vaishnav Achath Tested-by: Dhruva Gole Reviewed-by: Dhruva Gole Link: https://lore.kernel.org/r/20250205062229.3869081-1-vaishnav.a@ti.com Signed-off-by: Nishanth Menon --- arch/arm64/configs/defconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index cb7da4415599..1f25423de383 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1551,6 +1551,8 @@ CONFIG_PWM_VISCONTI=m CONFIG_SL28CPLD_INTC=y CONFIG_QCOM_PDC=y CONFIG_QCOM_MPM=y +CONFIG_TI_SCI_INTR_IRQCHIP=y +CONFIG_TI_SCI_INTA_IRQCHIP=y CONFIG_RESET_GPIO=m CONFIG_RESET_IMX7=y CONFIG_RESET_QCOM_AOSS=y -- cgit v1.2.3 From dc9c5166c3cb044f8a001e397195242fd6796eee Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 3 Feb 2025 11:14:57 +0100 Subject: powerpc/code-patching: Disable KASAN report during patching via temporary mm Erhard reports the following KASAN hit on Talos II (power9) with kernel 6.13: [ 12.028126] ================================================================== [ 12.028198] BUG: KASAN: user-memory-access in copy_to_kernel_nofault+0x8c/0x1a0 [ 12.028260] Write of size 8 at addr 0000187e458f2000 by task systemd/1 [ 12.028346] CPU: 87 UID: 0 PID: 1 Comm: systemd Tainted: G T 6.13.0-P9-dirty #3 [ 12.028408] Tainted: [T]=RANDSTRUCT [ 12.028446] Hardware name: T2P9D01 REV 1.01 POWER9 0x4e1202 opal:skiboot-bc106a0 PowerNV [ 12.028500] Call Trace: [ 12.028536] [c000000008dbf3b0] [c000000001656a48] dump_stack_lvl+0xbc/0x110 (unreliable) [ 12.028609] [c000000008dbf3f0] [c0000000006e2fc8] print_report+0x6b0/0x708 [ 12.028666] [c000000008dbf4e0] [c0000000006e2454] kasan_report+0x164/0x300 [ 12.028725] [c000000008dbf600] [c0000000006e54d4] kasan_check_range+0x314/0x370 [ 12.028784] [c000000008dbf640] [c0000000006e6310] __kasan_check_write+0x20/0x40 [ 12.028842] [c000000008dbf660] [c000000000578e8c] copy_to_kernel_nofault+0x8c/0x1a0 [ 12.028902] [c000000008dbf6a0] [c0000000000acfe4] __patch_instructions+0x194/0x210 [ 12.028965] [c000000008dbf6e0] [c0000000000ade80] patch_instructions+0x150/0x590 [ 12.029026] [c000000008dbf7c0] [c0000000001159bc] bpf_arch_text_copy+0x6c/0xe0 [ 12.029085] [c000000008dbf800] [c000000000424250] bpf_jit_binary_pack_finalize+0x40/0xc0 [ 12.029147] [c000000008dbf830] [c000000000115dec] bpf_int_jit_compile+0x3bc/0x930 [ 12.029206] [c000000008dbf990] [c000000000423720] bpf_prog_select_runtime+0x1f0/0x280 [ 12.029266] [c000000008dbfa00] [c000000000434b18] bpf_prog_load+0xbb8/0x1370 [ 12.029324] [c000000008dbfb70] [c000000000436ebc] __sys_bpf+0x5ac/0x2e00 [ 12.029379] [c000000008dbfd00] [c00000000043a228] sys_bpf+0x28/0x40 [ 12.029435] [c000000008dbfd20] [c000000000038eb4] system_call_exception+0x334/0x610 [ 12.029497] [c000000008dbfe50] [c00000000000c270] system_call_vectored_common+0xf0/0x280 [ 12.029561] --- interrupt: 3000 at 0x3fff82f5cfa8 [ 12.029608] NIP: 00003fff82f5cfa8 LR: 00003fff82f5cfa8 CTR: 0000000000000000 [ 12.029660] REGS: c000000008dbfe80 TRAP: 3000 Tainted: G T (6.13.0-P9-dirty) [ 12.029735] MSR: 900000000280f032 CR: 42004848 XER: 00000000 [ 12.029855] IRQMASK: 0 GPR00: 0000000000000169 00003fffdcf789a0 00003fff83067100 0000000000000005 GPR04: 00003fffdcf78a98 0000000000000090 0000000000000000 0000000000000008 GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR12: 0000000000000000 00003fff836ff7e0 
c000000000010678 0000000000000000 GPR16: 0000000000000000 0000000000000000 00003fffdcf78f28 00003fffdcf78f90 GPR20: 0000000000000000 0000000000000000 0000000000000000 00003fffdcf78f80 GPR24: 00003fffdcf78f70 00003fffdcf78d10 00003fff835c7239 00003fffdcf78bd8 GPR28: 00003fffdcf78a98 0000000000000000 0000000000000000 000000011f547580 [ 12.030316] NIP [00003fff82f5cfa8] 0x3fff82f5cfa8 [ 12.030361] LR [00003fff82f5cfa8] 0x3fff82f5cfa8 [ 12.030405] --- interrupt: 3000 [ 12.030444] ================================================================== Commit c28c15b6d28a ("powerpc/code-patching: Use temporary mm for Radix MMU") is inspired from x86 but unlike x86 is doesn't disable KASAN reports during patching. This wasn't a problem at the begining because __patch_mem() is not instrumented. Commit 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()") use copy_to_kernel_nofault() to copy several instructions at once. But when using temporary mm the destination is not regular kernel memory but a kind of kernel-like memory located in user address space. Because it is not in kernel address space it is not covered by KASAN shadow memory. Since commit e4137f08816b ("mm, kasan, kmsan: instrument copy_from/to_kernel_nofault") KASAN reports bad accesses from copy_to_kernel_nofault(). Here a bad access to user memory is reported because KASAN detects the lack of shadow memory and the address is below TASK_SIZE. Do like x86 in commit b3fd8e83ada0 ("x86/alternatives: Use temporary mm for text poking") and disable KASAN reports during patching when using temporary mm. Reported-by: Erhard Furtner Close: https://lore.kernel.org/all/20250201151435.48400261@yea/ Fixes: 465cabc97b42 ("powerpc/code-patching: introduce patch_instructions()") Signed-off-by: Christophe Leroy Acked-by: Michael Ellerman Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/1c05b2a1b02ad75b981cfc45927e0b4a90441046.1738577687.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index af97fbb3c257..81c0f673eb25 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -493,7 +493,9 @@ static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool rep orig_mm = start_using_temp_mm(patching_mm); + kasan_disable_current(); err = __patch_instructions(patch_addr, code, len, repeat_instr); + kasan_enable_current(); /* context synchronisation performed by __patch_instructions */ stop_using_temp_mm(patching_mm, orig_mm); -- cgit v1.2.3 From 61bcc752d1b81fde3cae454ff20c1d3c359df500 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 12 Jan 2025 19:24:46 +0100 Subject: powerpc/64s: Rewrite __real_pte() and __rpte_to_hidx() as static inline Rewrite __real_pte() and __rpte_to_hidx() as static inline in order to avoid following warnings/errors when building with 4k page size: CC arch/powerpc/mm/book3s64/hash_tlb.o arch/powerpc/mm/book3s64/hash_tlb.c: In function 'hpte_need_flush': arch/powerpc/mm/book3s64/hash_tlb.c:49:16: error: variable 'offset' set but not used [-Werror=unused-but-set-variable] 49 | int i, offset; | ^~~~~~ CC arch/powerpc/mm/book3s64/hash_native.o arch/powerpc/mm/book3s64/hash_native.c: In function 'native_flush_hash_range': arch/powerpc/mm/book3s64/hash_native.c:782:29: error: variable 'index' set but not used [-Werror=unused-but-set-variable] 782 | unsigned long hash, index, hidx, shift, slot; | ^~~~~ 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501081741.AYFwybsq-lkp@intel.com/ Fixes: ff31e105464d ("powerpc/mm/hash64: Store the slot information at the right offset for hugetlb") Signed-off-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/e0d340a5b7bd478ecbf245d826e6ab2778b74e06.1736706263.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index c3efacab4b94..aa90a048f319 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -77,9 +77,17 @@ /* * With 4K page size the real_pte machinery is all nops. */ -#define __real_pte(e, p, o) ((real_pte_t){(e)}) +static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep, int offset) +{ + return (real_pte_t){pte}; +} + #define __rpte_to_pte(r) ((r).pte) -#define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) + +static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) +{ + return pte_val(__rpte_to_pte(rpte)) >> H_PAGE_F_GIX_SHIFT; +} #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ do { \ -- cgit v1.2.3 From 7d1163fc08936fcb5cf5d9daf366c322c3b4e882 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 7 Feb 2025 15:39:01 +0100 Subject: arm64: dts: rockchip: disable IOMMU when running rk3588 in PCIe endpoint mode Commit da92d3dfc871 ("arm64: dts: rockchip: enable the mmu600_pcie IOMMU on the rk3588 SoC") enabled the mmu600_pcie IOMMU, both in the normal case (when all PCIe controllers are running in Root Complex mode) and in the case when running the pcie3x4 PCIe controller in Endpoint mode. There have been no issues detected when running the PCIe controllers in Root Complex mode. During PCI probe time, we will add a SID to the IOMMU for each PCI device enumerated on the bus, including the root port itself. However, when running the pcie3x4 PCIe controller in Endpoint mode, we will only add a single SID to the IOMMU (the SID specified in the iommus DT property). The enablement of IOMMU in endpoint mode was verified on setup with two Rock 5b:s, where the BDF of the Root Complex has BDF (00:00.0). A Root Complex sending a TLP to the Endpoint will have Requester ID set to the BDF of the initiator. On the EP side, the Requester ID will then be used as the SID. This works fine if the Root Complex has a BDF that matches the iommus DT property, however, if the Root Complex has any other BDF, we will see something like: arm-smmu-v3 fc900000.iommu: event: C_BAD_STREAMID client: (unassigned sid) sid: 0x1600 ssid: 0x0 on the endpoint side. For PCIe controllers running in endpoint mode that always uses the incoming Requester ID as the SID, the iommus DT property simply isn't a viable solution. (Neither is iommu-map a viable solution, as there is no enumeration done on the endpoint side.) Thus, partly revert commit da92d3dfc871 ("arm64: dts: rockchip: enable the mmu600_pcie IOMMU on the rk3588 SoC") by disabling the PCI IOMMU when running the pcie3x4 PCIe controller in Endpoint mode. Since the PCI IOMMU is working as expected in the normal case, keep it enabled when running all PCIe controllers in Root Complex mode. 
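For illustration only (a sketch, not part of the patch; the BDF is an example): in Endpoint mode the SID seen by the SMMU is simply the host's PCI Requester ID, which depends entirely on the host's own enumeration:

	/* hypothetical host root port at BDF 16:00.0 */
	sid = (bus << 8) | (dev << 3) | fn;	/* (0x16 << 8) | (0 << 3) | 0 = 0x1600 */

No single fixed entry such as iommus = <&mmu600_pcie 0x0000> can match an SID that is only known once the remote host enumerates its bus.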
Fixes: da92d3dfc871 ("arm64: dts: rockchip: enable the mmu600_pcie IOMMU on the rk3588 SoC") Signed-off-by: Niklas Cassel Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20250207143900.2047949-2-cassel@kernel.org Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi | 1 - arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi index 4a950907ea6f..840b638af1c2 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-extra.dtsi @@ -213,7 +213,6 @@ interrupt-names = "sys", "pmc", "msg", "legacy", "err", "dma0", "dma1", "dma2", "dma3"; max-link-speed = <3>; - iommus = <&mmu600_pcie 0x0000>; num-lanes = <4>; phys = <&pcie30phy>; phy-names = "pcie-phy"; diff --git a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso index 672d748fcc67..f229cb49da68 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso +++ b/arch/arm64/boot/dts/rockchip/rk3588-rock-5b-pcie-ep.dtso @@ -23,3 +23,7 @@ vpcie3v3-supply = <&vcc3v3_pcie30>; status = "okay"; }; + +&mmu600_pcie { + status = "disabled"; +}; -- cgit v1.2.3 From 8546cfd08aa4b982acd2357403a1f15495d622ec Mon Sep 17 00:00:00 2001 From: Patrick Wildt Date: Mon, 10 Feb 2025 22:37:29 +0100 Subject: arm64: dts: rockchip: adjust SMMU interrupt type on rk3588 The SMMU architecture requires wired interrupts to be edge triggered, which does not align with the DT description for the RK3588. This leads to interrupt storms, as the SMMU continues to hold the pin high and only pulls it down for a short amount when issuing an IRQ. Update the DT description to be in line with the spec and perceived reality. 
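A minimal sketch of the corrected style of description follows; the SPI numbers are placeholders rather than the real rk3588 values, only the trigger type is the point:

	mmu600_pcie: iommu@fc900000 {
		compatible = "arm,smmu-v3";
		/* previously described as IRQ_TYPE_LEVEL_HIGH; SMMUv3 wired interrupts are edge triggered */
		interrupts = <GIC_SPI 0 IRQ_TYPE_EDGE_RISING 0>,
			     <GIC_SPI 0 IRQ_TYPE_EDGE_RISING 0>,
			     <GIC_SPI 0 IRQ_TYPE_EDGE_RISING 0>,
			     <GIC_SPI 0 IRQ_TYPE_EDGE_RISING 0>;
		interrupt-names = "eventq", "gerror", "priq", "cmdq-sync";
	};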
Signed-off-by: Patrick Wildt Fixes: cd81d3a0695c ("arm64: dts: rockchip: add rk3588 pcie and php IOMMUs") Reviewed-by: Niklas Cassel Link: https://lore.kernel.org/r/Z6pxme2Chmf3d3uK@windev.fritz.box Signed-off-by: Heiko Stuebner --- arch/arm64/boot/dts/rockchip/rk3588-base.dtsi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi index 978de506d434..c3abdfb04f8f 100644 --- a/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3588-base.dtsi @@ -549,10 +549,10 @@ mmu600_pcie: iommu@fc900000 { compatible = "arm,smmu-v3"; reg = <0x0 0xfc900000 0x0 0x200000>; - interrupts = , - , - , - ; + interrupts = , + , + , + ; interrupt-names = "eventq", "gerror", "priq", "cmdq-sync"; #iommu-cells = <1>; }; @@ -560,10 +560,10 @@ mmu600_php: iommu@fcb00000 { compatible = "arm,smmu-v3"; reg = <0x0 0xfcb00000 0x0 0x200000>; - interrupts = , - , - , - ; + interrupts = , + , + , + ; interrupt-names = "eventq", "gerror", "priq", "cmdq-sync"; #iommu-cells = <1>; status = "disabled"; -- cgit v1.2.3 From d262a192d38e527faa5984629aabda2e0d1c4f54 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 12 Feb 2025 07:46:28 +0100 Subject: powerpc/code-patching: Fix KASAN hit by not flagging text patching area as VM_ALLOC Erhard reported the following KASAN hit while booting his PowerMac G4 with a KASAN-enabled kernel 6.13-rc6: BUG: KASAN: vmalloc-out-of-bounds in copy_to_kernel_nofault+0xd8/0x1c8 Write of size 8 at addr f1000000 by task chronyd/1293 CPU: 0 UID: 123 PID: 1293 Comm: chronyd Tainted: G W 6.13.0-rc6-PMacG4 #2 Tainted: [W]=WARN Hardware name: PowerMac3,6 7455 0x80010303 PowerMac Call Trace: [c2437590] [c1631a84] dump_stack_lvl+0x70/0x8c (unreliable) [c24375b0] [c0504998] print_report+0xdc/0x504 [c2437610] [c050475c] kasan_report+0xf8/0x108 [c2437690] [c0505a3c] kasan_check_range+0x24/0x18c [c24376a0] [c03fb5e4] copy_to_kernel_nofault+0xd8/0x1c8 [c24376c0] [c004c014] patch_instructions+0x15c/0x16c [c2437710] [c00731a8] bpf_arch_text_copy+0x60/0x7c [c2437730] [c0281168] bpf_jit_binary_pack_finalize+0x50/0xac [c2437750] [c0073cf4] bpf_int_jit_compile+0xb30/0xdec [c2437880] [c0280394] bpf_prog_select_runtime+0x15c/0x478 [c24378d0] [c1263428] bpf_prepare_filter+0xbf8/0xc14 [c2437990] [c12677ec] bpf_prog_create_from_user+0x258/0x2b4 [c24379d0] [c027111c] do_seccomp+0x3dc/0x1890 [c2437ac0] [c001d8e0] system_call_exception+0x2dc/0x420 [c2437f30] [c00281ac] ret_from_syscall+0x0/0x2c --- interrupt: c00 at 0x5a1274 NIP: 005a1274 LR: 006a3b3c CTR: 005296c8 REGS: c2437f40 TRAP: 0c00 Tainted: G W (6.13.0-rc6-PMacG4) MSR: 0200f932 CR: 24004422 XER: 00000000 GPR00: 00000166 af8f3fa0 a7ee3540 00000001 00000000 013b6500 005a5858 0200f932 GPR08: 00000000 00001fe9 013d5fc8 005296c8 2822244c 00b2fcd8 00000000 af8f4b57 GPR16: 00000000 00000001 00000000 00000000 00000000 00000001 00000000 00000002 GPR24: 00afdbb0 00000000 00000000 00000000 006e0004 013ce060 006e7c1c 00000001 NIP [005a1274] 0x5a1274 LR [006a3b3c] 0x6a3b3c --- interrupt: c00 The buggy address belongs to the virtual mapping at [f1000000, f1002000) created by: text_area_cpu_up+0x20/0x190 The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:00000000 index:0x0 pfn:0x76e30 flags: 0x80000000(zone=2) raw: 80000000 00000000 00000122 00000000 00000000 00000000 ffffffff 00000001 raw: 00000000 page dumped because: kasan: bad access detected Memory state around the 
buggy address: f0ffff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f0ffff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 >f1000000: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 ^ f1000080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f1000100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 ================================================================== f8 corresponds to KASAN_VMALLOC_INVALID which means the area is not initialised hence not supposed to be used yet. Powerpc text patching infrastructure allocates a virtual memory area using get_vm_area() and flags it as VM_ALLOC. But that flag is meant to be used for vmalloc() and vmalloc() allocated memory is not supposed to be used before a call to __vmalloc_node_range() which is never called for that area. That went undetected until commit e4137f08816b ("mm, kasan, kmsan: instrument copy_from/to_kernel_nofault") The area allocated by text_area_cpu_up() is not vmalloc memory, it is mapped directly on demand when needed by map_kernel_page(). There is no VM flag corresponding to such usage, so just pass no flag. That way the area will be unpoisonned and usable immediately. Reported-by: Erhard Furtner Closes: https://lore.kernel.org/all/20250112135832.57c92322@yea/ Fixes: 37bc3e5fd764 ("powerpc/lib/code-patching: Use alternate map for patch_instruction()") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/06621423da339b374f48c0886e3a5db18e896be8.1739342693.git.christophe.leroy@csgroup.eu --- arch/powerpc/lib/code-patching.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 81c0f673eb25..f84e0337cc02 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -108,7 +108,7 @@ static int text_area_cpu_up(unsigned int cpu) unsigned long addr; int err; - area = get_vm_area(PAGE_SIZE, VM_ALLOC); + area = get_vm_area(PAGE_SIZE, 0); if (!area) { WARN_ONCE(1, "Failed to create text area for cpu %d\n", cpu); -- cgit v1.2.3 From fb8179ce2996bffaa36a04e2b6262843b01b7565 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Mon, 4 Nov 2024 13:03:13 -0600 Subject: riscv: cacheinfo: Use of_property_present() for non-boolean properties MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The use of of_property_read_bool() for non-boolean properties is deprecated in favor of of_property_present() when testing for property presence. 
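A short illustration of the distinction, using typical (assumed) devicetree content rather than text from the patch: "cache-size" style properties carry a value, so they are not boolean properties, and the presence helper states the intent directly:

	d-cache-size = <32768>;	/* non-boolean: has a value -> of_property_present() */
	dma-coherent;		/* boolean: empty property  -> of_property_read_bool() */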
Signed-off-by: Rob Herring (Arm) Reviewed-by: Clément Léger Cc: stable@vger.kernel.org Fixes: 76d2a0493a17 ("RISC-V: Init and Halt Code") Link: https://lore.kernel.org/r/20241104190314.270095-1-robh@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index 2d40736fc37c..26b085dbdd07 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -108,11 +108,11 @@ int populate_cache_leaves(unsigned int cpu) if (!np) return -ENOENT; - if (of_property_read_bool(np, "cache-size")) + if (of_property_present(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) + if (of_property_present(np, "i-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) + if (of_property_present(np, "d-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); prev = np; @@ -125,11 +125,11 @@ int populate_cache_leaves(unsigned int cpu) break; if (level <= levels) break; - if (of_property_read_bool(np, "cache-size")) + if (of_property_present(np, "cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_UNIFIED, level); - if (of_property_read_bool(np, "i-cache-size")) + if (of_property_present(np, "i-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_INST, level); - if (of_property_read_bool(np, "d-cache-size")) + if (of_property_present(np, "d-cache-size")) ci_leaf_init(this_leaf++, CACHE_TYPE_DATA, level); levels = level; } -- cgit v1.2.3 From c6ec1e1b078d8e2ecd075e46db6197a14930a3fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 10 Feb 2025 16:56:14 +0100 Subject: riscv: cpufeature: use bitmap_equal() instead of memcmp() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comparison of bitmaps should be done using bitmap_equal(), not memcmp(), use the former one to compare isa bitmaps. Signed-off-by: Clément Léger Fixes: 625034abd52a8c ("riscv: add ISA extensions validation callback") Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250210155615.1545738-1-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index c6ba750536c3..40ac72e407b6 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -479,7 +479,7 @@ static void __init riscv_resolve_isa(unsigned long *source_isa, if (bit < RISCV_ISA_EXT_BASE) *this_hwcap |= isa2hwcap[bit]; } - } while (loop && memcmp(prev_resolved_isa, resolved_isa, sizeof(prev_resolved_isa))); + } while (loop && !bitmap_equal(prev_resolved_isa, resolved_isa, RISCV_ISA_EXT_MAX)); } static void __init match_isa_ext(const char *name, const char *name_end, unsigned long *bitmap) -- cgit v1.2.3 From 1898300abf3508bca152e65b36cce5bf93d7e63e Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Thu, 30 Jan 2025 10:25:38 +0100 Subject: riscv/atomic: Do proper sign extension also for unsigned in arch_cmpxchg Sign extend also an unsigned compare value to match what lr.w is doing. Otherwise try_cmpxchg may spuriously return true when used on a u32 value that has the sign bit set, as it happens often in inode_set_ctime_current. Do this in three conversion steps. 
The first conversion to long is needed to avoid a -Wpointer-to-int-cast warning when arch_cmpxchg is used with a pointer type. Then convert to int and back to long to always sign extend the 32-bit value to 64-bit. Fixes: 6c58f25e6938 ("riscv/atomic: Fix sign extension for RV64I") Signed-off-by: Andreas Schwab Reviewed-by: Alexandre Ghiti Reviewed-by: Andrew Jones Tested-by: Xi Ruoyao Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/mvmed0k4prh.fsf@suse.de Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cmpxchg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 4cadc56220fe..427c41dde643 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -231,7 +231,7 @@ __arch_cmpxchg(".w", ".w" sc_sfx, ".w" cas_sfx, \ sc_prepend, sc_append, \ cas_prepend, cas_append, \ - __ret, __ptr, (long), __old, __new); \ + __ret, __ptr, (long)(int)(long), __old, __new); \ break; \ case 8: \ __arch_cmpxchg(".d", ".d" sc_sfx, ".d" cas_sfx, \ -- cgit v1.2.3 From 599c44cd21f4967774e0acf58f734009be4aea9a Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 3 Feb 2025 11:06:00 +0100 Subject: riscv/futex: sign extend compare value in atomic cmpxchg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure the compare value in the lr/sc loop is sign extended to match what lr.w does. Fortunately, due to the compiler keeping the register contents sign extended anyway the lack of the explicit extension didn't result in wrong code so far, but this cannot be relied upon. Fixes: b90edb33010b ("RISC-V: Add futex support.") Signed-off-by: Andreas Schwab Reviewed-by: Alexandre Ghiti Reviewed-by: Björn Töpel Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/mvmfrkv2vhz.fsf@suse.de Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/futex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/futex.h b/arch/riscv/include/asm/futex.h index 72be100afa23..90c86b115e00 100644 --- a/arch/riscv/include/asm/futex.h +++ b/arch/riscv/include/asm/futex.h @@ -93,7 +93,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %[r]) \ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %[r]) \ : [r] "+r" (ret), [v] "=&r" (val), [u] "+m" (*uaddr), [t] "=&r" (tmp) - : [ov] "Jr" (oldval), [nv] "Jr" (newval) + : [ov] "Jr" ((long)(int)oldval), [nv] "Jr" (newval) : "memory"); __disable_user_access(); -- cgit v1.2.3 From aa49bc2ca8524186ceb0811c23a7f00c3dea6987 Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 20 Dec 2024 16:39:23 +0800 Subject: riscv: signal: fix signal frame size The signal context of certain RISC-V extensions will be appended after struct __riscv_extra_ext_header, which already includes an empty context header. Therefore, there is no need to preserve a separate hdr for the END of signal context. 
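A rough sketch of the size accounting, assuming only the vector context is present (illustrative, not taken from the patch):

	/* before: total_context_size = riscv_v_sc_size + sizeof(struct __riscv_ctx_hdr); */
	/* after:  total_context_size = riscv_v_sc_size; */
	/* the END header is already provided inside struct __riscv_extra_ext_header */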
Fixes: 8ee0b41898fa ("riscv: signal: Add sigcontext save/restore for vector") Signed-off-by: Yong-Xuan Wang Reviewed-by: Zong Li Reviewed-by: Andy Chiu Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241220083926.19453-2-yongxuan.wang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/signal.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/signal.c b/arch/riscv/kernel/signal.c index 94e905eea1de..08378fea3a11 100644 --- a/arch/riscv/kernel/signal.c +++ b/arch/riscv/kernel/signal.c @@ -215,12 +215,6 @@ static size_t get_rt_frame_size(bool cal_all) if (cal_all || riscv_v_vstate_query(task_pt_regs(current))) total_context_size += riscv_v_sc_size; } - /* - * Preserved a __riscv_ctx_hdr for END signal context header if an - * extension uses __riscv_extra_ext_header - */ - if (total_context_size) - total_context_size += sizeof(struct __riscv_ctx_hdr); frame_size += total_context_size; -- cgit v1.2.3 From 564fc8eb6f78e01292ff10801f318feae6153fdd Mon Sep 17 00:00:00 2001 From: Yong-Xuan Wang Date: Fri, 20 Dec 2024 16:39:24 +0800 Subject: riscv: signal: fix signal_minsigstksz The init_rt_signal_env() function is called before the alternative patch is applied, so using the alternative-related API to check the availability of an extension within this function doesn't have the intended effect. This patch reorders the init_rt_signal_env() and apply_boot_alternatives() to get the correct signal_minsigstksz. Fixes: e92f469b0771 ("riscv: signal: Report signal frame size to userspace via auxv") Signed-off-by: Yong-Xuan Wang Reviewed-by: Zong Li Reviewed-by: Andy Chiu Reviewed-by: Alexandre Ghiti Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20241220083926.19453-3-yongxuan.wang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index f1793630fc51..4fe45daa6281 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -322,8 +322,8 @@ void __init setup_arch(char **cmdline_p) riscv_init_cbo_blocksizes(); riscv_fill_hwcap(); - init_rt_signal_env(); apply_boot_alternatives(); + init_rt_signal_env(); if (IS_ENABLED(CONFIG_RISCV_ISA_ZICBOM) && riscv_isa_extension_available(NULL, ZICBOM)) -- cgit v1.2.3 From c7db342e3b4744688be1e27e31254c1d31a35274 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:08 +0100 Subject: riscv: KVM: Fix hart suspend status check "Not stopped" means started or suspended, so we need to check for a single state in order to have a chance to check for each state. Also, we need to use target_vcpu when checking for the suspend state.
Fixes: 763c8bed8c05 ("RISC-V: KVM: Implement SBI HSM suspend call") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-8-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_hsm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index dce667f4b6ab..13a35eb77e8e 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -79,12 +79,12 @@ static int kvm_sbi_hsm_vcpu_get_status(struct kvm_vcpu *vcpu) target_vcpu = kvm_get_vcpu_by_id(vcpu->kvm, target_vcpuid); if (!target_vcpu) return SBI_ERR_INVALID_PARAM; - if (!kvm_riscv_vcpu_stopped(target_vcpu)) - return SBI_HSM_STATE_STARTED; - else if (vcpu->stat.generic.blocking) + if (kvm_riscv_vcpu_stopped(target_vcpu)) + return SBI_HSM_STATE_STOPPED; + else if (target_vcpu->stat.generic.blocking) return SBI_HSM_STATE_SUSPENDED; else - return SBI_HSM_STATE_STOPPED; + return SBI_HSM_STATE_STARTED; } static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, -- cgit v1.2.3 From e3219b0c491f2aa0e0b200a39d3352ab05cdda96 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:09 +0100 Subject: riscv: KVM: Fix hart suspend_type use The spec says suspend_type is 32 bits wide and "In case the data is defined as 32bit wide, higher privilege software must ensure that it only uses 32 bit data." Mask off upper bits of suspend_type before using it. Fixes: 763c8bed8c05 ("RISC-V: KVM: Implement SBI HSM suspend call") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-9-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_hsm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_hsm.c b/arch/riscv/kvm/vcpu_sbi_hsm.c index 13a35eb77e8e..3070bb31745d 100644 --- a/arch/riscv/kvm/vcpu_sbi_hsm.c +++ b/arch/riscv/kvm/vcpu_sbi_hsm.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -109,7 +110,7 @@ static int kvm_sbi_ext_hsm_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, } return 0; case SBI_EXT_HSM_HART_SUSPEND: - switch (cp->a0) { + switch (lower_32_bits(cp->a0)) { case SBI_HSM_SUSPEND_RET_DEFAULT: kvm_riscv_vcpu_wfi(vcpu); break; -- cgit v1.2.3 From 0611f78f83c93c000029ab01daa28166d03590ed Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:10 +0100 Subject: riscv: KVM: Fix SBI IPI error generation When an invalid function ID of an SBI extension is used we should return not-supported, not invalid-param. Also, when we see that at least one hartid constructed from the base and mask parameters is invalid, then we should return invalid-param. Finally, rather than relying on overflowing a left shift to result in zero and then using that zero in a condition which [correctly] skips sending an IPI (but loops unnecessarily), explicitly check for overflow and exit the loop immediately. 
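A small worked example of the new bookkeeping (the values are illustrative only):

	/* hbase = 8, hmask = 0b101 targets harts 8 and 10.  If hart 10 does not
	 * exist, sentmask ends up as 0b001 after the loop, hmask ^ sentmask != 0,
	 * and retdata->err_val is set to SBI_ERR_INVALID_PARAM instead of the
	 * missing hart being skipped silently. */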
Fixes: 5f862df5585c ("RISC-V: KVM: Add v0.1 replacement SBI extensions defined in v0.2") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-10-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 9c2ab3dfa93a..74e3a38c6a29 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -51,9 +51,10 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; + unsigned long hart_bit = 0, sentmask = 0; if (cp->a6 != SBI_EXT_IPI_SEND_IPI) { - retdata->err_val = SBI_ERR_INVALID_PARAM; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; return 0; } @@ -62,15 +63,23 @@ static int kvm_sbi_ext_ipi_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (hbase != -1UL) { if (tmp->vcpu_id < hbase) continue; - if (!(hmask & (1UL << (tmp->vcpu_id - hbase)))) + hart_bit = tmp->vcpu_id - hbase; + if (hart_bit >= __riscv_xlen) + goto done; + if (!(hmask & (1UL << hart_bit))) continue; } ret = kvm_riscv_vcpu_set_interrupt(tmp, IRQ_VS_SOFT); if (ret < 0) break; + sentmask |= 1UL << hart_bit; kvm_riscv_vcpu_pmu_incr_fw(tmp, SBI_PMU_FW_IPI_RCVD); } +done: + if (hbase != -1UL && (hmask ^ sentmask)) + retdata->err_val = SBI_ERR_INVALID_PARAM; + return ret; } -- cgit v1.2.3 From b901484852992cf3d162a5eab72251cc813ca624 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:11 +0100 Subject: riscv: KVM: Fix SBI TIME error generation When an invalid function ID of an SBI extension is used we should return not-supported, not invalid-param. Fixes: 5f862df5585c ("RISC-V: KVM: Add v0.1 replacement SBI extensions defined in v0.2") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-11-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 74e3a38c6a29..5fbf3f94f1e8 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -21,7 +21,7 @@ static int kvm_sbi_ext_time_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, u64 next_cycle; if (cp->a6 != SBI_EXT_TIME_SET_TIMER) { - retdata->err_val = SBI_ERR_INVALID_PARAM; + retdata->err_val = SBI_ERR_NOT_SUPPORTED; return 0; } -- cgit v1.2.3 From 351e02b1733b057e33fe13fc03ca93ec799e4f78 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Mon, 17 Feb 2025 09:45:12 +0100 Subject: riscv: KVM: Fix SBI sleep_type use The spec says sleep_type is 32 bits wide and "In case the data is defined as 32bit wide, higher privilege software must ensure that it only uses 32 bit data." Mask off upper bits of sleep_type before using it. 
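A one-line sketch of the effect (the register value is illustrative): on RV64 a caller may pass a0 = 0xabcdef0100000000, whose low half still encodes sleep type 0 (SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM); lower_32_bits(cp->a0) now matches, whereas a direct comparison of the full register would have failed.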
Fixes: 023c15151fbb ("RISC-V: KVM: Add SBI system suspend support") Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20250217084506.18763-12-ajones@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_system.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_system.c b/arch/riscv/kvm/vcpu_sbi_system.c index 5d55e08791fa..bc0ebba89003 100644 --- a/arch/riscv/kvm/vcpu_sbi_system.c +++ b/arch/riscv/kvm/vcpu_sbi_system.c @@ -4,6 +4,7 @@ */ #include +#include #include #include @@ -19,7 +20,7 @@ static int kvm_sbi_ext_susp_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, switch (funcid) { case SBI_EXT_SUSP_SYSTEM_SUSPEND: - if (cp->a0 != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { + if (lower_32_bits(cp->a0) != SBI_SUSP_SLEEP_TYPE_SUSPEND_TO_RAM) { retdata->err_val = SBI_ERR_INVALID_PARAM; return 0; } -- cgit v1.2.3 From ed83aff5a94e1d623c007159a6a7f1c3ef202c6c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 12 Feb 2025 12:01:15 +0100 Subject: s390: Update defconfigs Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/configs/debug_defconfig | 2 ++ arch/s390/configs/defconfig | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 44f01a4bc810..80bdfbae6e5b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -469,6 +469,7 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -874,6 +875,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=300 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index 8bcd37edd3c9..449a0e996b96 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -459,6 +459,7 @@ CONFIG_SCSI_DH_ALUA=m CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_BITMAP_FILE is not set +CONFIG_MD_LINEAR=m CONFIG_MD_CLUSTER=m CONFIG_BCACHE=m CONFIG_BLK_DEV_DM=y @@ -825,6 +826,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_LATENCYTOP=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_GRAPH_RETVAL=y +CONFIG_FUNCTION_GRAPH_RETADDR=y CONFIG_FPROBE=y CONFIG_FUNCTION_PROFILER=y CONFIG_STACK_TRACER=y -- cgit v1.2.3 From 173767c218cc1da74704e7863f165ac8a9796f3e Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Thu, 13 Feb 2025 21:16:14 +0000 Subject: s390/purgatory: Use -D__DISABLE_EXPORTS The object files in purgatory do not export symbols, so disable exports for all the object files, not only sha256.o, with -D__DISABLE_EXPORTS. This fixes a build failure with CONFIG_GENDWARFKSYMS, where we would otherwise attempt to calculate symbol versions for purgatory objects and fail because they're not built with debugging information: error: gendwarfksyms: process_module: dwarf_get_units failed: no debugging information? 
make[5]: *** [../scripts/Makefile.build:207: arch/s390/purgatory/string.o] Error 1 make[5]: *** Deleting file 'arch/s390/purgatory/string.o' Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502120752.U3fOKScQ-lkp@intel.com/ Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20250213211614.3537605-2-samitolvanen@google.com Acked-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/purgatory/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index bdcf2a3b6c41..bd39b36e7bd6 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -8,7 +8,7 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/crypto/sha256.c FORCE $(call if_changed_rule,cc_o_c) -CFLAGS_sha256.o := -D__DISABLE_EXPORTS -D__NO_FORTIFY +CFLAGS_sha256.o := -D__NO_FORTIFY $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(call if_changed_rule,as_o_S) @@ -19,9 +19,11 @@ KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -Os -m64 -msoft-float -fno-common KBUILD_CFLAGS += -fno-stack-protector KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +KBUILD_CFLAGS += -D__DISABLE_EXPORTS KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS += -D__DISABLE_EXPORTS # Since we link purgatory with -r unresolved symbols are not checked, so we # also link a purgatory.chk binary without -r to check for unresolved symbols. -- cgit v1.2.3 From c3a589fd9fcbf295a7402a4b188dc9277d505f4f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 18 Feb 2025 12:11:34 +0100 Subject: s390/boot: Fix ESSA detection The cmma_test_essa() inline assembly uses tmp as input and output, however tmp is specified as output only, which allows the compiler to optimize the initialization of tmp away. Therefore the ESSA detection may or may not work depending on previous contents of the register that the compiler selected for tmp. Fix this by using the correct constraint modifier. Fixes: 468a3bc2b7b9 ("s390/cmma: move parsing of cmma kernel parameter to early boot code") Cc: stable@vger.kernel.org Signed-off-by: Heiko Carstens Reviewed-by: Vasily Gorbik Signed-off-by: Vasily Gorbik --- arch/s390/boot/startup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index 885bd1dd2c82..9276e0576d0a 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -86,7 +86,7 @@ static int cmma_test_essa(void) : [reg1] "=&d" (reg1), [reg2] "=&a" (reg2), [rc] "+&d" (rc), - [tmp] "=&d" (tmp), + [tmp] "+&d" (tmp), "+Q" (get_lowcore()->program_new_psw), "=Q" (old) : [psw_old] "a" (&old), -- cgit v1.2.3 From 102c51c50db88aedd00a318b7708ad60dbec2e95 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 14 Feb 2025 13:37:24 +0000 Subject: KVM: arm64: Fix tcr_el2 initialisation in hVHE mode When not running in VHE mode, cpu_prepare_hyp_mode() computes the value of TCR_EL2 using the host's TCR_EL1 settings as a starting point. For nVHE, this amounts to masking out everything apart from the TG0, SH0, ORGN0, IRGN0 and T0SZ fields before setting the RES1 bits, shifting the IPS field down to the PS field and setting DS if LPA2 is enabled. 
Unfortunately, for hVHE, things go slightly wonky: EPD1 is correctly set to disable walks via TTBR1_EL2 but then the T1SZ and IPS fields are corrupted when we mistakenly attempt to initialise the PS and DS fields in their E2H=0 positions. Furthermore, many fields are retained from TCR_EL1 which should not be propagated to TCR_EL2. Notably, this means we can end up with A1 set despite not initialising TTBR1_EL2 at all. This has been shown to cause unexpected translation faults at EL2 with pKVM due to TLB invalidation not taking effect when running with a non-zero ASID. Fix the TCR_EL2 initialisation code to set PS and DS only when E2H=0, masking out HD, HA and A1 when E2H=1. Cc: Marc Zyngier Cc: Oliver Upton Fixes: ad744e8cb346 ("arm64: Allow arm64_sw.hvhe on command line") Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20250214133724.13179-1-will@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_arm.h | 2 +- arch/arm64/kvm/arm.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 8d94a6c0ed5c..c2417a424b98 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -119,7 +119,7 @@ #define TCR_EL2_IRGN0_MASK TCR_IRGN0_MASK #define TCR_EL2_T0SZ_MASK 0x3f #define TCR_EL2_MASK (TCR_EL2_TG0_MASK | TCR_EL2_SH0_MASK | \ - TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK | TCR_EL2_T0SZ_MASK) + TCR_EL2_ORGN0_MASK | TCR_EL2_IRGN0_MASK) /* VTCR_EL2 Registers bits */ #define VTCR_EL2_DS TCR_EL2_DS diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b8e55a441282..bc7a37cea242 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1980,7 +1980,7 @@ static int kvm_init_vector_slots(void) static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) { struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); - unsigned long tcr, ips; + unsigned long tcr; /* * Calculate the raw per-cpu offset without a translation from the @@ -1994,19 +1994,18 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->mair_el2 = read_sysreg(mair_el1); tcr = read_sysreg(tcr_el1); - ips = FIELD_GET(TCR_IPS_MASK, tcr); if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK); tcr |= TCR_EPD1_MASK; } else { + unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr); + tcr &= TCR_EL2_MASK; - tcr |= TCR_EL2_RES1; + tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips); + if (lpa2_is_enabled()) + tcr |= TCR_EL2_DS; } - tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); - tcr &= ~TCR_EL2_PS_MASK; - tcr |= FIELD_PREP(TCR_EL2_PS_MASK, ips); - if (lpa2_is_enabled()) - tcr |= TCR_EL2_DS; params->tcr_el2 = tcr; params->pgd_pa = kvm_mmu_get_httbr(); -- cgit v1.2.3 From 782cffeec9ad96daa64ffb2d527b2a052fb02552 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 19 Feb 2025 06:10:05 -0800 Subject: perf/x86/intel: Fix event constraints for LNC According to the latest event list, update the event constraint tables for Lion Cove core. The general rule (the event codes < 0x90 are restricted to counters 0-3.) has been removed. There is no restriction for most of the performance monitoring events. 
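As a reading aid for the constraint tables (a conventional interpretation of the macros, not text from the patch), the second argument is the bitmask of general-purpose counters an event may be scheduled on:

	INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff)	/* event 0x2e: any of counters 0-9        */
	INTEL_UEVENT_CONSTRAINT(0x0148, 0x4)	/* event 0x48, umask 0x01: counter 2 only */
	INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc)	/* counters 2-9, matching the PEBS update */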
Fixes: a932aa0e868f ("perf/x86: Add Lunar Lake and Arrow Lake support") Reported-by: Amiri Khalil Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250219141005.2446823-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 20 +++++++------------- arch/x86/events/intel/ds.c | 2 +- 2 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index e86333eee266..cdcebf30468a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -397,34 +397,28 @@ static struct event_constraint intel_lnc_event_constraints[] = { METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + INTEL_EVENT_CONSTRAINT(0x20, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x012a, 0xf), + INTEL_UEVENT_CONSTRAINT(0x012b, 0xf), INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), INTEL_UEVENT_CONSTRAINT(0x0175, 0x4), INTEL_EVENT_CONSTRAINT(0x2e, 0x3ff), INTEL_EVENT_CONSTRAINT(0x3c, 0x3ff), - /* - * Generally event codes < 0x90 are restricted to counters 0-3. - * The 0x2E and 0x3C are exception, which has no restriction. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), - INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), - INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x10a4, 0x1), INTEL_UEVENT_CONSTRAINT(0x01b1, 0x8), + INTEL_UEVENT_CONSTRAINT(0x01cd, 0x3fc), INTEL_UEVENT_CONSTRAINT(0x02cd, 0x3), - INTEL_EVENT_CONSTRAINT(0xce, 0x1), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), - /* - * Generally event codes >= 0x90 are likely to have no restrictions. - * The exception are defined as above. - */ - INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0x3ff), + + INTEL_UEVENT_CONSTRAINT(0x00e0, 0xf), EVENT_CONSTRAINT_END }; diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index c2e2eae7309c..f122882ef278 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1199,7 +1199,7 @@ struct event_constraint intel_lnc_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), - INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3ff), + INTEL_HYBRID_LDLAT_CONSTRAINT(0x1cd, 0x3fc), INTEL_HYBRID_STLAT_CONSTRAINT(0x2cd, 0x3), INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ -- cgit v1.2.3 From fa808ed4e199ed17d878eb75b110bda30dd52434 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Wed, 19 Feb 2025 14:07:37 -0800 Subject: KVM: arm64: Ensure a VMID is allocated before programming VTTBR_EL2 Vladimir reports that a race condition to attach a VMID to a stage-2 MMU sometimes results in a vCPU entering the guest with a VMID of 0: | CPU1 | CPU2 | | | | kvm_arch_vcpu_ioctl_run | | vcpu_load <= load VTTBR_EL2 | | kvm_vmid->id = 0 | | | kvm_arch_vcpu_ioctl_run | | vcpu_load <= load VTTBR_EL2 | | with kvm_vmid->id = 0| | kvm_arm_vmid_update <= allocates fresh | | kvm_vmid->id and | | reload VTTBR_EL2 | | | | | kvm_arm_vmid_update <= observes that kvm_vmid->id | | already allocated, | | skips reload VTTBR_EL2 Oh yeah, it's as bad as it looks. 
Remember that VHE loads the stage-2 MMU eagerly but a VMID only gets attached to the MMU later on in the KVM_RUN loop. Even in the "best case" where VTTBR_EL2 correctly gets reprogrammed before entering the EL1&0 regime, there is a period of time where hardware is configured with VMID 0. That's completely insane. So, rather than decorating the 'late' binding with another hack, just allocate the damn thing up front. Attaching a VMID from vcpu_load() is still rollover safe since (surprise!) it'll always get called after a vCPU was preempted. Excuse me while I go find a brown paper bag. Cc: stable@vger.kernel.org Fixes: 934bf871f011 ("KVM: arm64: Load the stage-2 MMU context in kvm_vcpu_load_vhe()") Reported-by: Vladimir Murzin Signed-off-by: Oliver Upton Link: https://lore.kernel.org/r/20250219220737.130842-1-oliver.upton@linux.dev Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 2 +- arch/arm64/kvm/arm.c | 22 ++++++++++------------ arch/arm64/kvm/vmid.c | 11 +++-------- 3 files changed, 14 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3a7ec98ef123..d919557af5e5 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1259,7 +1259,7 @@ int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu, extern unsigned int __ro_after_init kvm_arm_vmid_bits; int __init kvm_arm_vmid_alloc_init(void); void __init kvm_arm_vmid_alloc_free(void); -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid); void kvm_arm_vmid_clear_active(void); static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index bc7a37cea242..0160b4924351 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -559,6 +559,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) mmu = vcpu->arch.hw_mmu; last_ran = this_cpu_ptr(mmu->last_vcpu_ran); + /* + * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2, + * which happens eagerly in VHE. + * + * Also, the VMID allocator only preserves VMIDs that are active at the + * time of rollover, so KVM might need to grab a new VMID for the MMU if + * this is called from kvm_sched_in(). + */ + kvm_arm_vmid_update(&mmu->vmid); + /* * We guarantee that both TLBs and I-cache are private to each * vcpu. If detecting that a vcpu from the same VM has @@ -1138,18 +1148,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ preempt_disable(); - /* - * The VMID allocator only tracks active VMIDs per - * physical CPU, and therefore the VMID allocated may not be - * preserved on VMID roll-over if the task was preempted, - * making a thread's VMID inactive. So we need to call - * kvm_arm_vmid_update() in non-premptible context. 
- */ - if (kvm_arm_vmid_update(&vcpu->arch.hw_mmu->vmid) && - has_vhe()) - __load_stage2(vcpu->arch.hw_mmu, - vcpu->arch.hw_mmu->arch); - kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); diff --git a/arch/arm64/kvm/vmid.c b/arch/arm64/kvm/vmid.c index 806223b7022a..7fe8ba1a2851 100644 --- a/arch/arm64/kvm/vmid.c +++ b/arch/arm64/kvm/vmid.c @@ -135,11 +135,10 @@ void kvm_arm_vmid_clear_active(void) atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID); } -bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) +void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) { unsigned long flags; u64 vmid, old_active_vmid; - bool updated = false; vmid = atomic64_read(&kvm_vmid->id); @@ -157,21 +156,17 @@ bool kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid) if (old_active_vmid != 0 && vmid_gen_match(vmid) && 0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids), old_active_vmid, vmid)) - return false; + return; raw_spin_lock_irqsave(&cpu_vmid_lock, flags); /* Check that our VMID belongs to the current generation. */ vmid = atomic64_read(&kvm_vmid->id); - if (!vmid_gen_match(vmid)) { + if (!vmid_gen_match(vmid)) vmid = new_vmid(kvm_vmid); - updated = true; - } atomic64_set(this_cpu_ptr(&active_vmids), vmid); raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags); - - return updated; } /* -- cgit v1.2.3 From d252435aca44d647d57b84de5108556f9c97614a Mon Sep 17 00:00:00 2001 From: BillXiang Date: Fri, 21 Feb 2025 18:45:38 +0800 Subject: riscv: KVM: Remove unnecessary vcpu kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the unnecessary kick to the vCPU after writing to the vs_file of IMSIC in kvm_riscv_vcpu_aia_imsic_inject. For vCPUs that are running, writing to the vs_file directly forwards the interrupt as an MSI to them and does not need an extra kick. For vCPUs that are descheduled after emulating WFI, KVM will enable the guest external interrupt for that vCPU in kvm_riscv_aia_wakeon_hgei. This means that writing to the vs_file will cause a guest external interrupt, which will cause KVM to wake up the vCPU in hgei_interrupt to handle the interrupt properly. Signed-off-by: BillXiang Reviewed-by: Andrew Jones Reviewed-by: Radim Krčmář Link: https://lore.kernel.org/r/20250221104538.2147-1-xiangwencheng@lanxincomputing.com Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_imsic.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index a8085cd8215e..29ef9c2133a9 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -974,7 +974,6 @@ int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, if (imsic->vsfile_cpu >= 0) { writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE); - kvm_vcpu_kick(vcpu); } else { eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)]; set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip); -- cgit v1.2.3 From 517120728484df1ab8b71cba8d2cad19f52f18a1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 19 Feb 2025 22:01:24 -0800 Subject: x86/cpufeatures: Make AVX-VNNI depend on AVX The 'noxsave' boot option disables support for AVX, but support for the AVX-VNNI feature was still declared on CPUs that support it. Fix this. 
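The fix itself is a one-line table entry (see the diff below); for readers unfamiliar with the mechanism, the point of such a dependency table is that clearing a base feature must transitively clear everything listed as depending on it. A simplified, self-contained sketch of that idea (editorial illustration only, not the kernel's actual code):

    #include <stdbool.h>
    #include <stddef.h>

    struct dep { int feature; int depends_on; };

    /* mirrors entries like { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX } */
    static const struct dep deps[] = {
            { 1 /* "AVX_VNNI" */, 0 /* "AVX" */ },
            { 2 /* "FMA"      */, 0 /* "AVX" */ },
    };

    static void clear_feature(bool caps[], int feature)
    {
            caps[feature] = false;

            /* transitively drop every feature that depends on the one cleared */
            for (size_t i = 0; i < sizeof(deps) / sizeof(deps[0]); i++)
                    if (deps[i].depends_on == feature && caps[deps[i].feature])
                            clear_feature(caps, deps[i].feature);
    }

With the new entry in place, disabling AVX (e.g. via 'noxsave') also hides AVX-VNNI instead of leaving it advertised.
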
Signed-off-by: Eric Biggers Signed-off-by: Ingo Molnar Cc: Dave Hansen Link: https://lore.kernel.org/r/20250220060124.89622-1-ebiggers@kernel.org --- arch/x86/kernel/cpu/cpuid-deps.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index 8bd84114c2d9..df838e3bdbe0 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -45,6 +45,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_AES, X86_FEATURE_XMM2 }, { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, { X86_FEATURE_GFNI, X86_FEATURE_XMM2 }, + { X86_FEATURE_AVX_VNNI, X86_FEATURE_AVX }, { X86_FEATURE_FMA, X86_FEATURE_AVX }, { X86_FEATURE_VAES, X86_FEATURE_AVX }, { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX }, -- cgit v1.2.3 From a9ebcb88136ca80cb53de27ca5ae77de18bbe368 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 17 Feb 2025 18:38:20 +0200 Subject: mm/memremap: Pass down MEMREMAP_* flags to arch_memremap_wb() x86 version of arch_memremap_wb() needs the flags to decide if the mapping has to be encrypted or decrypted. Pass down the flag to arch_memremap_wb(). All current implementations ignore the argument. Signed-off-by: Kirill A. Shutemov Signed-off-by: Ingo Molnar Cc: Andrew Morton Cc: Dave Hansen Cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20250217163822.343400-2-kirill.shutemov@linux.intel.com --- arch/arm/include/asm/io.h | 2 +- arch/arm/mm/ioremap.c | 2 +- arch/arm/mm/nommu.c | 2 +- arch/riscv/include/asm/io.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 1815748f5d2a..bae5edf348ef 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -381,7 +381,7 @@ void __iomem *ioremap_wc(resource_size_t res_cookie, size_t size); void iounmap(volatile void __iomem *io_addr); #define iounmap iounmap -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); #define arch_memremap_wb arch_memremap_wb /* diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 89f1c97f3079..748698e91a4b 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -436,7 +436,7 @@ void __arm_iomem_set_ro(void __iomem *ptr, size_t size) set_memory_ro((unsigned long)ptr, PAGE_ALIGN(size) / PAGE_SIZE); } -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (__force void *)arch_ioremap_caller(phys_addr, size, MT_MEMORY_RW, diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 1a8f6914ee59..d638cc87807e 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -248,7 +248,7 @@ void __iomem *pci_remap_cfgspace(resource_size_t res_cookie, size_t size) EXPORT_SYMBOL_GPL(pci_remap_cfgspace); #endif -void *arch_memremap_wb(phys_addr_t phys_addr, size_t size) +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) { return (void *)phys_addr; } diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 1c5c641075d2..0257f4aa7ff4 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -136,7 +136,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) #include #ifdef CONFIG_MMU -#define arch_memremap_wb(addr, size) \ +#define arch_memremap_wb(addr, size, flags) \ ((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL)) #endif -- cgit 
v1.2.3 From 81256a50aa0fddefbf4849db8cad9f70c5167c04 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 17 Feb 2025 18:38:21 +0200 Subject: x86/mm: Make memremap(MEMREMAP_WB) map memory as encrypted by default Currently memremap(MEMREMAP_WB) can produce decrypted/shared mapping: memremap(MEMREMAP_WB) arch_memremap_wb() ioremap_cache() __ioremap_caller(.encrytped = false) In such cases, the IORES_MAP_ENCRYPTED flag on the memory will determine if the resulting mapping is encrypted or decrypted. Creating a decrypted mapping without explicit request from the caller is risky: - It can inadvertently expose the guest's data and compromise the guest. - Accessing private memory via shared/decrypted mapping on TDX will either trigger implicit conversion to shared or #VE (depending on VMM implementation). Implicit conversion is destructive: subsequent access to the same memory via private mapping will trigger a hard-to-debug #VE crash. The kernel already provides a way to request decrypted mapping explicitly via the MEMREMAP_DEC flag. Modify memremap(MEMREMAP_WB) to produce encrypted/private mapping by default unless MEMREMAP_DEC is specified or if the kernel runs on a machine with SME enabled. It fixes the crash due to #VE on kexec in TDX guests if CONFIG_EISA is enabled. Signed-off-by: Kirill A. Shutemov Signed-off-by: Ingo Molnar Cc: Andrew Morton Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: linux-mm@kvack.org Link: https://lore.kernel.org/r/20250217163822.343400-3-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/io.h | 3 +++ arch/x86/mm/ioremap.c | 8 ++++++++ 2 files changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ed580c7f9d0a..1a0dc2b2bf5b 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -175,6 +175,9 @@ extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, un extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags); +#define arch_memremap_wb arch_memremap_wb + /** * ioremap - map bus memory into CPU space * @offset: bus address of the memory diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 38ff7791a9c7..42c90b420773 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -503,6 +503,14 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +void *arch_memremap_wb(phys_addr_t phys_addr, size_t size, unsigned long flags) +{ + if ((flags & MEMREMAP_DEC) || cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (void __force *)ioremap_cache(phys_addr, size); + + return (void __force *)ioremap_encrypted(phys_addr, size); +} + /* * Convert a physical pointer to a virtual kernel pointer for /dev/mem * access -- cgit v1.2.3 From 64aad4749d7911f8c5e69d93a929a269605dd3cb Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 16 Feb 2025 14:26:14 +0200 Subject: ACPI/processor_idle: Export acpi_processor_ffh_play_dead() The kernel test robot reported the following build error: >> ERROR: modpost: "acpi_processor_ffh_play_dead" [drivers/acpi/processor.ko] undefined! Caused by this recently merged commit: 541ddf31e300 ("ACPI/processor_idle: Add FFH state handling") The build failure is due to an oversight in the 'CONFIG_ACPI_PROCESSOR=m' case, the function export is missing. Add it. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202502151207.FA9UO1iX-lkp@intel.com/ Fixes: 541ddf31e300 ("ACPI/processor_idle: Add FFH state handling") Signed-off-by: Artem Bityutskiy Signed-off-by: Ingo Molnar Cc: Dave Hansen Link: https://lore.kernel.org/r/de5bf4f116779efde315782a15146fdc77a4a044.camel@linux.intel.com --- arch/x86/kernel/acpi/cstate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 5bdb65516969..86c87c01d23d 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -214,6 +214,7 @@ void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); mwait_play_dead(percpu_entry->states[cx->index].eax); } +EXPORT_SYMBOL_GPL(acpi_processor_ffh_play_dead); void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { -- cgit v1.2.3 From 282f395244df3663dc24e97a86087431c9192513 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Sat, 15 Feb 2025 12:52:49 +0000 Subject: x86/mm: Replace open-coded gap bounding with clamp() Rather than manually bounding gap between gap_min and gap_max, use the well-known clamp() macro to make the code easier to read. Signed-off-by: Qasim Ijaz Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250215125249.10729-1-qasdev00@gmail.com --- arch/x86/mm/mmap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index b8a6ffffb451..5ed2109211da 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -84,7 +84,6 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; - unsigned long gap_min, gap_max; /* Values close to RLIM_INFINITY can overflow. */ if (gap + pad > gap) @@ -94,13 +93,7 @@ static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, * Top of mmap area (just below the process stack). * Leave an at least ~128 MB hole with possible stack randomization. */ - gap_min = SIZE_128M; - gap_max = (task_size / 6) * 5; - - if (gap < gap_min) - gap = gap_min; - else if (gap > gap_max) - gap = gap_max; + gap = clamp(gap, SIZE_128M, (task_size / 6) * 5); return PAGE_ALIGN(task_size - gap - rnd); } -- cgit v1.2.3 From a37259732a7dc33047fa1e4f9a338088f452e017 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Feb 2025 11:13:52 -0500 Subject: x86/mm: Make MMU_GATHER_RCU_TABLE_FREE unconditional Currently x86 uses CONFIG_MMU_GATHER_TABLE_FREE when using paravirt, and not when running on bare metal. There is no real good reason to do things differently for each setup. Make them all the same. Currently get_user_pages_fast synchronizes against page table freeing in two different ways: - on bare metal, by blocking IRQs, which block TLB flush IPIs - on paravirt, with MMU_GATHER_RCU_TABLE_FREE This is done because some paravirt TLB flush implementations handle the TLB flush in the hypervisor, and will do the flush even when the target CPU has interrupts disabled. Always handle page table freeing with MMU_GATHER_RCU_TABLE_FREE. Using RCU synchronization between page table freeing and get_user_pages_fast() allows bare metal to also do TLB flushing while interrupts are disabled. Various places in the mm do still block IRQs or disable preemption as an implicit way to block RCU frees. 
That makes it safe to use INVLPGB on AMD CPUs. Suggested-by: Peter Zijlstra Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Tested-by: Manali Shukla Tested-by: Brendan Jackman Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250213161423.449435-2-riel@surriel.com --- arch/x86/Kconfig | 2 +- arch/x86/kernel/paravirt.c | 17 +---------------- arch/x86/mm/pgtable.c | 27 ++++----------------------- 3 files changed, 6 insertions(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c4175f4635ee..d581634c6a59 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -278,7 +278,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_MERGE_VMAS select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ccaa3397a67..527f5605aa3e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,21 +59,6 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } -#ifndef CONFIG_PT_RECLAIM -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; @@ -195,7 +180,7 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = native_tlb_remove_table, + .mmu.tlb_remove_table = tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 1fef5ad32d5a..b1c1f72c1fd1 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -18,25 +18,6 @@ EXPORT_SYMBOL(physical_mask); #define PGTABLE_HIGHMEM 0 #endif -#ifndef CONFIG_PARAVIRT -#ifndef CONFIG_PT_RECLAIM -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct ptdesc *ptdesc = (struct ptdesc *)table; - - pagetable_dtor(ptdesc); - tlb_remove_page(tlb, ptdesc_page(ptdesc)); -} -#else -static inline -void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - tlb_remove_table(tlb, table); -} -#endif /* !CONFIG_PT_RECLAIM */ -#endif /* !CONFIG_PARAVIRT */ - gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) @@ -64,7 +45,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); - paravirt_tlb_remove_table(tlb, page_ptdesc(pte)); + tlb_remove_table(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 @@ -78,21 +59,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pmd)); + tlb_remove_table(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(pud)); + 
tlb_remove_table(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); - paravirt_tlb_remove_table(tlb, virt_to_ptdesc(p4d)); + tlb_remove_table(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ -- cgit v1.2.3 From f2c5c21058270167ce23172022da083b62e5ad4c Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Feb 2025 11:13:53 -0500 Subject: x86/mm: Remove pv_ops.mmu.tlb_remove_table call Every pv_ops.mmu.tlb_remove_table call ends up calling tlb_remove_table. Get rid of the indirection by simply calling tlb_remove_table directly, and not going through the paravirt function pointers. Suggested-by: Qi Zheng Signed-off-by: Rik van Riel Signed-off-by: Ingo Molnar Tested-by: Manali Shukla Tested-by: Brendan Jackman Tested-by: Michael Kelley Link: https://lore.kernel.org/r/20250213161423.449435-3-riel@surriel.com --- arch/x86/hyperv/mmu.c | 1 - arch/x86/include/asm/paravirt.h | 5 ----- arch/x86/include/asm/paravirt_types.h | 2 -- arch/x86/kernel/kvm.c | 1 - arch/x86/kernel/paravirt.c | 1 - arch/x86/xen/mmu_pv.c | 1 - 6 files changed, 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c index cc8c3bd0e7c2..1f7c3082a36d 100644 --- a/arch/x86/hyperv/mmu.c +++ b/arch/x86/hyperv/mmu.c @@ -239,5 +239,4 @@ void hyperv_setup_mmu_ops(void) pr_info("Using hypercall for remote TLB flush\n"); pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; } diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 041aff51eb50..38a632a282d4 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } -static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); -} - static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index fea56b04f436..e26633c00455 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -134,8 +134,6 @@ struct pv_mmu_ops { void (*flush_tlb_multi)(const struct cpumask *cpus, const struct flush_tlb_info *info); - void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); - /* Hook for intercepting the destruction of an mm_struct. 
*/ void (*exit_mmap)(struct mm_struct *mm); void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 7a422a6c5983..3be9b3342c67 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) #ifdef CONFIG_SMP if (pv_tlb_flush_supported()) { pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; - pv_ops.mmu.tlb_remove_table = tlb_remove_table; pr_info("KVM setup pv remote TLB flush\n"); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 527f5605aa3e..2aa251d0b308 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -180,7 +180,6 @@ struct paravirt_patch_template pv_ops = { .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, - .mmu.tlb_remove_table = tlb_remove_table, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index d078de2c952b..38971c6dcd4b 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, -- cgit v1.2.3 From 3fcae7771fb724c276e87e80827b264d2c3ad67e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 11 Feb 2025 16:57:21 +0200 Subject: x86/pat: Fix W=1 build warning when the within_inclusive() function is unused The within_inclusive() function, in some cases, when CONFIG_X86_64=n, may be not used. This, in particular, prevents kernel builds with Clang, `make W=1` and CONFIG_WERROR=y: arch/x86/mm/pat/set_memory.c:215:1: error: unused function 'within_inclusive' [-Werror,-Wunused-function] Fix this by guarding the definitions with the respective ifdeffery. See also: 6863f5643dd7 ("kbuild: allow Clang to find unused static inline functions for W=1 build") Signed-off-by: Andy Shevchenko Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250211145721.1620552-1-andriy.shevchenko@linux.intel.com --- arch/x86/mm/pat/set_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 7bd0f62ba48f..84d0bca3be28 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -225,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +#ifdef CONFIG_X86_64 + static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } -#ifdef CONFIG_X86_64 - /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): -- cgit v1.2.3 From 7ffb791423c7c518269a9aad35039ef824a40adb Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 7 Feb 2025 10:42:34 +1100 Subject: x86/kaslr: Reduce KASLR entropy on most x86 systems When CONFIG_PCI_P2PDMA=y (which is basically enabled on all large x86 distros), it maps the PFN's via a ZONE_DEVICE mapping using devm_memremap_pages(). The mapped virtual address range corresponds to the pci_resource_start() of the BAR address and size corresponding to the BAR length. 
When KASLR is enabled, the direct map range of the kernel is reduced to the size of physical memory plus additional padding. If the BAR address is beyond this limit, PCI peer to peer DMA mappings fail. Fix this by not shrinking the size of the direct map when CONFIG_PCI_P2PDMA=y. This reduces the total available entropy, but it's better than the current work around of having to disable KASLR completely. [ mingo: Clarified the changelog to point out the broad impact ... ] Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Acked-by: Bjorn Helgaas # drivers/pci/Kconfig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Andy Lutomirski Link: https://lore.kernel.org/lkml/20250206023201.1481957-1-balbirs@nvidia.com/ Link: https://lore.kernel.org/r/20250206234234.1912585-1-balbirs@nvidia.com -- arch/x86/mm/kaslr.c | 10 ++++++++-- drivers/pci/Kconfig | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) --- arch/x86/mm/kaslr.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 11a93542d198..3c306de52fd4 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -113,8 +113,14 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt physical memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + /* + * Adapt physical memory region size based on available memory, + * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the + * device BAR space assuming the direct map space is large enough + * for creating a ZONE_DEVICE mapping in the direct map corresponding + * to the physical BAR address. + */ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb)) kaslr_regions[0].size_tb = memory_tb; /* -- cgit v1.2.3 From 43bb700cff6bc2f0d337006b864192227fb05dc1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jan 2025 17:22:52 +0100 Subject: x86/cpu: Update Intel Family comments Because who can ever remember all these names. 
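For context (illustration only, nothing here is added by the patch): these defines are consumed via CPU match tables, so keeping the microarchitecture comments accurate helps when auditing such tables. A minimal sketch of that usage, mirroring the real rapl_model_match[] table that appears further down in this series; the table name is made up:

    #include <asm/cpu_device_id.h>
    #include <asm/intel-family.h>

    static const struct x86_cpu_id example_cpu_ids[] = {
            X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, NULL),     /* Redwood Cove */
            X86_MATCH_VFM(INTEL_LUNARLAKE_M, NULL),         /* Lion Cove / Skymont */
            { }
    };
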
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250127162252.GK16742@noisy.programming.kicks-ass.net --- arch/x86/include/asm/intel-family.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index 8359113e3e58..f9f67afeb48a 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -110,9 +110,9 @@ #define INTEL_SAPPHIRERAPIDS_X IFM(6, 0x8F) /* Golden Cove */ -#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) +#define INTEL_EMERALDRAPIDS_X IFM(6, 0xCF) /* Raptor Cove */ -#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) +#define INTEL_GRANITERAPIDS_X IFM(6, 0xAD) /* Redwood Cove */ #define INTEL_GRANITERAPIDS_D IFM(6, 0xAE) /* "Hybrid" Processors (P-Core/E-Core) */ @@ -126,16 +126,16 @@ #define INTEL_RAPTORLAKE_P IFM(6, 0xBA) #define INTEL_RAPTORLAKE_S IFM(6, 0xBF) -#define INTEL_METEORLAKE IFM(6, 0xAC) +#define INTEL_METEORLAKE IFM(6, 0xAC) /* Redwood Cove / Crestmont */ #define INTEL_METEORLAKE_L IFM(6, 0xAA) -#define INTEL_ARROWLAKE_H IFM(6, 0xC5) +#define INTEL_ARROWLAKE_H IFM(6, 0xC5) /* Lion Cove / Skymont */ #define INTEL_ARROWLAKE IFM(6, 0xC6) #define INTEL_ARROWLAKE_U IFM(6, 0xB5) -#define INTEL_LUNARLAKE_M IFM(6, 0xBD) +#define INTEL_LUNARLAKE_M IFM(6, 0xBD) /* Lion Cove / Skymont */ -#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) +#define INTEL_PANTHERLAKE_L IFM(6, 0xCC) /* Cougar Cove / Crestmont */ /* "Small Core" Processors (Atom/E-Core) */ -- cgit v1.2.3 From 88ec7eedbbd21cad38707620ad6c48a4e9a87c18 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Jan 2025 07:19:11 -0800 Subject: perf/x86: Fix low freqency setting issue Perf doesn't work at low frequencies: $ perf record -e cpu_core/instructions/ppp -F 120 Error: The sys_perf_event_open() syscall returned with 22 (Invalid argument) for event (cpu_core/instructions/ppp). "dmesg | grep -i perf" may provide additional information. The limit_period() check avoids a low sampling period on a counter. It doesn't intend to limit the frequency. The check in the x86_pmu_hw_config() should be limited to non-freq mode. The attr.sample_period and attr.sample_freq are union. The attr.sample_period should not be used to indicate the frequency mode. 
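To make the mode distinction concrete, this is roughly how user space requests a frequency-mode event; sample_period and sample_freq overlay each other, which is why attr.sample_period alone cannot identify the mode (minimal sketch, error handling omitted):

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <string.h>
    #include <unistd.h>

    static int open_freq_event(void)
    {
            struct perf_event_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.freq = 1;              /* frequency mode selected here ...           */
            attr.sample_freq = 120;     /* ... so this union member means 120 Hz,
                                         * not a sample period                        */

            return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
    }
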
Fixes: c46e665f0377 ("perf/x86: Add INST_RETIRED.ALL workarounds") Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Reviewed-by: Ravi Bangoria Cc: Peter Zijlstra Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250117151913.3043942-1-kan.liang@linux.intel.com Closes: https://lore.kernel.org/lkml/20250115154949.3147-1-ravi.bangoria@amd.com/ --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8f218ac0d445..2092d615333d 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -628,7 +628,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (event->attr.sample_period && x86_pmu.limit_period) { + if (!event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) -- cgit v1.2.3 From a26b24b2e21f6222635a95426b9ef9eec63d69b1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Fri, 17 Jan 2025 07:19:13 -0800 Subject: perf/x86/intel: Use better start period for frequency mode Freqency mode is the current default mode of Linux perf. A period of 1 is used as a starting period. The period is auto-adjusted on each tick or an overflow, to meet the frequency target. The start period of 1 is too low and may trigger some issues: - Many HWs do not support period 1 well. https://lore.kernel.org/lkml/875xs2oh69.ffs@tglx/ - For an event that occurs frequently, period 1 is too far away from the real period. Lots of samples are generated at the beginning. The distribution of samples may not be even. - A low starting period for frequently occurring events also challenges virtualization, which has a longer path to handle a PMI. The limit_period value only checks the minimum acceptable value for HW. It cannot be used to set the start period, because some events may need a very low period. The limit_period cannot be set too high. It doesn't help with the events that occur frequently. It's hard to find a universal starting period for all events. The idea implemented by this patch is to only give an estimate for the popular HW and HW cache events. For the rest of the events, start from the lowest possible recommended value. Signed-off-by: Kan Liang Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250117151913.3043942-3-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 85 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cdcebf30468a..cdb19e3ba3aa 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3952,6 +3952,85 @@ static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) return test_bit(idx, (unsigned long *)&intel_cap->capabilities); } +static u64 intel_pmu_freq_start_period(struct perf_event *event) +{ + int type = event->attr.type; + u64 config, factor; + s64 start; + + /* + * The 127 is the lowest possible recommended SAV (sample after value) + * for a 4000 freq (default freq), according to the event list JSON file. + * Also, assume the workload is idle 50% time. + */ + factor = 64 * 4000; + if (type != PERF_TYPE_HARDWARE && type != PERF_TYPE_HW_CACHE) + goto end; + + /* + * The estimation of the start period in the freq mode is + * based on the below assumption. 
+ * + * For a cycles or an instructions event, 1GHZ of the + * underlying platform, 1 IPC. The workload is idle 50% time. + * The start period = 1,000,000,000 * 1 / freq / 2. + * = 500,000,000 / freq + * + * Usually, the branch-related events occur less than the + * instructions event. According to the Intel event list JSON + * file, the SAV (sample after value) of a branch-related event + * is usually 1/4 of an instruction event. + * The start period of branch-related events = 125,000,000 / freq. + * + * The cache-related events occurs even less. The SAV is usually + * 1/20 of an instruction event. + * The start period of cache-related events = 25,000,000 / freq. + */ + config = event->attr.config & PERF_HW_EVENT_MASK; + if (type == PERF_TYPE_HARDWARE) { + switch (config) { + case PERF_COUNT_HW_CPU_CYCLES: + case PERF_COUNT_HW_INSTRUCTIONS: + case PERF_COUNT_HW_BUS_CYCLES: + case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND: + case PERF_COUNT_HW_STALLED_CYCLES_BACKEND: + case PERF_COUNT_HW_REF_CPU_CYCLES: + factor = 500000000; + break; + case PERF_COUNT_HW_BRANCH_INSTRUCTIONS: + case PERF_COUNT_HW_BRANCH_MISSES: + factor = 125000000; + break; + case PERF_COUNT_HW_CACHE_REFERENCES: + case PERF_COUNT_HW_CACHE_MISSES: + factor = 25000000; + break; + default: + goto end; + } + } + + if (type == PERF_TYPE_HW_CACHE) + factor = 25000000; +end: + /* + * Usually, a prime or a number with less factors (close to prime) + * is chosen as an SAV, which makes it less likely that the sampling + * period synchronizes with some periodic event in the workload. + * Minus 1 to make it at least avoiding values near power of twos + * for the default freq. + */ + start = DIV_ROUND_UP_ULL(factor, event->attr.sample_freq) - 1; + + if (start > x86_pmu.max_period) + start = x86_pmu.max_period; + + if (x86_pmu.limit_period) + x86_pmu.limit_period(event, &start); + + return start; +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3963,6 +4042,12 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.freq && event->attr.sample_freq) { + event->hw.sample_period = intel_pmu_freq_start_period(event); + event->hw.last_period = event->hw.sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + } + if (event->attr.precise_ip) { if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) return -EINVAL; -- cgit v1.2.3 From 8e8f0306497dea58fb4e8e2558949daae5eeac5c Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 25 Feb 2025 14:16:19 +0100 Subject: x86/mtrr: Remove unnecessary strlen() in mtrr_write() The local variable length already holds the string length after calling strncpy_from_user(). Using another local variable linlen and calling strlen() is therefore unnecessary and can be removed. Remove linlen and strlen() and use length instead. No change in functionality intended. 
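The property relied on here is that strncpy_from_user() already returns the length of the copied string (excluding the terminating NUL) on success, or a negative error code; condensed from the diff below, the resulting flow in mtrr_write() becomes:

    length = strncpy_from_user(line, buf, LINE_SIZE - 1);
    if (length < 0)
            return length;

    /* 'length' is already strlen(line); strip a trailing newline if present */
    ptr = line + length - 1;
    if (length && *ptr == '\n')
            *ptr = '\0';
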
Signed-off-by: Thorsten Blum Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250225131621.329699-2-thorsten.blum@linux.dev --- arch/x86/kernel/cpu/mtrr/if.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index a5c506f6da7f..4049235b1bfe 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -99,7 +99,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) char *ptr; char line[LINE_SIZE]; int length; - size_t linelen; memset(line, 0, LINE_SIZE); @@ -108,9 +107,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) if (length < 0) return length; - linelen = strlen(line); - ptr = line + linelen - 1; - if (linelen && *ptr == '\n') + ptr = line + length - 1; + if (length && *ptr == '\n') *ptr = '\0'; if (!strncmp(line, "disable=", 8)) { -- cgit v1.2.3 From 96f41f644c4885761b0d117fc36dc5dcf92e15ec Mon Sep 17 00:00:00 2001 From: Dmytro Maluka Date: Sun, 5 Jan 2025 17:27:40 +0000 Subject: x86/of: Don't use DTB for SMP setup if ACPI is enabled There are cases when it is useful to use both ACPI and DTB provided by the bootloader, however in such cases we should make sure to prevent conflicts between the two. Namely, don't try to use DTB for SMP setup if ACPI is enabled. Precisely, this prevents at least: - incorrectly calling register_lapic_address(APIC_DEFAULT_PHYS_BASE) after the LAPIC was already successfully enumerated via ACPI, causing noisy kernel warnings and probably potential real issues as well - failed IOAPIC setup in the case when IOAPIC is enumerated via mptable instead of ACPI (e.g. with acpi=noirq), due to mpparse_parse_smp_config() overridden by x86_dtb_parse_smp_config() Signed-off-by: Dmytro Maluka Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250105172741.3476758-2-dmaluka@chromium.org --- arch/x86/kernel/devicetree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 59d23cdf4ed0..dd8748c45529 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ /* * Architecture specific OF callbacks. */ +#include #include #include #include @@ -313,6 +314,6 @@ void __init x86_flattree_get_config(void) if (initial_dtb) early_memunmap(dt, map_len); #endif - if (of_have_populated_dt()) + if (acpi_disabled && of_have_populated_dt()) x86_init.mpparse.parse_smp_cfg = x86_dtb_parse_smp_config; } -- cgit v1.2.3 From bebe35bb738b573c32a5033499cd59f20293f2a3 Mon Sep 17 00:00:00 2001 From: Russell Senior Date: Tue, 25 Feb 2025 22:31:20 +0100 Subject: x86/CPU: Fix warm boot hang regression on AMD SC1100 SoC systems I still have some Soekris net4826 in a Community Wireless Network I volunteer with. These devices use an AMD SC1100 SoC. I am running OpenWrt on them, which uses a patched kernel, that naturally has evolved over time. I haven't updated the ones in the field in a number of years (circa 2017), but have one in a test bed, where I have intermittently tried out test builds. A few years ago, I noticed some trouble, particularly when "warm booting", that is, doing a reboot without removing power, and noticed the device was hanging after the kernel message: [ 0.081615] Working around Cyrix MediaGX virtual DMA bugs. 
If I removed power and then restarted, it would boot fine, continuing through the message above, thusly: [ 0.081615] Working around Cyrix MediaGX virtual DMA bugs. [ 0.090076] Enable Memory-Write-back mode on Cyrix/NSC processor. [ 0.100000] Enable Memory access reorder on Cyrix/NSC processor. [ 0.100070] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0 [ 0.110058] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0, 1GB 0 [ 0.120037] CPU: NSC Geode(TM) Integrated Processor by National Semi (family: 0x5, model: 0x9, stepping: 0x1) [...] In order to continue using modern tools, like ssh, to interact with the software on these old devices, I need modern builds of the OpenWrt firmware on the devices. I confirmed that the warm boot hang was still an issue in modern OpenWrt builds (currently using a patched linux v6.6.65). Last night, I decided it was time to get to the bottom of the warm boot hang, and began bisecting. From preserved builds, I narrowed down the bisection window from late February to late May 2019. During this period, the OpenWrt builds were using 4.14.x. I was able to build using period-correct Ubuntu 18.04.6. After a number of bisection iterations, I identified a kernel bump from 4.14.112 to 4.14.113 as the commit that introduced the warm boot hang. https://github.com/openwrt/openwrt/commit/07aaa7e3d62ad32767d7067107db64b6ade81537 Looking at the upstream changes in the stable kernel between 4.14.112 and 4.14.113 (tig v4.14.112..v4.14.113), I spotted a likely suspect: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=20afb90f730982882e65b01fb8bdfe83914339c5 So, I tried reverting just that kernel change on top of the breaking OpenWrt commit, and my warm boot hang went away. Presumably, the warm boot hang is due to some register not getting cleared in the same way that a loss of power does. That is approximately as much as I understand about the problem. More poking/prodding and coaching from Jonas Gorski, it looks like this test patch fixes the problem on my board: Tested against v6.6.67 and v4.14.113. 
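In register terms the change is tiny; reading the masks against the in-code comments (bit names taken from those comments, not from a datasheet):

    /* before: 0x88 = suspend-on-HLT power saving (0x08) plus #SUSP pin enable (0x80) */
    setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);

    /* after: keep only suspend-on-HLT; dropping the #SUSP pin enable bit is what
     * lets the AMD SC1100 boards above survive a warm boot
     */
    setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08);
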
Fixes: 18fb053f9b82 ("x86/cpu/cyrix: Use correct macros for Cyrix calls on Geode processors") Debugged-by: Jonas Gorski Signed-off-by: Russell Senior Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/CAHP3WfOgs3Ms4Z+L9i0-iBOE21sdMk5erAiJurPjnrL9LSsgRA@mail.gmail.com Cc: Matthew Whitehead Cc: Thomas Gleixner --- arch/x86/kernel/cpu/cyrix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 9651275aecd1..dfec2c61e354 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -153,8 +153,8 @@ static void geode_configure(void) u8 ccr3; local_irq_save(flags); - /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + /* Suspend on halt power saving */ + setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x08); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ -- cgit v1.2.3 From 9de7695925d5d2d2085681ba935857246eb2817d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Feb 2025 22:32:33 +0100 Subject: x86/irq: Define trace events conditionally When both of X86_LOCAL_APIC and X86_THERMAL_VECTOR are disabled, the irq tracing produces a W=1 build warning for the tracing definitions: In file included from include/trace/trace_events.h:27, from include/trace/define_trace.h:113, from arch/x86/include/asm/trace/irq_vectors.h:383, from arch/x86/kernel/irq.c:29: include/trace/stages/init.h:2:23: error: 'str__irq_vectors__trace_system_name' defined but not used [-Werror=unused-const-variable=] Make the tracepoints conditional on the same symbosl that guard their usage. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250225213236.3141752-1-arnd@kernel.org --- arch/x86/kernel/irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 385e3a5fc304..feca4f20b06a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -25,8 +25,10 @@ #include #include +#if defined(CONFIG_X86_LOCAL_APIC) || defined(CONFIG_X86_THERMAL_VECTOR) #define CREATE_TRACE_POINTS #include +#endif DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); -- cgit v1.2.3 From 68a9b0e313302451468c0b0eda53c383fa51a8f4 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Tue, 24 Dec 2024 22:55:16 +0800 Subject: perf/x86/rapl: Add support for Intel Arrow Lake U Add Arrow Lake U model for RAPL: $ ls -1 /sys/devices/power/events/ energy-cores energy-cores.scale energy-cores.unit energy-gpu energy-gpu.scale energy-gpu.unit energy-pkg energy-pkg.scale energy-pkg.unit energy-psys energy-psys.scale energy-psys.unit The same output as ArrowLake: $ perf stat -a -I 1000 --per-socket -e power/energy-pkg/ Signed-off-by: Aaron Ma Signed-off-by: Ingo Molnar Acked-by: Zhang Rui Link: https://lore.kernel.org/r/20241224145516.349028-1-aaron.ma@canonical.com --- arch/x86/events/rapl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 4952faf03e82..6941f4811bec 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -879,6 +879,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_VFM(INTEL_METEORLAKE_L, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE_H, &model_skl), X86_MATCH_VFM(INTEL_ARROWLAKE, &model_skl), + X86_MATCH_VFM(INTEL_ARROWLAKE_U, &model_skl), X86_MATCH_VFM(INTEL_LUNARLAKE_M, 
&model_skl), {}, }; -- cgit v1.2.3 From 0f6750b15ffdf274668b12824b09bd49ea854e18 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Thu, 19 Dec 2024 08:52:27 -0700 Subject: x86/entry: Fix kernel-doc warning The do_int80_emulation() function is missing a kernel-doc formatted description of its argument. This is causing a warning when building with W=1. Add a brief description of the argument to satisfy kernel-doc. Reported-by: kernel test robot Signed-off-by: Daniel Sneddon Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241219155227.685692-1-daniel.sneddon@linux.intel.com Closes: https://lore.kernel.org/oe-kbuild-all/202412131236.a5HhOqXo-lkp@intel.com/ --- arch/x86/entry/common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 94941c5a10ac..14db5b85114c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -190,6 +190,7 @@ static __always_inline bool int80_is_external(void) /** * do_int80_emulation - 32-bit legacy syscall C entry from asm + * @regs: syscall arguments in struct pt_args on the stack. * * This entry point can be used by 32-bit and 64-bit programs to perform * 32-bit system calls. Instances of INT $0x80 can be found inline in -- cgit v1.2.3 From 17bcd714426386fda741a4bccd96a2870179344b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 15:55:36 -0800 Subject: KVM: x86: Free vCPUs before freeing VM state Free vCPUs before freeing any VM state, as both SVM and VMX may access VM state when "freeing" a vCPU that is currently "in" L2, i.e. that needs to be kicked out of nested guest mode. Commit 6fcee03df6a1 ("KVM: x86: avoid loading a vCPU after .vm_destroy was called") partially fixed the issue, but for unknown reasons only moved the MMU unloading before VM destruction. Complete the change, and free all vCPU state prior to destroying VM state, as nVMX accesses even more state than nSVM. In addition to the AVIC, KVM can hit a use-after-free on MSR filters: kvm_msr_allowed+0x4c/0xd0 __kvm_set_msr+0x12d/0x1e0 kvm_set_msr+0x19/0x40 load_vmcs12_host_state+0x2d8/0x6e0 [kvm_intel] nested_vmx_vmexit+0x715/0xbd0 [kvm_intel] nested_vmx_free_vcpu+0x33/0x50 [kvm_intel] vmx_free_vcpu+0x54/0xc0 [kvm_intel] kvm_arch_vcpu_destroy+0x28/0xf0 kvm_vcpu_destroy+0x12/0x50 kvm_arch_destroy_vm+0x12c/0x1c0 kvm_put_kvm+0x263/0x3c0 kvm_vm_release+0x21/0x30 and an upcoming fix to process injectable interrupts on nested VM-Exit will access the PIC: BUG: kernel NULL pointer dereference, address: 0000000000000090 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page CPU: 23 UID: 1000 PID: 2658 Comm: kvm-nx-lpage-re RIP: 0010:kvm_cpu_has_extint+0x2f/0x60 [kvm] Call Trace: kvm_cpu_has_injectable_intr+0xe/0x60 [kvm] nested_vmx_vmexit+0x2d7/0xdf0 [kvm_intel] nested_vmx_free_vcpu+0x40/0x50 [kvm_intel] vmx_vcpu_free+0x2d/0x80 [kvm_intel] kvm_arch_vcpu_destroy+0x2d/0x130 [kvm] kvm_destroy_vcpus+0x8a/0x100 [kvm] kvm_arch_destroy_vm+0xa7/0x1d0 [kvm] kvm_destroy_vm+0x172/0x300 [kvm] kvm_vcpu_release+0x31/0x50 [kvm] Inarguably, both nSVM and nVMX need to be fixed, but punt on those cleanups for the moment. Conceptually, vCPUs should be freed before VM state. Assets like the I/O APIC and PIC _must_ be allocated before vCPUs are created, so it stands to reason that they must be freed _after_ vCPUs are destroyed. 
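Condensed from the hunk below, the teardown order after this change follows the usual reverse-of-creation rule: anything a vCPU can still dereference while being destroyed (MSR filter, PIC, IOAPIC) must be freed only after kvm_destroy_vcpus():

    kvm_unload_vcpu_mmus(kvm);
    kvm_destroy_vcpus(kvm);         /* may emulate a nested VM-Exit and touch VM state */
    kvm_x86_call(vm_destroy)(kvm);
    kvm_free_msr_filter(...);       /* now safe: no vCPU is left to consult the filter */
    kvm_pic_destroy(kvm);
    kvm_ioapic_destroy(kvm);
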
Reported-by: Aaron Lewis Closes: https://lore.kernel.org/all/20240703175618.2304869-2-aaronlewis@google.com Cc: Jim Mattson Cc: Yan Zhao Cc: Rick P Edgecombe Cc: Kai Huang Cc: Isaku Yamahata Signed-off-by: Sean Christopherson Message-ID: <20250224235542.2562848-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 02159c967d29..6fc4ddc606bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12877,11 +12877,11 @@ void kvm_arch_destroy_vm(struct kvm *kvm) mutex_unlock(&kvm->slots_lock); } kvm_unload_vcpu_mmus(kvm); + kvm_destroy_vcpus(kvm); kvm_x86_call(vm_destroy)(kvm); kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); - kvm_destroy_vcpus(kvm); kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); kvm_mmu_uninit_vm(kvm); -- cgit v1.2.3 From 982caaa1150479f022003390cd72a1941663d211 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 24 Feb 2025 15:55:37 -0800 Subject: KVM: nVMX: Process events on nested VM-Exit if injectable IRQ or NMI is pending Process pending events on nested VM-Exit if the vCPU has an injectable IRQ or NMI, as the event may have become pending while L2 was active, i.e. may not be tracked in the context of vmcs01. E.g. if L1 has passed its APIC through to L2 and an IRQ arrives while L2 is active, then KVM needs to request an IRQ window prior to running L1, otherwise delivery of the IRQ will be delayed until KVM happens to process events for some other reason. The missed failure is detected by vmx_apic_passthrough_tpr_threshold_test in KVM-Unit-Tests, but has effectively been masked due to a flaw in KVM's PIC emulation that causes KVM to make spurious KVM_REQ_EVENT requests (and apparently no one ever ran the test with split IRQ chips). Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250224235542.2562848-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 8a7af02d466e..ed8a3cb53961 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5084,6 +5084,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, load_vmcs12_host_state(vcpu, vmcs12); + /* + * Process events if an injectable IRQ or NMI is pending, even + * if the event is blocked (RFLAGS.IF is cleared on VM-Exit). + * If an event became pending while L2 was active, KVM needs to + * either inject the event or request an IRQ/NMI window. SMIs + * don't need to be processed as SMM is mutually exclusive with + * non-root mode. INIT/SIPI don't need to be checked as INIT + * is blocked post-VMXON, and SIPIs are ignored. + */ + if (kvm_cpu_has_injectable_intr(vcpu) || vcpu->arch.nmi_pending) + kvm_make_request(KVM_REQ_EVENT, vcpu); return; } -- cgit v1.2.3 From 2b1283e1ea9b5e0b06f075f79391a51d9f70749b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 25 Feb 2025 11:46:36 +0000 Subject: arm64/mm: Fix Boot panic on Ampere Altra When the range of present physical memory is sufficiently small enough and the reserved address space for the linear map is sufficiently large enough, The linear map base address is randomized in arm64_memblock_init(). 
Prior to commit 62cffa496aac ("arm64/mm: Override PARange for !LPA2 and use it consistently"), we decided if the sizes were suitable with the help of the raw mmfr0.parange. But the commit changed this to use the sanitized version instead. But the function runs before the register has been sanitized so this returns 0, interpreted as a parange of 32 bits. Some fun wrapping occurs and the logic concludes that there is enough room to randomize the linear map base address, when really there isn't. So the top of the linear map ends up outside the reserved address space. Since the PA range cannot be overridden in the first place, restore the mmfr0 reading logic to its state prior to 62cffa496aac, where the raw register value is used. Reported-by: Luiz Capitulino Suggested-by: Ard Biesheuvel Closes: https://lore.kernel.org/all/a3d9acbe-07c2-43b6-9ba9-a7585f770e83@redhat.com/ Fixes: 62cffa496aac ("arm64/mm: Override PARange for !LPA2 and use it consistently") Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250225114638.2038006-1-ryan.roberts@arm.com Cc: stable@vger.kernel.org Signed-off-by: Will Deacon --- arch/arm64/mm/init.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9c0b8d9558fc..ccdef53872a0 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -279,12 +279,7 @@ void __init arm64_memblock_init(void) if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { extern u16 memstart_offset_seed; - - /* - * Use the sanitised version of id_aa64mmfr0_el1 so that linear - * map randomization can be enabled by shrinking the IPA space. - */ - u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); + u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1); int parange = cpuid_feature_extract_unsigned_field( mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT); s64 range = linear_region_size - -- cgit v1.2.3 From c1fcf41cf37f7a3fd3bbf6f0c04aba3ea4258888 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 25 Feb 2025 19:37:32 +0000 Subject: x86/mm: Clear _PAGE_DIRTY for kernel mappings when we clear _PAGE_RW The bit pattern of _PAGE_DIRTY set and _PAGE_RW clear is used to mark shadow stacks. This is currently checked for in mk_pte() but not pfn_pte(). If we add the check to pfn_pte(), it catches vfree() calling set_direct_map_invalid_noflush() which calls __change_page_attr() which loads the old protection bits from the PTE, clears the specified bits and uses pfn_pte() to construct the new PTE. We should, therefore, for kernel mappings, clear the _PAGE_DIRTY bit consistently whenever we clear _PAGE_RW. I opted to do it in the callers in case we want to use __change_page_attr() to create shadow stacks inside the kernel at some point in the future. Arguably, we might also want to clear _PAGE_ACCESSED here. Note that the 3 functions involved: __set_pages_np() kernel_map_pages_in_pgd() kernel_unmap_pages_in_pgd() Only ever manipulate non-swappable kernel mappings, so maintaining the DIRTY:1|RW:0 special pattern for shadow stacks and DIRTY:0 pattern for non-shadow-stack entries can be maintained consistently and doesn't result in the unintended clearing of a live dirty bit that could corrupt (destroy) dirty bit information for user mappings. 
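Stated compactly, the pattern being protected is Write=0 with Dirty=1; a minimal illustrative helper (the function is hypothetical, not something the patch adds) makes the invariant explicit:

    /* On x86, RW=0 + DIRTY=1 is the shadow-stack encoding, so kernel PTEs that
     * drop _PAGE_RW must drop _PAGE_DIRTY as well to avoid creating it by accident.
     */
    static inline bool pte_flags_look_like_shstk(pteval_t flags)
    {
            return (flags & _PAGE_DIRTY) && !(flags & _PAGE_RW);
    }
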
Reported-by: kernel test robot Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds Link: https://lore.kernel.org/r/174051422675.10177.13226545170101706336.tip-bot2@tip-bot2 Closes: https://lore.kernel.org/oe-lkp/202502241646.719f4651-lkp@intel.com --- arch/x86/mm/pat/set_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 84d0bca3be28..d1740159029e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2628,7 +2628,7 @@ static int __set_pages_np(struct page *page, int numpages) .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* @@ -2715,7 +2715,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; @@ -2758,7 +2758,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; -- cgit v1.2.3 From 6d48ad04075729519f6baaa1dc9e5a3a39d05f53 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Wed, 26 Feb 2025 21:28:41 +0800 Subject: MIPS: Ignore relocs against __ex_table for relocatable kernel Since commit 6f2c2f93a190 ("scripts/sorttable: Remove unneeded Elf_Rel"), sorttable no longer clears relocs against __ex_table, claiming "it was never used." But in fact MIPS relocatable kernel had been implicitly depending on this behavior, so after this commit the MIPS relocatable kernel has started to spit oops like: CPU 1 Unable to handle kernel paging request at virtual address 000000fffbbdbff8, epc == ffffffff818f9a6c, ra == ffffffff813ad7d0 ... ... Call Trace: [] __raw_copy_from_user+0x48/0x2fc [] cp_statx+0x1a0/0x1e0 [] do_statx_fd+0xa8/0x118 [] sys_statx+0xd8/0xf8 [] syscall_common+0x34/0x58 So ignore those relocs on our own to fix the issue. Fixes: 6f2c2f93a190 ("scripts/sorttable: Remove unneeded Elf_Rel") Signed-off-by: Xi Ruoyao Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/tools/relocs.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/mips/boot/tools/relocs.c b/arch/mips/boot/tools/relocs.c index a88d66c46d7f..9863e1d5c62e 100644 --- a/arch/mips/boot/tools/relocs.c +++ b/arch/mips/boot/tools/relocs.c @@ -468,6 +468,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)) { int i; + struct section *extab_sec = sec_lookup("__ex_table"); + int extab_index = extab_sec ? 
extab_sec - secs : -1; /* Walk through the relocations */ for (i = 0; i < ehdr.e_shnum; i++) { @@ -480,6 +482,9 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, if (sec->shdr.sh_type != SHT_REL_TYPE) continue; + if (sec->shdr.sh_info == extab_index) + continue; + sec_symtab = sec->link; sec_applies = &secs[sec->shdr.sh_info]; if (!(sec_applies->shdr.sh_flags & SHF_ALLOC)) -- cgit v1.2.3 From 6ac43f2be982ea54b75206dccd33f4cf81bfdc39 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:05 +0100 Subject: x86/Kconfig: Add cmpxchg8b support back to Geode CPUs An older cleanup of mine inadvertently removed geode-gx1 and geode-lx from the list of CPUs that are known to support a working cmpxchg8b. Fixes: 88a2b4edda3d ("x86/Kconfig: Rework CONFIG_X86_PAE dependency") Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250226213714.4040853-2-arnd@kernel.org --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2a7279d80460..42e6a40876ea 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -368,7 +368,7 @@ config X86_HAVE_PAE config X86_CMPXCHG64 def_bool y - depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 + depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX # this should be set for all -march=.. options where the compiler # generates cmov. -- cgit v1.2.3 From 0abf508675c0dbbca6a387842f90db60756c4af5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:06 +0100 Subject: x86/smp: Drop 32-bit "bigsmp" machine support The x86-32 kernel used to support multiple platforms with more than eight logical CPUs, from the 1999-2003 timeframe: Sequent NUMA-Q, IBM Summit, Unisys ES7000 and HP F8. Support for all except the latter was dropped back in 2014, leaving only the F8 based DL740 and DL760 G2 machines in this catery, with up to eight single-core Socket-603 Xeon-MP processors with hyperthreading. Like the already removed machines, the HP F8 servers at the time cost upwards of $100k in typical configurations, but were quickly obsoleted by their 64-bit Socket-604 cousins and the AMD Opteron. Earlier servers with up to 8 Pentium Pro or Xeon processors remain fully supported as they had no hyperthreading. Similarly, the more common 4-socket Xeon-MP machines with hyperthreading using Intel or ServerWorks chipsets continue to work without this, and all the multi-core Xeon processors also run 64-bit kernels. While the "bigsmp" support can also be used to run on later 64-bit machines (including VM guests), it seems best to discourage that and get any remaining users to update their kernels to 64-bit builds on these. As a side-effect of this, there is also no more need to support NUMA configurations on 32-bit x86, as all true 32-bit NUMA platforms are already gone. 
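For context on why eight is the magic number above: in xAPIC flat logical destination mode each CPU owns one bit of an 8-bit logical ID mask, so at most eight CPUs can be addressed that way, and bigsmp existed to switch to physical destination mode beyond that. A hedged userspace sketch that reads the initial APIC ID of whichever CPU it happens to run on (CPUID leaf 1, EBX[31:24]); everything beyond that architectural field is illustrative:

  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
                  puts("CPUID leaf 1 not available");
                  return 1;
          }

          /* CPUID.1:EBX[31:24] is the initial (xAPIC) APIC ID of this CPU.
           * Flat logical mode addresses CPUs with an 8-bit one-hot mask,
           * which is where the 8-CPU limit mentioned above comes from. */
          printf("initial APIC ID of this CPU: %u\n", ebx >> 24);
          return 0;
  }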
Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-3-arnd@kernel.org --- arch/x86/Kconfig | 20 ++------ arch/x86/kernel/apic/Makefile | 3 -- arch/x86/kernel/apic/apic.c | 3 -- arch/x86/kernel/apic/bigsmp_32.c | 105 --------------------------------------- arch/x86/kernel/apic/local.h | 13 ----- arch/x86/kernel/apic/probe_32.c | 29 ----------- 6 files changed, 4 insertions(+), 169 deletions(-) delete mode 100644 arch/x86/kernel/apic/bigsmp_32.c (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d581634c6a59..887b77bdeb06 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -531,12 +531,6 @@ config X86_FRED ring transitions and exception/interrupt handling if the system supports it. -config X86_BIGSMP - bool "Support for big SMP systems with more than 8 CPUs" - depends on SMP && X86_32 - help - This option is needed for the systems that have more than 8 CPUs. - config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" default y @@ -735,8 +729,8 @@ config X86_32_NON_STANDARD depends on X86_32 && SMP depends on X86_EXTENDED_PLATFORM help - This option compiles in the bigsmp and STA2X11 default - subarchitectures. It is intended for a generic binary + This option compiles in the STA2X11 default + subarchitecture. It is intended for a generic binary kernel. If you select them all, kernel will probe it one by one and will fallback to default. @@ -1013,8 +1007,7 @@ config NR_CPUS_RANGE_BEGIN config NR_CPUS_RANGE_END int depends on X86_32 - default 64 if SMP && X86_BIGSMP - default 8 if SMP && !X86_BIGSMP + default 8 if SMP default 1 if !SMP config NR_CPUS_RANGE_END @@ -1027,7 +1020,6 @@ config NR_CPUS_RANGE_END config NR_CPUS_DEFAULT int depends on X86_32 - default 32 if X86_BIGSMP default 8 if SMP default 1 if !SMP @@ -1574,8 +1566,7 @@ config AMD_MEM_ENCRYPT config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) - default y if X86_BIGSMP + depends on X86_64 select USE_PERCPU_NUMA_NODE_ID select OF_NUMA if OF help @@ -1588,9 +1579,6 @@ config NUMA For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. - For 32-bit this is only needed if you boot a 32-bit - kernel on a 64-bit NUMA platform. - Otherwise, you should say N. 
config AMD_NUMA diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3bf0487cf3b7..52d1808ee360 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -23,8 +23,5 @@ obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o obj-y += apic_flat_64.o endif -# APIC probe will depend on the listing order here -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o - # For 32bit, probe_32 need to be listed last obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e893dc6f11c1..ddca8da6d468 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1371,8 +1371,6 @@ void __init apic_intr_mode_init(void) x86_64_probe_apic(); - x86_32_install_bigsmp(); - if (x86_platform.apic_post_init) x86_platform.apic_post_init(); @@ -1674,7 +1672,6 @@ static __init void apic_read_boot_cpu_id(bool x2apic) boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } topology_register_boot_apic(boot_cpu_physical_apicid); - x86_32_probe_bigsmp_early(); } #ifdef CONFIG_X86_X2APIC diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c deleted file mode 100644 index 9285d500d5b4..000000000000 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * APIC driver for "bigsmp" xAPIC machines with more than 8 virtual CPUs. - * - * Drives the local APIC in "clustered mode". - */ -#include -#include -#include - -#include -#include - -#include "local.h" - -static u32 bigsmp_get_apic_id(u32 x) -{ - return (x >> 24) & 0xFF; -} - -static void bigsmp_send_IPI_allbutself(int vector) -{ - default_send_IPI_mask_allbutself_phys(cpu_online_mask, vector); -} - -static void bigsmp_send_IPI_all(int vector) -{ - default_send_IPI_mask_sequence_phys(cpu_online_mask, vector); -} - -static int dmi_bigsmp; /* can be set by dmi scanners */ - -static int hp_ht_bigsmp(const struct dmi_system_id *d) -{ - printk(KERN_NOTICE "%s detected: force use of apic=bigsmp\n", d->ident); - dmi_bigsmp = 1; - - return 0; -} - - -static const struct dmi_system_id bigsmp_dmi_table[] = { - { hp_ht_bigsmp, "HP ProLiant DL760 G2", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P44-"), - } - }, - - { hp_ht_bigsmp, "HP ProLiant DL740", - { DMI_MATCH(DMI_BIOS_VENDOR, "HP"), - DMI_MATCH(DMI_BIOS_VERSION, "P47-"), - } - }, - { } /* NULL entry stops DMI scanning */ -}; - -static int probe_bigsmp(void) -{ - return dmi_check_system(bigsmp_dmi_table); -} - -static struct apic apic_bigsmp __ro_after_init = { - - .name = "bigsmp", - .probe = probe_bigsmp, - - .dest_mode_logical = false, - - .disable_esr = 1, - - .cpu_present_to_apicid = default_cpu_present_to_apicid, - - .max_apic_id = 0xFE, - .get_apic_id = bigsmp_get_apic_id, - - .calc_dest_apicid = apic_default_calc_apicid, - - .send_IPI = default_send_IPI_single_phys, - .send_IPI_mask = default_send_IPI_mask_sequence_phys, - .send_IPI_mask_allbutself = NULL, - .send_IPI_allbutself = bigsmp_send_IPI_allbutself, - .send_IPI_all = bigsmp_send_IPI_all, - .send_IPI_self = default_send_IPI_self, - - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .eoi = native_apic_mem_eoi, - .icr_read = native_apic_icr_read, - .icr_write = native_apic_icr_write, - .wait_icr_idle = apic_mem_wait_icr_idle, - .safe_wait_icr_idle = apic_mem_wait_icr_idle_timeout, -}; - -bool __init apic_bigsmp_possible(bool cmdline_override) -{ - return apic == &apic_bigsmp || !cmdline_override; -} - -void __init 
apic_bigsmp_force(void) -{ - if (apic != &apic_bigsmp) - apic_install_driver(&apic_bigsmp); -} - -apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/local.h b/arch/x86/kernel/apic/local.h index 842fe28496be..bdcf609eb283 100644 --- a/arch/x86/kernel/apic/local.h +++ b/arch/x86/kernel/apic/local.h @@ -65,17 +65,4 @@ void default_send_IPI_self(int vector); void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask, int vector); void default_send_IPI_mask_logical(const struct cpumask *mask, int vector); -void x86_32_probe_bigsmp_early(void); -void x86_32_install_bigsmp(void); -#else -static inline void x86_32_probe_bigsmp_early(void) { } -static inline void x86_32_install_bigsmp(void) { } -#endif - -#ifdef CONFIG_X86_BIGSMP -bool apic_bigsmp_possible(bool cmdline_selected); -void apic_bigsmp_force(void); -#else -static inline bool apic_bigsmp_possible(bool cmdline_selected) { return false; }; -static inline void apic_bigsmp_force(void) { } #endif diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f75ee345c02d..87bc9e7ca5d6 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -93,35 +93,6 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init x86_32_probe_bigsmp_early(void) -{ - if (nr_cpu_ids <= 8 || xen_pv_domain()) - return; - - if (IS_ENABLED(CONFIG_X86_BIGSMP)) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(boot_cpu_apic_version)) - break; - /* P4 and above */ - fallthrough; - case X86_VENDOR_HYGON: - case X86_VENDOR_AMD: - if (apic_bigsmp_possible(cmdline_apic)) - return; - break; - } - } - pr_info("Limiting to 8 possible CPUs\n"); - set_nr_cpu_ids(8); -} - -void __init x86_32_install_bigsmp(void) -{ - if (nr_cpu_ids > 8 && !xen_pv_domain()) - apic_bigsmp_force(); -} - void __init x86_32_probe_apic(void) { if (!cmdline_apic) { -- cgit v1.2.3 From fc2d5cbe541032e74a66599ba843803cebbfed0e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:07 +0100 Subject: x86/build: Rework CONFIG_GENERIC_CPU compiler flags Building an x86-64 kernel with CONFIG_GENERIC_CPU is documented to run on all CPUs, but the Makefile does not actually pass an -march= argument, instead relying on the default that was used to configure the toolchain. In many cases, gcc will be configured to -march=x86-64 or -march=k8 for maximum compatibility, but in other cases a distribution default may be either raised to a more recent ISA, or set to -march=native to build for the CPU used for compilation. This still works in the case of building a custom kernel for the local machine. The point where it breaks down is building a kernel for another machine that is older the the default target. Changing the default to -march=x86-64 would make it work reliable, but possibly produce worse code on distros that intentionally default to a newer ISA. To allow reliably building a kernel for either the oldest x86-64 CPUs, pass the -march=x86-64 flag to the compiler. This was not possible in early versions of x86-64 gcc, but works on all currently supported versions down to at least gcc-5. 
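A simple way to see what pinning the baseline to -march=x86-64 changes, relative to a distro default or -march=native, is to look at the ISA macros the compiler predefines. A hedged sketch, assuming GCC or Clang on x86-64; compile it twice with different -march= values and compare the output:

  #include <stdio.h>

  /* Build e.g. with "cc -march=x86-64 ..." and again with "cc -march=native ..."
   * to see which baseline features the compiler assumes it may emit freely. */
  int main(void)
  {
  #ifdef __SSE2__
          puts("SSE2:   assumed (part of the x86-64 baseline)");
  #endif
  #ifdef __SSE4_2__
          puts("SSE4.2: assumed");
  #else
          puts("SSE4.2: not assumed");
  #endif
  #ifdef __AVX2__
          puts("AVX2:   assumed");
  #else
          puts("AVX2:   not assumed");
  #endif
          return 0;
  }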
Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-4-arnd@kernel.org --- arch/x86/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5b773b34768d..5af3172fd51c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -183,14 +183,14 @@ else cflags-$(CONFIG_MPSC) += -march=nocona cflags-$(CONFIG_MCORE2) += -march=core2 cflags-$(CONFIG_MATOM) += -march=atom - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64 -mtune=generic KBUILD_CFLAGS += $(cflags-y) rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom - rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic + rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64 -Ztune-cpu=generic KBUILD_RUSTFLAGS += $(rustflags-y) KBUILD_CFLAGS += -mno-red-zone -- cgit v1.2.3 From f388f60ca9041a95c9b3f157d316ed7c8f297e44 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:08 +0100 Subject: x86/cpu: Drop configuration options for early 64-bit CPUs The x86 CPU selection menu is confusing for a number of reasons: When configuring 32-bit kernels, it shows a small number of early 64-bit microarchitectures (K8, Core 2) but not the regular generic 64-bit target that is the normal default. There is no longer a reason to run 32-bit kernels on production 64-bit systems, so only actual 32-bit CPUs need to be shown here. When configuring 64-bit kernels, the options also pointless as there is no way to pick any CPU from the past 15 years, leaving GENERIC_CPU as the only sensible choice. Address both of the above by removing the obsolete options and making all 64-bit kernels run on both Intel and AMD CPUs from any generation. Testing generic 32-bit kernels on 64-bit hardware remains possible, just not building a 32-bit kernel that requires a 64-bit CPU. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-5-arnd@kernel.org --- arch/x86/Kconfig.cpu | 95 ++++++----------------------------------- arch/x86/Makefile | 16 +------ arch/x86/Makefile_32.cpu | 5 +-- arch/x86/include/asm/vermagic.h | 4 -- 4 files changed, 17 insertions(+), 103 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 42e6a40876ea..8fcb8ccee44b 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,9 +1,9 @@ # SPDX-License-Identifier: GPL-2.0 # Put here option for CPU selection and depending optimization choice - prompt "Processor family" - default M686 if X86_32 - default GENERIC_CPU if X86_64 + prompt "x86-32 Processor family" + depends on X86_32 + default M686 help This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -31,7 +31,6 @@ choice - "Pentium-4" for the Intel Pentium 4 or P4-based Celeron. - "K6" for the AMD K6, K6-II and K6-III (aka K6-3D). - "Athlon" for the AMD K7 family (Athlon/Duron/Thunderbird). - - "Opteron/Athlon64/Hammer/K8" for all K8 and newer AMD CPUs. - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. @@ -42,13 +41,10 @@ choice - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. - "VIA C3-2" for VIA C3-2 "Nehemiah" (model 9 and above). 
- "VIA C7" for VIA C7. - - "Intel P4" for the Pentium 4/Netburst microarchitecture. - - "Core 2/newer Xeon" for all core2 and newer Intel CPUs. - "Intel Atom" for the Atom-microarchitecture CPUs. - - "Generic-x86-64" for a kernel which runs on any x86-64 CPU. See each option's help text for additional details. If you don't know - what to do, choose "486". + what to do, choose "Pentium-Pro". config M486SX bool "486SX" @@ -114,11 +110,11 @@ config MPENTIUMIII extensions. config MPENTIUMM - bool "Pentium M" + bool "Pentium M/Pentium Dual Core/Core Solo/Core Duo" depends on X86_32 help Select this for Intel Pentium M (not Pentium-4 M) - notebook chips. + "Merom" Core Solo/Duo notebook chips config MPENTIUM4 bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" @@ -139,22 +135,10 @@ config MPENTIUM4 -Mobile Pentium 4 -Mobile Pentium 4 M -Extreme Edition (Gallatin) - -Prescott - -Prescott 2M - -Cedar Mill - -Presler - -Smithfiled Xeons (Intel Xeon, Xeon MP, Xeon LV, Xeon MV) corename: -Foster -Prestonia -Gallatin - -Nocona - -Irwindale - -Cranford - -Potomac - -Paxville - -Dempsey - config MK6 bool "K6/K6-II/K6-III" @@ -172,13 +156,6 @@ config MK7 some extended instructions, and passes appropriate optimization flags to GCC. -config MK8 - bool "Opteron/Athlon64/Hammer/K8" - help - Select this for an AMD Opteron or Athlon64 Hammer-family processor. - Enables use of some extended instructions, and passes appropriate - optimization flags to GCC. - config MCRUSOE bool "Crusoe" depends on X86_32 @@ -258,42 +235,14 @@ config MVIAC7 Select this for a VIA C7. Selecting this uses the correct cache shift and tells gcc to treat the CPU as a 686. -config MPSC - bool "Intel P4 / older Netburst based Xeon" - depends on X86_64 - help - Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey - Xeon CPUs with Intel 64bit which is compatible with x86-64. - Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distinguish them - using the cpu family field - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. - -config MCORE2 - bool "Core 2/newer Xeon" - help - - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and - 53xx) CPUs. You can distinguish newer from older Xeons by the CPU - family in /proc/cpuinfo. Newer ones have 6 and older ones 15 - (not a typo) - config MATOM bool "Intel Atom" help - Select this for the Intel Atom platform. Intel Atom CPUs have an in-order pipelining architecture and thus can benefit from accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. -config GENERIC_CPU - bool "Generic-x86-64" - depends on X86_64 - help - Generic x86-64 CPU. - Run equally well on all x86-64 CPUs. 
- endchoice config X86_GENERIC @@ -317,8 +266,8 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int - default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "7" if MPENTIUM4 + default "6" if MK7 || MPENTIUMM || MATOM || MVIAC7 || X86_GENERIC || X86_64 default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX @@ -336,35 +285,19 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 + depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK7 || MEFFICEON config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM - -# -# P6_NOPs are a relatively minor optimization that require a family >= -# 6 processor, except that it is broken on certain VIA chips. -# Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). In addition, it looks like Virtual PC -# does not understand them. -# -# As a result, disallow these if we're not compiling for X86_64 (these -# NOPs do work on all x86-64 capable chips); the list of processors in -# the right-hand clause are the cores that benefit from this optimization. -# -config X86_P6_NOP - def_bool y - depends on X86_64 - depends on (MCORE2 || MPENTIUM4 || MPSC) + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MATOM config X86_TSC def_bool y - depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MATOM) || X86_64 config X86_HAVE_PAE def_bool y - depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC7 || MCORE2 || MATOM || X86_64 + depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64 config X86_CMPXCHG64 def_bool y @@ -374,12 +307,12 @@ config X86_CMPXCHG64 # generates cmov. 
config X86_CMOV def_bool y - depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || MATOM || MGEODE_LX || X86_64) config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7) default "5" if X86_32 && X86_CMPXCHG64 default "4" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5af3172fd51c..8120085b00a4 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -178,20 +178,8 @@ else # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) - cflags-$(CONFIG_MK8) += -march=k8 - cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom - cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64 -mtune=generic - KBUILD_CFLAGS += $(cflags-y) - - rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 - rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona - rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 - rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom - rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64 -Ztune-cpu=generic - KBUILD_RUSTFLAGS += $(rustflags-y) + KBUILD_CFLAGS += -march=x86-64 -mtune=generic + KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 94834c4b5e5e..af7de9a42752 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -24,7 +24,6 @@ cflags-$(CONFIG_MK6) += -march=k6 # Please note, that patches that add -march=athlon-xp and friends are pointless. # They make zero difference whatsosever to performance at this time. 
cflags-$(CONFIG_MK7) += -march=athlon -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) @@ -32,9 +31,7 @@ cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) cflags-$(CONFIG_MVIAC7) += -march=i686 -cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ - $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) +cflags-$(CONFIG_MATOM) += -march=atom # AMD Elan support cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h index 75884d2cdec3..5d471253c755 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -15,8 +15,6 @@ #define MODULE_PROC_FAMILY "586TSC " #elif defined CONFIG_M586MMX #define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 @@ -33,8 +31,6 @@ #define MODULE_PROC_FAMILY "K6 " #elif defined CONFIG_MK7 #define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE -- cgit v1.2.3 From bbeb69ce301323e84f1677484eb8e4cd8fb1f9f8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:09 +0100 Subject: x86/mm: Remove CONFIG_HIGHMEM64G support HIGHMEM64G support was added in linux-2.3.25 to support (then) high-end Pentium Pro and Pentium III Xeon servers with more than 4GB of addressing, NUMA and PCI-X slots started appearing. I have found no evidence of this ever being used in regular dual-socket servers or consumer devices, all the users seem obsolete these days, even by i386 standards: - Support for NUMA servers (NUMA-Q, IBM x440, unisys) was already removed ten years ago. - 4+ socket non-NUMA servers based on Intel 450GX/450NX, HP F8 and ServerWorks ServerSet/GrandChampion could theoretically still work with 8GB, but these were exceptionally rare even 20 years ago and would have usually been equipped with than the maximum amount of RAM. - Some SKUs of the Celeron D from 2004 had 64-bit mode fused off but could still work in a Socket 775 mainboard designed for the later Core 2 Duo and 8GB. Apparently most BIOSes at the time only allowed 64-bit CPUs. - The rare Xeon LV "Sossaman" came on a few motherboards with registered DDR2 memory support up to 16GB. - In the early days of x86-64 hardware, there was sometimes the need to run a 32-bit kernel to work around bugs in the hardware drivers, or in the syscall emulation for 32-bit userspace. This likely still works but there should never be a need for this any more. PAE mode is still required to get access to the 'NX' bit on Atom 'Pentium M' and 'Core Duo' CPUs. 
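As a quick sanity check on the numbers above, the practical difference between legacy 32-bit paging and PAE on these parts is the physical address width a page-table entry can carry, while the virtual address space stays 32 bits either way. A small, hedged sketch of the arithmetic (the widths are architectural, the program itself is only illustrative):

  #include <stdio.h>

  int main(void)
  {
          /* Legacy 2-level 32-bit paging: physical addresses limited to 32 bits. */
          unsigned long long legacy_limit = 1ULL << 32;
          /* Classic PAE on these CPUs: 36 bits of physical address (the old HIGHMEM64G case). */
          unsigned long long pae_limit = 1ULL << 36;
          /* The virtual address space is 32 bits regardless of paging mode. */
          unsigned long long va_space = 1ULL << 32;

          printf("legacy paging physical limit: %llu GiB\n", legacy_limit >> 30);
          printf("PAE physical limit:           %llu GiB\n", pae_limit >> 30);
          printf("32-bit virtual address space: %llu GiB\n", va_space >> 30);
          return 0;
  }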
Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-6-arnd@kernel.org --- arch/x86/Kconfig | 46 +++++------------------------------- arch/x86/configs/xen.config | 2 -- arch/x86/include/asm/page_32_types.h | 4 ++-- arch/x86/mm/init_32.c | 9 ++----- 4 files changed, 10 insertions(+), 51 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 887b77bdeb06..737a0c630527 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1388,15 +1388,11 @@ config X86_CPUID with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to /dev/cpu/31/cpuid. -choice - prompt "High Memory Support" - default HIGHMEM4G +config HIGHMEM4G + bool "High Memory Support" depends on X86_32 - -config NOHIGHMEM - bool "off" help - Linux can use up to 64 Gigabytes of physical memory on x86 systems. + Linux can use up to 4 Gigabytes of physical memory on x86 systems. However, the address space of 32-bit x86 processors is only 4 Gigabytes large. That means that, if you have a large amount of physical memory, not all of it can be "permanently mapped" by the @@ -1412,38 +1408,9 @@ config NOHIGHMEM possible. If the machine has between 1 and 4 Gigabytes physical RAM, then - answer "4GB" here. + answer "Y" here. - If more than 4 Gigabytes is used then answer "64GB" here. This - selection turns Intel PAE (Physical Address Extension) mode on. - PAE implements 3-level paging on IA32 processors. PAE is fully - supported by Linux, PAE mode is implemented on all recent Intel - processors (Pentium Pro and better). NOTE: If you say "64GB" here, - then the kernel will not boot on CPUs that don't support PAE! - - The actual amount of total physical memory will either be - auto detected or can be forced by using a kernel command line option - such as "mem=256M". (Try "man bootparam" or see the documentation of - your boot loader (lilo or loadlin) about how to pass options to the - kernel at boot time.) - - If unsure, say "off". - -config HIGHMEM4G - bool "4GB" - help - Select this if you have a 32-bit processor and between 1 and 4 - gigabytes of physical RAM. - -config HIGHMEM64G - bool "64GB" - depends on X86_HAVE_PAE - select X86_PAE - help - Select this if you have a 32-bit processor and more than 4 - gigabytes of physical RAM. - -endchoice + If unsure, say N. choice prompt "Memory split" if EXPERT @@ -1489,8 +1456,7 @@ config PAGE_OFFSET depends on X86_32 config HIGHMEM - def_bool y - depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) + def_bool HIGHMEM4G config X86_PAE bool "PAE (Physical Address Extension) Support" diff --git a/arch/x86/configs/xen.config b/arch/x86/configs/xen.config index 581296255b39..d5d091e03bd3 100644 --- a/arch/x86/configs/xen.config +++ b/arch/x86/configs/xen.config @@ -1,6 +1,4 @@ # global x86 required specific stuff -# On 32-bit HIGHMEM4G is not allowed -CONFIG_HIGHMEM64G=y CONFIG_64BIT=y # These enable us to allow some of the diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index faf9cc1c14bb..25c32652f404 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -11,8 +11,8 @@ * a virtual address space of one gigabyte, which limits the * amount of physical memory you can use to about 950MB. * - * If you want more physical memory than this then see the CONFIG_HIGHMEM4G - * and CONFIG_HIGHMEM64G options in the kernel configuration. 
+ * If you want more physical memory than this then see the CONFIG_VMSPLIT_2G + * and CONFIG_HIGHMEM4G options in the kernel configuration. */ #define __PAGE_OFFSET_BASE _AC(CONFIG_PAGE_OFFSET, UL) #define __PAGE_OFFSET __PAGE_OFFSET_BASE diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ac41b1e0940d..f288aad8dc74 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -582,7 +582,7 @@ static void __init lowmem_pfn_init(void) "only %luMB highmem pages available, ignoring highmem size of %luMB!\n" #define MSG_HIGHMEM_TRIMMED \ - "Warning: only 4GB will be used. Use a HIGHMEM64G enabled kernel!\n" + "Warning: only 4GB will be used. Support for for CONFIG_HIGHMEM64G was removed!\n" /* * We have more RAM than fits into lowmem - we try to put it into * highmem, also taking the highmem=x boot parameter into account: @@ -606,18 +606,13 @@ static void __init highmem_pfn_init(void) #ifndef CONFIG_HIGHMEM /* Maximum memory usable is what is directly addressable */ printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20); - if (max_pfn > MAX_NONPAE_PFN) - printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n"); - else - printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); + printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); max_pfn = MAXMEM_PFN; #else /* !CONFIG_HIGHMEM */ -#ifndef CONFIG_HIGHMEM64G if (max_pfn > MAX_NONPAE_PFN) { max_pfn = MAX_NONPAE_PFN; printk(KERN_WARNING MSG_HIGHMEM_TRIMMED); } -#endif /* !CONFIG_HIGHMEM64G */ #endif /* !CONFIG_HIGHMEM */ } -- cgit v1.2.3 From a8331594036f22dcf037f1a75358bd0985c84cd9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:10 +0100 Subject: x86/mm: Drop CONFIG_SWIOTLB for PAE Since kernels with and without CONFIG_X86_PAE are now limited to the low 4GB of physical address space, there is no need to use swiotlb any more, so stop selecting this. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-7-arnd@kernel.org --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 737a0c630527..0e0ec2c8ef75 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1462,7 +1462,6 @@ config X86_PAE bool "PAE (Physical Address Extension) Support" depends on X86_32 && X86_HAVE_PAE select PHYS_ADDR_T_64BIT - select SWIOTLB help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It -- cgit v1.2.3 From 0081fdeccbf610499b79784998b1fd36783209dd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:11 +0100 Subject: x86/mm: Drop support for CONFIG_HIGHPTE With the maximum amount of RAM now 4GB, there is very little point to still have PTE pages in highmem. Drop this for simplification. The only other architecture supporting HIGHPTE is 32-bit arm, and once that feature is removed as well, the highpte logic can be dropped from common code as well. 
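The mechanism being removed boils down to one allocation-flag decision: user PTE pages were allocated with __GFP_HIGHMEM unless the "userpte=nohigh" parameter masked it off. A minimal, hedged userspace sketch of that flag-masking pattern, using made-up stand-in flag values rather than the kernel's GFP encoding:

  #include <stdio.h>
  #include <string.h>

  /* Made-up stand-ins for the relevant GFP bits. */
  #define X_GFP_PGTABLE_USER      0x01u
  #define X_GFP_HIGHMEM           0x02u

  int main(int argc, char **argv)
  {
          unsigned int userpte_gfp = X_GFP_PGTABLE_USER | X_GFP_HIGHMEM;

          /* The removed "userpte=nohigh" boot parameter simply cleared the highmem bit. */
          if (argc > 1 && strcmp(argv[1], "nohigh") == 0)
                  userpte_gfp &= ~X_GFP_HIGHMEM;

          printf("user PTE allocation flags: %#x (%s allocate from highmem)\n",
                 userpte_gfp, (userpte_gfp & X_GFP_HIGHMEM) ? "may" : "will not");
          return 0;
  }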
Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-8-arnd@kernel.org --- arch/x86/Kconfig | 9 --------- arch/x86/include/asm/pgalloc.h | 5 ----- arch/x86/mm/pgtable.c | 27 +-------------------------- 3 files changed, 1 insertion(+), 40 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0e0ec2c8ef75..73eeaf295b74 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1628,15 +1628,6 @@ config X86_PMEM_LEGACY Say Y if unsure. -config HIGHPTE - bool "Allocate 3rd-level pagetables from highmem" - depends on HIGHMEM - help - The VM uses one page table entry for each page of physical memory. - For systems with a lot of RAM, this can be wasteful of precious - low memory. Setting this option will put user-space page table - entries in high memory. - config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" help diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd4841231bb9..a33147520044 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -29,11 +29,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -/* - * Flags to use when allocating a user page table page. - */ -extern gfp_t __userpte_alloc_gfp; - #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b1c1f72c1fd1..cec321fb74f2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -12,35 +12,10 @@ phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); #endif -#ifdef CONFIG_HIGHPTE -#define PGTABLE_HIGHMEM __GFP_HIGHMEM -#else -#define PGTABLE_HIGHMEM 0 -#endif - -gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; - pgtable_t pte_alloc_one(struct mm_struct *mm) { - return __pte_alloc_one(mm, __userpte_alloc_gfp); -} - -static int __init setup_userpte(char *arg) -{ - if (!arg) - return -EINVAL; - - /* - * "userpte=nohigh" disables allocation of user pagetables in - * high memory. - */ - if (strcmp(arg, "nohigh") == 0) - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; - else - return -EINVAL; - return 0; + return __pte_alloc_one(mm, GFP_PGTABLE_USER); } -early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { -- cgit v1.2.3 From ca5955dd5f08727605723b60767fbf2cc3d54046 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:12 +0100 Subject: x86/cpu: Document CONFIG_X86_INTEL_MID as 64-bit-only The X86_INTEL_MID code was originally introduced for the 32-bit Moorestown/Medfield/Clovertrail platform, later the 64-bit Merrifield/Moorefield variants were added, but the final Morganfield 14nm platform was canceled before it hit the market. To help users understand what the option actually refers to, update the help text, and add a dependency on 64-bit kernels. Ferry confirmed that all the hardware can run 64-bit kernels these days, but is still testing 32-bit kernels on the Intel Edison board, so this remains possible, but is guarded by a CONFIG_EXPERT dependency now, to gently push remaining users towards using CONFIG_64BIT. 
Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Acked-by: Andy Shevchenko Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-9-arnd@kernel.org --- arch/x86/Kconfig | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 73eeaf295b74..acd4d73502d6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -549,12 +549,12 @@ config X86_EXTENDED_PLATFORM RDC R-321x SoC SGI 320/540 (Visual Workstation) STA2X11-based (e.g. Northville) - Moorestown MID devices 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip ScaleMP vSMP SGI Ultraviolet + Merrifield/Moorefield MID devices If you have one of these systems, or if you want to build a generic distribution kernel, say Y here - otherwise say N. @@ -599,8 +599,31 @@ config X86_UV This option is needed in order to support SGI Ultraviolet systems. If you don't have one of these, you should say N here. -# Following is an alphabetically sorted list of 32 bit extended platforms -# Please maintain the alphabetic order if and when there are additions +config X86_INTEL_MID + bool "Intel Z34xx/Z35xx MID platform support" + depends on X86_EXTENDED_PLATFORM + depends on X86_PLATFORM_DEVICES + depends on PCI + depends on X86_64 || (EXPERT && PCI_GOANY) + depends on X86_IO_APIC + select I2C + select DW_APB_TIMER + select INTEL_SCU_PCI + help + Select to build a kernel capable of supporting 64-bit Intel MID + (Mobile Internet Device) platform systems which do not have + the PCI legacy interfaces. + + The only supported devices are the 22nm Merrified (Z34xx) + and Moorefield (Z35xx) SoC used in the Intel Edison board and + a small number of Android devices such as the Asus Zenfone 2, + Asus FonePad 8 and Dell Venue 7. + + If you are building for a PC class system or non-MID tablet + SoCs like Bay Trail (Z36xx/Z37xx), say N here. + + Intel MID platforms are based on an Intel processor and chipset which + consume less power than most of the x86 derivatives. config X86_GOLDFISH bool "Goldfish (Virtual Platform)" @@ -610,6 +633,9 @@ config X86_GOLDFISH for Android development. Unless you are building for the Android Goldfish emulator say N here. +# Following is an alphabetically sorted list of 32 bit extended platforms +# Please maintain the alphabetic order if and when there are additions + config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI @@ -625,24 +651,6 @@ config X86_INTEL_CE This option compiles in support for the CE4100 SOC for settop boxes and media devices. -config X86_INTEL_MID - bool "Intel MID platform support" - depends on X86_EXTENDED_PLATFORM - depends on X86_PLATFORM_DEVICES - depends on PCI - depends on X86_64 || (PCI_GOANY && X86_32) - depends on X86_IO_APIC - select I2C - select DW_APB_TIMER - select INTEL_SCU_PCI - help - Select to build a kernel capable of supporting Intel MID (Mobile - Internet Device) platform systems which do not have the PCI legacy - interfaces. If you are building for a PC class system say N here. - - Intel MID platforms are based on an Intel processor and chipset which - consume less power than most of the x86 derivatives. 
- config X86_INTEL_QUARK bool "Intel Quark platform support" depends on X86_32 -- cgit v1.2.3 From dcbb01fbb7aeed0fae4dc1389a36842c77f4f381 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:13 +0100 Subject: x86/pci: Remove old STA2x11 support ST ConneXt STA2x11 was an interface chip for Atom E6xx processors, using a number of components usually found on Arm SoCs. Most of this was merged upstream, but it was never complete enough to actually work and has been abandoned for many years. We already had an agreement on removing it in 2022, but nobody ever submitted the patch to do it. Without STA2x11, CONFIG_X86_32_NON_STANDARD no longer has any use - remove it. Suggested-by: Davide Ciminaghi Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Bjorn Helgaas Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-10-arnd@kernel.org --- arch/x86/Kconfig | 32 +----- arch/x86/include/asm/sta2x11.h | 13 --- arch/x86/pci/Makefile | 2 - arch/x86/pci/sta2x11-fixup.c | 233 ----------------------------------------- 4 files changed, 3 insertions(+), 277 deletions(-) delete mode 100644 arch/x86/include/asm/sta2x11.h delete mode 100644 arch/x86/pci/sta2x11-fixup.c (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index acd4d73502d6..383b145ffe9f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -548,7 +548,6 @@ config X86_EXTENDED_PLATFORM AMD Elan RDC R-321x SoC SGI 320/540 (Visual Workstation) - STA2X11-based (e.g. Northville) 64-bit platforms (CONFIG_64BIT=y): Numascale NumaChip @@ -732,18 +731,6 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. -config X86_32_NON_STANDARD - bool "Support non-standard 32-bit SMP architectures" - depends on X86_32 && SMP - depends on X86_EXTENDED_PLATFORM - help - This option compiles in the STA2X11 default - subarchitecture. It is intended for a generic binary - kernel. If you select them all, kernel will probe it one by - one and will fallback to default. - -# Alphabetically sorted list of Non standard 32 bit platforms - config X86_SUPPORTS_MEMORY_FAILURE def_bool y # MCE code calls memory_failure(): @@ -753,19 +740,6 @@ config X86_SUPPORTS_MEMORY_FAILURE depends on X86_64 || !SPARSEMEM select ARCH_SUPPORTS_MEMORY_FAILURE -config STA2X11 - bool "STA2X11 Companion Chip Support" - depends on X86_32_NON_STANDARD && PCI - select SWIOTLB - select MFD_STA2X11 - select GPIOLIB - help - This adds support for boards based on the STA2X11 IO-Hub, - a.k.a. "ConneXt". The chip is used in place of the standard - PC chipset, so all "standard" peripherals are missing. If this - option is selected the kernel will still be able to boot on - standard PC machines. - config X86_32_IRIS tristate "Eurobraille/Iris poweroff module" depends on X86_32 @@ -1103,7 +1077,7 @@ config UP_LATE_INIT config X86_UP_APIC bool "Local APIC support on uniprocessors" if !PCI_MSI default PCI_MSI - depends on X86_32 && !SMP && !X86_32_NON_STANDARD + depends on X86_32 && !SMP help A local APIC (Advanced Programmable Interrupt Controller) is an integrated interrupt controller in the CPU. 
If you have a single-CPU @@ -1128,7 +1102,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y - depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI + depends on X86_64 || SMP || X86_UP_APIC || PCI_MSI select IRQ_DOMAIN_HIERARCHY config ACPI_MADT_WAKEUP @@ -1590,7 +1564,7 @@ config ARCH_FLATMEM_ENABLE config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD + depends on X86_64 || NUMA || X86_32 select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 diff --git a/arch/x86/include/asm/sta2x11.h b/arch/x86/include/asm/sta2x11.h deleted file mode 100644 index e0975e9c4f47..000000000000 --- a/arch/x86/include/asm/sta2x11.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Header file for STMicroelectronics ConneXt (STA2X11) IOHub - */ -#ifndef __ASM_STA2X11_H -#define __ASM_STA2X11_H - -#include - -/* This needs to be called from the MFD to configure its sub-devices */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev); - -#endif /* __ASM_STA2X11_H */ diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 48bcada5cabe..4933fb337983 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -12,8 +12,6 @@ obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o -obj-$(CONFIG_STA2X11) += sta2x11-fixup.o - obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c deleted file mode 100644 index 8c8ddc4dcc08..000000000000 --- a/arch/x86/pci/sta2x11-fixup.c +++ /dev/null @@ -1,233 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DMA translation between STA2x11 AMBA memory mapping and the x86 memory mapping - * - * ST Microelectronics ConneXt (STA2X11/STA2X10) - * - * Copyright (c) 2010-2011 Wind River Systems, Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define STA2X11_SWIOTLB_SIZE (4*1024*1024) - -/* - * We build a list of bus numbers that are under the ConneXt. The - * main bridge hosts 4 busses, which are the 4 endpoints, in order. 
- */ -#define STA2X11_NR_EP 4 /* 0..3 included */ -#define STA2X11_NR_FUNCS 8 /* 0..7 included */ -#define STA2X11_AMBA_SIZE (512 << 20) - -struct sta2x11_ahb_regs { /* saved during suspend */ - u32 base, pexlbase, pexhbase, crw; -}; - -struct sta2x11_mapping { - int is_suspended; - struct sta2x11_ahb_regs regs[STA2X11_NR_FUNCS]; -}; - -struct sta2x11_instance { - struct list_head list; - int bus0; - struct sta2x11_mapping map[STA2X11_NR_EP]; -}; - -static LIST_HEAD(sta2x11_instance_list); - -/* At probe time, record new instances of this bridge (likely one only) */ -static void sta2x11_new_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = kzalloc(sizeof(*instance), GFP_ATOMIC); - if (!instance) - return; - /* This has a subordinate bridge, with 4 more-subordinate ones */ - instance->bus0 = pdev->subordinate->number + 1; - - if (list_empty(&sta2x11_instance_list)) { - int size = STA2X11_SWIOTLB_SIZE; - /* First instance: register your own swiotlb area */ - dev_info(&pdev->dev, "Using SWIOTLB (size %i)\n", size); - if (swiotlb_init_late(size, GFP_DMA, NULL)) - dev_emerg(&pdev->dev, "init swiotlb failed\n"); - } - list_add(&instance->list, &sta2x11_instance_list); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, 0xcc17, sta2x11_new_instance); - -/* - * Utility functions used in this file from below - */ -static struct sta2x11_instance *sta2x11_pdev_to_instance(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - list_for_each_entry(instance, &sta2x11_instance_list, list) { - ep = pdev->bus->number - instance->bus0; - if (ep >= 0 && ep < STA2X11_NR_EP) - return instance; - } - return NULL; -} - -static int sta2x11_pdev_to_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return -1; - - return pdev->bus->number - instance->bus0; -} - -/* This is exported, as some devices need to access the MFD registers */ -struct sta2x11_instance *sta2x11_get_instance(struct pci_dev *pdev) -{ - return sta2x11_pdev_to_instance(pdev); -} -EXPORT_SYMBOL(sta2x11_get_instance); - -/* At setup time, we use our own ops if the device is a ConneXt one */ -static void sta2x11_setup_pdev(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - - if (!instance) /* either a sta2x11 bridge or another ST device */ - return; - - /* We must enable all devices as master, for audio DMA to work */ - pci_set_master(pdev); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_setup_pdev); - -/* - * At boot we must set up the mappings for the pcie-to-amba bridge. 
- * It involves device access, and the same happens at suspend/resume time - */ - -#define AHB_MAPB 0xCA4 -#define AHB_CRW(i) (AHB_MAPB + 0 + (i) * 0x10) -#define AHB_CRW_SZMASK 0xfffffc00UL -#define AHB_CRW_ENABLE (1 << 0) -#define AHB_CRW_WTYPE_MEM (2 << 1) -#define AHB_CRW_ROE (1UL << 3) /* Relax Order Ena */ -#define AHB_CRW_NSE (1UL << 4) /* No Snoop Enable */ -#define AHB_BASE(i) (AHB_MAPB + 4 + (i) * 0x10) -#define AHB_PEXLBASE(i) (AHB_MAPB + 8 + (i) * 0x10) -#define AHB_PEXHBASE(i) (AHB_MAPB + 12 + (i) * 0x10) - -/* At probe time, enable mapping for each endpoint, using the pdev */ -static void sta2x11_map_ep(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance = sta2x11_pdev_to_instance(pdev); - struct device *dev = &pdev->dev; - u32 amba_base, max_amba_addr; - int i, ret; - - if (!instance) - return; - - pci_read_config_dword(pdev, AHB_BASE(0), &amba_base); - max_amba_addr = amba_base + STA2X11_AMBA_SIZE - 1; - - ret = dma_direct_set_offset(dev, 0, amba_base, STA2X11_AMBA_SIZE); - if (ret) - dev_err(dev, "sta2x11: could not set DMA offset\n"); - - dev->bus_dma_limit = max_amba_addr; - dma_set_mask_and_coherent(&pdev->dev, max_amba_addr); - - /* Configure AHB mapping */ - pci_write_config_dword(pdev, AHB_PEXLBASE(0), 0); - pci_write_config_dword(pdev, AHB_PEXHBASE(0), 0); - pci_write_config_dword(pdev, AHB_CRW(0), STA2X11_AMBA_SIZE | - AHB_CRW_WTYPE_MEM | AHB_CRW_ENABLE); - - /* Disable all the other windows */ - for (i = 1; i < STA2X11_NR_FUNCS; i++) - pci_write_config_dword(pdev, AHB_CRW(i), 0); - - dev_info(&pdev->dev, - "sta2x11: Map EP %i: AMBA address %#8x-%#8x\n", - sta2x11_pdev_to_ep(pdev), amba_base, max_amba_addr); -} -DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, sta2x11_map_ep); - -#ifdef CONFIG_PM /* Some register values must be saved and restored */ - -static struct sta2x11_mapping *sta2x11_pdev_to_mapping(struct pci_dev *pdev) -{ - struct sta2x11_instance *instance; - int ep; - - instance = sta2x11_pdev_to_instance(pdev); - if (!instance) - return NULL; - ep = sta2x11_pdev_to_ep(pdev); - return instance->map + ep; -} - -static void suspend_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - if (map->is_suspended) - return; - map->is_suspended = 1; - - /* Save all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_read_config_dword(pdev, AHB_BASE(i), ®s->base); - pci_read_config_dword(pdev, AHB_PEXLBASE(i), ®s->pexlbase); - pci_read_config_dword(pdev, AHB_PEXHBASE(i), ®s->pexhbase); - pci_read_config_dword(pdev, AHB_CRW(i), ®s->crw); - } -} -DECLARE_PCI_FIXUP_SUSPEND(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, suspend_mapping); - -static void resume_mapping(struct pci_dev *pdev) -{ - struct sta2x11_mapping *map = sta2x11_pdev_to_mapping(pdev); - int i; - - if (!map) - return; - - - if (!map->is_suspended) - goto out; - map->is_suspended = 0; - - /* Restore all window configs */ - for (i = 0; i < STA2X11_NR_FUNCS; i++) { - struct sta2x11_ahb_regs *regs = map->regs + i; - - pci_write_config_dword(pdev, AHB_BASE(i), regs->base); - pci_write_config_dword(pdev, AHB_PEXLBASE(i), regs->pexlbase); - pci_write_config_dword(pdev, AHB_PEXHBASE(i), regs->pexhbase); - pci_write_config_dword(pdev, AHB_CRW(i), regs->crw); - } -out: - pci_set_master(pdev); /* Like at boot, enable master on all devices */ -} -DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_STMICRO, PCI_ANY_ID, resume_mapping); - -#endif /* CONFIG_PM */ -- cgit v1.2.3 
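Most of the fixup code deleted above existed to teach the DMA layer about a fixed offset between CPU physical addresses and the ConneXt's AMBA bus addresses (the dma_direct_set_offset() call covering a 512 MiB window). A hedged sketch of that kind of windowed translation; the window size mirrors the deleted STA2X11_AMBA_SIZE, while the bus base is a made-up value rather than anything read from the real chip:

  #include <stdint.h>
  #include <stdio.h>

  /* Example window: CPU physical [0, 512 MiB) is visible on the bus at bus_base. */
  #define WINDOW_SIZE     (512ull << 20)

  static uint64_t cpu_to_bus(uint64_t cpu_addr, uint64_t bus_base)
  {
          if (cpu_addr >= WINDOW_SIZE)
                  return UINT64_MAX;      /* outside the mapped window: not reachable by the device */
          return bus_base + cpu_addr;
  }

  int main(void)
  {
          uint64_t bus_base = 0x20000000; /* illustrative only */

          printf("cpu 0x00001000 -> bus %#llx\n",
                 (unsigned long long)cpu_to_bus(0x1000, bus_base));
          printf("cpu 0x30000000 -> bus %#llx (unmapped)\n",
                 (unsigned long long)cpu_to_bus(0x30000000, bus_base));
          return 0;
  }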
From 976ba8da2f3c2f1e997f4f620da83ae65c0e3728 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Feb 2025 22:37:14 +0100 Subject: x86/platform: Only allow CONFIG_EISA for 32-bit The CONFIG_EISA menu was cleaned up in 2018, but this inadvertently brought the option back on 64-bit machines: ISA remains guarded by a CONFIG_X86_32 check, but EISA no longer depends on ISA. The last Intel machines ith EISA support used a 82375EB PCI/EISA bridge from 1993 that could be paired with the 440FX chipset on early Pentium-II CPUs, long before the first x86-64 products. Fixes: 6630a8e50105 ("eisa: consolidate EISA Kconfig entry in drivers/eisa") Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250226213714.4040853-11-arnd@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 383b145ffe9f..aa90f0355be1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -233,7 +233,7 @@ config X86 select HAVE_SAMPLE_FTRACE_DIRECT_MULTI if X86_64 select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_EISA + select HAVE_EISA if X86_32 select HAVE_EXIT_THREAD select HAVE_GUP_FAST select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE -- cgit v1.2.3 From 4a412c70af674198749fd16be695d53e1c41b5f9 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:24 -0800 Subject: x86/cpu: Prefix hexadecimal values with 0x in cpu_debug_show() The hex values in CPU debug interface are not prefixed with 0x. This may cause misinterpretation of values. Fix it. [ mingo: Restore previous vertical alignment of the output. ] Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-1-2ae010f50370@linux.intel.com --- arch/x86/kernel/cpu/debugfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c index cacfd3f6abef..1976fef2dfe5 100644 --- a/arch/x86/kernel/cpu/debugfs.c +++ b/arch/x86/kernel/cpu/debugfs.c @@ -16,8 +16,8 @@ static int cpu_debug_show(struct seq_file *m, void *p) if (!c->initialized) return 0; - seq_printf(m, "initial_apicid: %x\n", c->topo.initial_apicid); - seq_printf(m, "apicid: %x\n", c->topo.apicid); + seq_printf(m, "initial_apicid: 0x%x\n", c->topo.initial_apicid); + seq_printf(m, "apicid: 0x%x\n", c->topo.apicid); seq_printf(m, "pkg_id: %u\n", c->topo.pkg_id); seq_printf(m, "die_id: %u\n", c->topo.die_id); seq_printf(m, "cu_id: %u\n", c->topo.cu_id); -- cgit v1.2.3 From c4a8b7116b9927f7b00bd68140e285662a03068e Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:36 -0800 Subject: perf/x86/intel: Use cache cpu-type for hybrid PMU selection get_this_hybrid_cpu_type() misses a case when cpu-type is populated regardless of X86_FEATURE_HYBRID_CPU. This is particularly true for hybrid variants that have P or E cores fused off. Instead use the cpu-type cached in struct x86_topology, as it does not rely on hybrid feature to enumerate cpu-type. This can also help avoid the model-specific fixup get_hybrid_cpu_type(). Also replace the get_this_hybrid_cpu_native_id() with its cached value in struct x86_topology. While at it, remove enum hybrid_cpu_type as it serves no purpose when we have the exact cpu-types defined in enum intel_cpu_type. Also rename atom_native_id to intel_native_id and move it to intel-family.h where intel_cpu_type lives. 
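The CPUID.1AH layout this relies on (core type in EAX[31:24], where 0x20 means Atom and 0x40 means Core, plus a native model ID in EAX[23:0]) can be inspected directly from userspace. A hedged sketch, assuming an x86 compiler that ships <cpuid.h>; on CPUs that predate the leaf the helper simply returns 0 and the program says so:

  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax = 0, ebx, ecx, edx;

          if (!__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx) || eax == 0) {
                  puts("CPUID leaf 0x1a not populated (not a hybrid-aware CPU?)");
                  return 0;
          }

          unsigned int core_type = eax >> 24;         /* 0x20 = Atom, 0x40 = Core */
          unsigned int native_id = eax & 0x00ffffff;  /* microarchitecture-specific ID */

          printf("core type:       0x%x (%s)\n", core_type,
                 core_type == 0x40 ? "Core" : core_type == 0x20 ? "Atom" : "unknown");
          printf("native model id: 0x%x\n", native_id);
          return 0;
  }

Note that the program only reports the core it happens to run on; pinning it to different CPUs with taskset shows the per-core values that the kernel caches at boot.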
Suggested-by: Dave Hansen Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-3-2ae010f50370@linux.intel.com --- arch/x86/events/intel/core.c | 19 ++++++++++--------- arch/x86/events/perf_event.h | 19 +------------------ arch/x86/include/asm/intel-family.h | 15 ++++++++++++++- 3 files changed, 25 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 3cf65e93a03f..397c545b8610 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4606,9 +4606,9 @@ static int adl_hw_config(struct perf_event *event) return -EOPNOTSUPP; } -static enum hybrid_cpu_type adl_get_hybrid_cpu_type(void) +static enum intel_cpu_type adl_get_hybrid_cpu_type(void) { - return HYBRID_INTEL_CORE; + return INTEL_CPU_TYPE_CORE; } static inline bool erratum_hsw11(struct perf_event *event) @@ -4953,7 +4953,8 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu) static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) { - u8 cpu_type = get_this_hybrid_cpu_type(); + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + enum intel_cpu_type cpu_type = c->topo.intel_type; int i; /* @@ -4962,7 +4963,7 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) * on it. There should be a fixup function provided for these * troublesome CPUs (->get_hybrid_cpu_type). */ - if (cpu_type == HYBRID_INTEL_NONE) { + if (cpu_type == INTEL_CPU_TYPE_UNKNOWN) { if (x86_pmu.get_hybrid_cpu_type) cpu_type = x86_pmu.get_hybrid_cpu_type(); else @@ -4979,16 +4980,16 @@ static struct x86_hybrid_pmu *find_hybrid_pmu_for_cpu(void) enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; u32 native_id; - if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big) + if (cpu_type == INTEL_CPU_TYPE_CORE && pmu_type == hybrid_big) return &x86_pmu.hybrid_pmu[i]; - if (cpu_type == HYBRID_INTEL_ATOM) { + if (cpu_type == INTEL_CPU_TYPE_ATOM) { if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - native_id = get_this_hybrid_cpu_native_id(); - if (native_id == skt_native_id && pmu_type == hybrid_small) + native_id = c->topo.intel_native_model_id; + if (native_id == INTEL_ATOM_SKT_NATIVE_ID && pmu_type == hybrid_small) return &x86_pmu.hybrid_pmu[i]; - if (native_id == cmt_native_id && pmu_type == hybrid_tiny) + if (native_id == INTEL_ATOM_CMT_NATIVE_ID && pmu_type == hybrid_tiny) return &x86_pmu.hybrid_pmu[i]; } } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 31c2771545a6..7b18754084a6 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -669,18 +669,6 @@ enum { #define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 #define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) -/* - * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture - * of the core. Bits 31-24 indicates its core type (Core or Atom) - * and Bits [23:0] indicates the native model ID of the core. - * Core type and native model ID are defined in below enumerations. 
- */ -enum hybrid_cpu_type { - HYBRID_INTEL_NONE, - HYBRID_INTEL_ATOM = 0x20, - HYBRID_INTEL_CORE = 0x40, -}; - #define X86_HYBRID_PMU_ATOM_IDX 0 #define X86_HYBRID_PMU_CORE_IDX 1 #define X86_HYBRID_PMU_TINY_IDX 2 @@ -697,11 +685,6 @@ enum hybrid_pmu_type { hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, }; -enum atom_native_id { - cmt_native_id = 0x2, /* Crestmont */ - skt_native_id = 0x3, /* Skymont */ -}; - struct x86_hybrid_pmu { struct pmu pmu; const char *name; @@ -994,7 +977,7 @@ struct x86_pmu { */ int num_hybrid_pmus; struct x86_hybrid_pmu *hybrid_pmu; - enum hybrid_cpu_type (*get_hybrid_cpu_type) (void); + enum intel_cpu_type (*get_hybrid_cpu_type) (void); }; struct x86_perf_task_context_opt { diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f9f67afeb48a..b657d78071c6 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -182,10 +182,23 @@ /* Family 19 */ #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ -/* CPU core types */ +/* + * Intel CPU core types + * + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture + * of the core. Bits 31-24 indicates its core type (Core or Atom) + * and Bits [23:0] indicates the native model ID of the core. + * Core type and native model ID are defined in below enumerations. + */ enum intel_cpu_type { + INTEL_CPU_TYPE_UNKNOWN, INTEL_CPU_TYPE_ATOM = 0x20, INTEL_CPU_TYPE_CORE = 0x40, }; +enum intel_native_id { + INTEL_ATOM_CMT_NATIVE_ID = 0x2, /* Crestmont */ + INTEL_ATOM_SKT_NATIVE_ID = 0x3, /* Skymont */ +}; + #endif /* _ASM_X86_INTEL_FAMILY_H */ -- cgit v1.2.3 From db5157df149709c02e6a08c0b3498553bdd2a76c Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 11 Dec 2024 22:57:41 -0800 Subject: x86/cpu: Remove get_this_hybrid_cpu_*() Because calls to get_this_hybrid_cpu_type() and get_this_hybrid_cpu_native_id() are not required now. cpu-type and native-model-id are cached at boot in per-cpu struct cpuinfo_topology. Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Acked-by: Dave Hansen Link: https://lore.kernel.org/r/20241211-add-cpu-type-v5-4-2ae010f50370@linux.intel.com --- arch/x86/include/asm/cpu.h | 14 -------------- arch/x86/kernel/cpu/intel.c | 31 ------------------------------- 2 files changed, 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 98eced5084ca..0c8ec62789a1 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -50,20 +50,6 @@ static inline void split_lock_init(void) {} static inline void bus_lock_init(void) {} #endif -#ifdef CONFIG_CPU_SUP_INTEL -u8 get_this_hybrid_cpu_type(void); -u32 get_this_hybrid_cpu_native_id(void); -#else -static inline u8 get_this_hybrid_cpu_type(void) -{ - return 0; -} - -static inline u32 get_this_hybrid_cpu_native_id(void) -{ - return 0; -} -#endif #ifdef CONFIG_IA32_FEAT_CTL void init_ia32_feat_ctl(struct cpuinfo_x86 *c); #else diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3dce22f00dc3..045b439c653a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -873,34 +873,3 @@ static const struct cpu_dev intel_cpu_dev = { }; cpu_dev_register(intel_cpu_dev); - -#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24 - -/** - * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU - * - * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in - * a hybrid processor. If the processor is not hybrid, returns 0. 
- */ -u8 get_this_hybrid_cpu_type(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; -} - -/** - * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU - * - * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. - * If the processor is not hybrid, returns 0. - */ -u32 get_this_hybrid_cpu_native_id(void) -{ - if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) - return 0; - - return cpuid_eax(0x0000001a) & - (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); -} -- cgit v1.2.3 From 02410ac72ac3707936c07ede66e94360d0d65319 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:51 +0000 Subject: mm: hugetlb: Add huge page size param to huge_ptep_get_and_clear() In order to fix a bug, arm64 needs to be told the size of the huge page for which the huge_pte is being cleared in huge_ptep_get_and_clear(). Provide for this by adding an `unsigned long sz` parameter to the function. This follows the same pattern as huge_pte_clear() and set_huge_pte_at(). This commit makes the required interface modifications to the core mm as well as all arches that implement this function (arm64, loongarch, mips, parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed in a separate commit. Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Acked-by: David Hildenbrand Reviewed-by: Alexandre Ghiti # riscv Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Signed-off-by: Ryan Roberts Acked-by: Alexander Gordeev # s390 Link: https://lore.kernel.org/r/20250226120656.2400136-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/hugetlb.h | 4 ++-- arch/arm64/mm/hugetlbpage.c | 8 +++++--- arch/loongarch/include/asm/hugetlb.h | 6 ++++-- arch/mips/include/asm/hugetlb.h | 6 ++++-- arch/parisc/include/asm/hugetlb.h | 2 +- arch/parisc/mm/hugetlbpage.c | 2 +- arch/powerpc/include/asm/hugetlb.h | 6 ++++-- arch/riscv/include/asm/hugetlb.h | 3 ++- arch/riscv/mm/hugetlbpage.c | 2 +- arch/s390/include/asm/hugetlb.h | 16 ++++++++++++---- arch/s390/mm/hugetlbpage.c | 4 ++-- arch/sparc/include/asm/hugetlb.h | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- 13 files changed, 40 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6dff3e69539..03db9cb21ace 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,8 +42,8 @@ extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 98a2a0e64e25..06db4649af91 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -396,8 +396,8 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, __pte_clear(mm, addr, ptep); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + 
pte_t *ptep, unsigned long sz) { int ncontig; size_t pgsize; @@ -549,6 +549,8 @@ bool __init arch_hugetlb_valid_size(unsigned long size) pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long psize = huge_page_size(hstate_vma(vma)); + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings @@ -558,7 +560,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index c8e4057734d0..4dc4b3e04225 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -36,7 +36,8 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = ptep_get(ptep); @@ -51,8 +52,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index d0a86ce83de9..fbc71ddcf0f6 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -27,7 +27,8 @@ static inline int prepare_hugepage_range(struct file *file, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = *ptep; @@ -42,13 +43,14 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); /* * clear the huge pte entry firstly, so that the other smp threads will * not get old pte entry after finishing flush_tlb_page and before * setting new huge pte entry */ - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 5b3a5429f71b..21e9ace17739 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -10,7 +10,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index e9d18cf25b79..a94fe546d434 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -126,7 +126,7 @@ void set_huge_pte_at(struct 
mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t entry; diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dad2e7980f24..86326587e58d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -45,7 +45,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); } @@ -55,8 +56,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_hugetlb_page(vma, addr); return pte; } diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index faf3624d8057..446126497768 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -28,7 +28,8 @@ void set_huge_pte_at(struct mm_struct *mm, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); + unsigned long addr, pte_t *ptep, + unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 42314f093922..b4a78a4b35cf 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -293,7 +293,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t orig_pte = ptep_get(ptep); int pte_num; diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7c52acaf9f82..663e87220e89 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -25,8 +25,16 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned long sz) +{ + return __huge_ptep_get_and_clear(mm, addr, ptep); +} static inline void arch_clear_hugetlb_flags(struct folio *folio) { @@ -48,7 +56,7 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return __huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -59,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __huge_ptep_get_and_clear(vma->vm_mm, 
addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; @@ -69,7 +77,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __huge_ptep_get_and_clear(mm, addr, ptep); __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d9ce199953de..2e568f175cd4 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -188,8 +188,8 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index c714ca6a05aa..e7a9cdd498dc 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,7 +20,7 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index eee601a0d2cf..80504148d8a5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -260,7 +260,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { unsigned int i, nptes, orig_shift, shift; unsigned long size; -- cgit v1.2.3 From 49c87f7677746f3c5bd16c81b23700bb6b88bfd4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:52 +0000 Subject: arm64: hugetlb: Fix huge_ptep_get_and_clear() for non-present ptes arm64 supports multiple huge_pte sizes. Some of the sizes are covered by a single pte entry at a particular level (PMD_SIZE, PUD_SIZE), and some are covered by multiple ptes at a particular level (CONT_PTE_SIZE, CONT_PMD_SIZE). So the function has to figure out the size from the huge_pte pointer. This was previously done by walking the pgtable to determine the level and by using the PTE_CONT bit to determine the number of ptes at the level. But the PTE_CONT bit is only valid when the pte is present. For non-present pte values (e.g. markers, migration entries), the previous implementation was therefore erroneously determining the size. There is at least one known caller in core-mm, move_huge_pte(), which may call huge_ptep_get_and_clear() for a non-present pte. So we must be robust to this case. Additionally the "regular" ptep_get_and_clear() is robust to being called for non-present ptes so it makes sense to follow the behavior. Fix this by using the new sz parameter which is now provided to the function. Additionally when clearing each pte in a contig range, don't gather the access and dirty bits if the pte is not present. An alternative approach that would not require API changes would be to store the PTE_CONT bit in a spare bit in the swap entry pte for the non-present case. 
But it felt cleaner to follow other APIs' lead and just pass in the size. As an aside, PTE_CONT is bit 52, which corresponds to bit 40 in the swap entry offset field (layout of non-present pte). Since hugetlb is never swapped to disk, this field will only be populated for markers, which always set this bit to 0 and hwpoison swap entries, which set the offset field to a PFN; So it would only ever be 1 for a 52-bit PVA system where memory in that high half was poisoned (I think!). So in practice, this bit would almost always be zero for non-present ptes and we would only clear the first entry if it was actually a contiguous block. That's probably a less severe symptom than if it was always interpreted as 1 and cleared out potentially-present neighboring PTEs. Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250226120656.2400136-3-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/hugetlbpage.c | 53 +++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 06db4649af91..b3a7fafe8892 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -100,20 +100,11 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, static inline int num_contig_ptes(unsigned long size, size_t *pgsize) { - int contig_ptes = 0; + int contig_ptes = 1; *pgsize = size; switch (size) { -#ifndef __PAGETABLE_PMD_FOLDED - case PUD_SIZE: - if (pud_sect_supported()) - contig_ptes = 1; - break; -#endif - case PMD_SIZE: - contig_ptes = 1; - break; case CONT_PMD_SIZE: *pgsize = PMD_SIZE; contig_ptes = CONT_PMDS; @@ -122,6 +113,8 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize) *pgsize = PAGE_SIZE; contig_ptes = CONT_PTES; break; + default: + WARN_ON(!__hugetlb_valid_size(size)); } return contig_ptes; @@ -163,24 +156,23 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = __ptep_get(ptep); - unsigned long i; - - for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = __ptep_get_and_clear(mm, addr, ptep); - - /* - * If HW_AFDBM is enabled, then the HW could turn on - * the dirty or accessed bit for any page in the set, - * so check them all. 
- */ - if (pte_dirty(pte)) - orig_pte = pte_mkdirty(orig_pte); - - if (pte_young(pte)) - orig_pte = pte_mkyoung(orig_pte); + pte_t pte, tmp_pte; + bool present; + + pte = __ptep_get_and_clear(mm, addr, ptep); + present = pte_present(pte); + while (--ncontig) { + ptep++; + addr += pgsize; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (present) { + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } } - return orig_pte; + return pte; } static pte_t get_clear_contig_flush(struct mm_struct *mm, @@ -401,13 +393,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, { int ncontig; size_t pgsize; - pte_t orig_pte = __ptep_get(ptep); - - if (!pte_cont(orig_pte)) - return __ptep_get_and_clear(mm, addr, ptep); - - ncontig = find_num_contig(mm, addr, ptep, &pgsize); + ncontig = num_contig_ptes(sz, &pgsize); return get_clear_contig(mm, addr, ptep, pgsize, ncontig); } -- cgit v1.2.3 From eed6bfa8b28230382b797a88569f2c7569a1a419 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:53 +0000 Subject: arm64: hugetlb: Fix flush_hugetlb_tlb_range() invalidation level commit c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") changed the "invalidation level unknown" hint from 0 to TLBI_TTL_UNKNOWN (INT_MAX). But the fallback "unknown level" path in flush_hugetlb_tlb_range() was not updated. So as it stands, when trying to invalidate CONT_PMD_SIZE or CONT_PTE_SIZE hugetlb mappings, we will spuriously try to invalidate at level 0 on LPA2-enabled systems. Fix this so that the fallback passes TLBI_TTL_UNKNOWN, and while we are at it, explicitly use the correct stride and level for CONT_PMD_SIZE and CONT_PTE_SIZE, which should provide a minor optimization. 
Cc: stable@vger.kernel.org Fixes: c910f2b65518 ("arm64/mm: Update tlb invalidation routines for FEAT_LPA2") Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250226120656.2400136-4-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/hugetlb.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 03db9cb21ace..07fbf5bf85a7 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -76,12 +76,22 @@ static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, { unsigned long stride = huge_page_size(hstate_vma(vma)); - if (stride == PMD_SIZE) - __flush_tlb_range(vma, start, end, stride, false, 2); - else if (stride == PUD_SIZE) - __flush_tlb_range(vma, start, end, stride, false, 1); - else - __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); + switch (stride) { +#ifndef __PAGETABLE_PMD_FOLDED + case PUD_SIZE: + __flush_tlb_range(vma, start, end, PUD_SIZE, false, 1); + break; +#endif + case CONT_PMD_SIZE: + case PMD_SIZE: + __flush_tlb_range(vma, start, end, PMD_SIZE, false, 2); + break; + case CONT_PTE_SIZE: + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 3); + break; + default: + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN); + } } #endif /* __ASM_HUGETLB_H */ -- cgit v1.2.3 From a4248ee16f411ac1ea7dfab228a6659b111e3d65 Mon Sep 17 00:00:00 2001 From: Max Grobecker Date: Thu, 27 Feb 2025 21:45:05 +0100 Subject: x86/cpu: Don't clear X86_FEATURE_LAHF_LM flag in init_amd_k8() on AMD when running in a virtual machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running in a virtual machine, we might see the original hardware CPU vendor string (i.e. "AuthenticAMD"), but a model and family ID set by the hypervisor. In case we run on AMD hardware and the hypervisor sets a model ID < 0x14, the LAHF cpu feature is eliminated from the list of CPU capabilities present to circumvent a bug with some BIOSes in conjunction with AMD K8 processors. Parsing the flags list from /proc/cpuinfo seems to be happening mostly in bash scripts and prebuilt Docker containers, as it does not need to have additional tools present – even though more reliable ways like using "kcpuid", which calls the CPUID instruction instead of parsing a list, should be preferred. Scripts that use /proc/cpuinfo to determine if the current CPU is "compliant" with defined microarchitecture levels like x86-64-v2 will falsely claim the CPU is incapable of modern CPU instructions when "lahf_lm" is missing in that flags list. This can prevent some Docker containers from starting or cause build scripts to create unoptimized binaries. Admittedly, this is more a small inconvenience than a severe bug in the kernel and the shoddy scripts that rely on parsing /proc/cpuinfo should be fixed instead. This patch adds an additional check to see if we're running inside a virtual machine (X86_FEATURE_HYPERVISOR is present), which, to my understanding, can't be present on a real K8 processor as it was introduced only with the later/other Athlon64 models. 
Example output with the "lahf_lm" flag missing in the flags list (should be shown between "hypervisor" and "abm"): $ cat /proc/cpuinfo processor : 0 vendor_id : AuthenticAMD cpu family : 15 model : 6 model name : Common KVM processor stepping : 1 microcode : 0x1000065 cpu MHz : 2599.998 cache size : 512 KB physical id : 0 siblings : 1 core id : 0 cpu cores : 1 apicid : 0 initial apicid : 0 fpu : yes fpu_exception : yes cpuid level : 13 wp : yes flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx rdtscp lm rep_good nopl cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c hypervisor abm 3dnowprefetch vmmcall bmi1 avx2 bmi2 xsaveopt ... while kcpuid shows the feature to be present in the CPU: # kcpuid -d | grep lahf lahf_lm - LAHF/SAHF available in 64-bit mode [ mingo: Updated the comment a bit, incorporated Boris's review feedback. ] Signed-off-by: Max Grobecker Signed-off-by: Ingo Molnar Cc: linux-kernel@vger.kernel.org Cc: Borislav Petkov --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 54194f5995de..d747515ad013 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -632,7 +632,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c) * (model = 0x14) and later actually support it. * (AMD Erratum #110, docId: 25759). */ - if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM)) { + if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); if (!rdmsrl_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); -- cgit v1.2.3 From f034937f5af32188cd1c07865c885b2f171e17bf Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 20 Dec 2024 15:18:31 +0000 Subject: x86/cpu: Create helper function to parse the 'clearcpuid=' boot parameter This is in preparation for a later commit that will reuse this code, to make review convenient. Factor out a helper function which does the full handling for this arg including printing info to the console. No functional change intended. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220-force-cpu-bug-v2-1-7dc71bce742a@google.com --- arch/x86/kernel/cpu/common.c | 96 ++++++++++++++++++++++++-------------------- 1 file changed, 52 insertions(+), 44 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 76598a93a8fa..137d3e00a5be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1479,56 +1479,18 @@ static void detect_nopl(void) #endif } -/* - * We parse cpu parameters early because fpu__init_system() is executed - * before parse_early_param(). 
- */ -static void __init cpu_parse_early_param(void) +static inline void parse_clearcpuid(char *arg) { - char arg[128]; - char *argptr = arg, *opt; - int arglen, taint = 0; - -#ifdef CONFIG_X86_32 - if (cmdline_find_option_bool(boot_command_line, "no387")) -#ifdef CONFIG_MATH_EMULATION - setup_clear_cpu_cap(X86_FEATURE_FPU); -#else - pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); -#endif - - if (cmdline_find_option_bool(boot_command_line, "nofxsr")) - setup_clear_cpu_cap(X86_FEATURE_FXSR); -#endif - - if (cmdline_find_option_bool(boot_command_line, "noxsave")) - setup_clear_cpu_cap(X86_FEATURE_XSAVE); - - if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - - if (cmdline_find_option_bool(boot_command_line, "noxsaves")) - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - - if (cmdline_find_option_bool(boot_command_line, "nousershstk")) - setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); - - /* Minimize the gap between FRED is available and available but disabled. */ - arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); - if (arglen != 2 || strncmp(arg, "on", 2)) - setup_clear_cpu_cap(X86_FEATURE_FRED); - - arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; + char *opt; + int taint = 0; pr_info("Clearing CPUID bits:"); - while (argptr) { + while (arg) { bool found __maybe_unused = false; unsigned int bit; - opt = strsep(&argptr, ","); + opt = strsep(&arg, ","); /* * Handle naked numbers first for feature flags which don't @@ -1570,10 +1532,56 @@ static void __init cpu_parse_early_param(void) if (!found) pr_cont(" (unknown: %s)", opt); } - pr_cont("\n"); if (taint) add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + + pr_cont("\n"); +} + + +/* + * We parse cpu parameters early because fpu__init_system() is executed + * before parse_early_param(). + */ +static void __init cpu_parse_early_param(void) +{ + char arg[128]; + int arglen; + +#ifdef CONFIG_X86_32 + if (cmdline_find_option_bool(boot_command_line, "no387")) +#ifdef CONFIG_MATH_EMULATION + setup_clear_cpu_cap(X86_FEATURE_FPU); +#else + pr_err("Option 'no387' required CONFIG_MATH_EMULATION enabled.\n"); +#endif + + if (cmdline_find_option_bool(boot_command_line, "nofxsr")) + setup_clear_cpu_cap(X86_FEATURE_FXSR); +#endif + + if (cmdline_find_option_bool(boot_command_line, "noxsave")) + setup_clear_cpu_cap(X86_FEATURE_XSAVE); + + if (cmdline_find_option_bool(boot_command_line, "noxsaveopt")) + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + + if (cmdline_find_option_bool(boot_command_line, "noxsaves")) + setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option_bool(boot_command_line, "nousershstk")) + setup_clear_cpu_cap(X86_FEATURE_USER_SHSTK); + + /* Minimize the gap between FRED is available and available but disabled. */ + arglen = cmdline_find_option(boot_command_line, "fred", arg, sizeof(arg)); + if (arglen != 2 || strncmp(arg, "on", 2)) + setup_clear_cpu_cap(X86_FEATURE_FRED); + + arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); + if (arglen <= 0) + return; + parse_clearcpuid(arg); } /* -- cgit v1.2.3 From 814165e9fd1f62332b5444d730b8d6e432328463 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 20 Dec 2024 15:18:32 +0000 Subject: x86/cpu: Add the 'setcpuid=' boot parameter In preparation for adding support to inject fake CPU bugs at boot-time, add a general facility to force enablement of CPU flags. 
The flag taints the kernel and the documentation attempts to be clear that this is highly unsuitable for uses outside of kernel development and platform experimentation. The new arg is parsed just like clearcpuid, but instead of leading to setup_clear_cpu_cap() it leads to setup_force_cpu_cap(). I've tested this by booting a nested QEMU guest on an Intel host, which with setcpuid=svm will claim that it supports AMD virtualization. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220-force-cpu-bug-v2-2-7dc71bce742a@google.com --- arch/x86/kernel/cpu/common.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 137d3e00a5be..ff483c9a56c3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1479,12 +1479,12 @@ static void detect_nopl(void) #endif } -static inline void parse_clearcpuid(char *arg) +static inline void parse_set_clear_cpuid(char *arg, bool set) { char *opt; int taint = 0; - pr_info("Clearing CPUID bits:"); + pr_info("%s CPUID bits:", set ? "Force-enabling" : "Clearing"); while (arg) { bool found __maybe_unused = false; @@ -1505,7 +1505,10 @@ static inline void parse_clearcpuid(char *arg) else pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); - setup_clear_cpu_cap(bit); + if (set) + setup_force_cpu_cap(bit); + else + setup_clear_cpu_cap(bit); taint++; } /* @@ -1523,7 +1526,10 @@ static inline void parse_clearcpuid(char *arg) continue; pr_cont(" %s", opt); - setup_clear_cpu_cap(bit); + if (set) + setup_force_cpu_cap(bit); + else + setup_clear_cpu_cap(bit); taint++; found = true; break; @@ -1579,9 +1585,12 @@ static void __init cpu_parse_early_param(void) setup_clear_cpu_cap(X86_FEATURE_FRED); arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); - if (arglen <= 0) - return; - parse_clearcpuid(arg); + if (arglen > 0) + parse_set_clear_cpuid(arg, false); + + arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); + if (arglen > 0) + parse_set_clear_cpuid(arg, true); } /* @@ -2013,15 +2022,23 @@ void print_cpu_info(struct cpuinfo_x86 *c) } /* - * clearcpuid= was already parsed in cpu_parse_early_param(). This dummy - * function prevents it from becoming an environment variable for init. + * clearcpuid= and setcpuid= were already parsed in cpu_parse_early_param(). + * These dummy functions prevent them from becoming an environment variable for + * init. */ + static __init int setup_clearcpuid(char *arg) { return 1; } __setup("clearcpuid=", setup_clearcpuid); +static __init int setup_setcpuid(char *arg) +{ + return 1; +} +__setup("setcpuid=", setup_setcpuid); + DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = { .current_task = &init_task, .preempt_count = INIT_PREEMPT_COUNT, -- cgit v1.2.3 From ab68d2e36532806b8f86ff2f60861dbb8443f0be Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 20 Dec 2024 15:18:33 +0000 Subject: x86/cpu: Enable modifying CPU bug flags with '{clear,set}puid=' Sometimes it can be very useful to run CPU vulnerability mitigations on systems where they aren't known to mitigate any real-world vulnerabilities. 
This can be handy for mundane reasons like debugging HW-agnostic logic on whatever machine is to hand, but also for research reasons: while some mitigations are focused on individual vulns and uarches, others are fairly general, and it's strategically useful to have an idea how they'd perform on systems where they aren't currently needed. As evidence for this being useful, a flag specifically for Retbleed was added in: 5c9a92dec323 ("x86/bugs: Add retbleed=force"). Since CPU bugs are tracked using the same basic mechanism as features, and there are already parameters for manipulating them by hand, extend that mechanism to support bug as well as capabilities. With this patch and setcpuid=srso, a QEMU guest running on an Intel host will boot with Safe-RET enabled. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20241220-force-cpu-bug-v2-3-7dc71bce742a@google.com --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/common.c | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index de1ad09fe8d7..e5fc0038c8f6 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -50,6 +50,7 @@ extern const char * const x86_power_flags[32]; * X86_BUG_ - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; +#define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ff483c9a56c3..0f32b6ffbf04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1494,7 +1494,8 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) /* * Handle naked numbers first for feature flags which don't - * have names. + * have names. It doesn't make sense for a bug not to have a + * name so don't handle bug flags here. */ if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { @@ -1518,11 +1519,18 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) continue; } - for (bit = 0; bit < 32 * NCAPINTS; bit++) { - if (!x86_cap_flag(bit)) + for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { + const char *flag; + + if (bit < 32 * NCAPINTS) + flag = x86_cap_flag(bit); + else + flag = x86_bug_flag(bit - (32 * NCAPINTS)); + + if (!flag) continue; - if (strcmp(x86_cap_flag(bit), opt)) + if (strcmp(flag, opt)) continue; pr_cont(" %s", opt); -- cgit v1.2.3 From 909639aa58fe4789644104c1fd89264c57da0979 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin (Intel)" Date: Fri, 28 Feb 2025 00:23:34 -0800 Subject: x86/cpufeatures: Rename X86_CMPXCHG64 to X86_CX8 Replace X86_CMPXCHG64 with X86_CX8, as CX8 is the name of the CPUID flag, thus to make it consistent with X86_FEATURE_CX8 defined in . No functional change intended. Signed-off-by: H. 
Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Ingo Molnar Link: https://lore.kernel.org/r/20250228082338.73859-2-xin@zytor.com --- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.cpu | 4 ++-- arch/x86/include/asm/asm-prototypes.h | 2 +- arch/x86/include/asm/atomic64_32.h | 2 +- arch/x86/include/asm/cmpxchg_32.h | 2 +- arch/x86/include/asm/required-features.h | 2 +- arch/x86/lib/Makefile | 2 +- arch/x86/lib/cmpxchg8b_emu.S | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aa90f0355be1..017035f461cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -133,7 +133,7 @@ config X86 select ARCH_SUPPORTS_AUTOFDO_CLANG select ARCH_SUPPORTS_PROPELLER_CLANG if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_CMPXCHG64 + select ARCH_USE_CMPXCHG_LOCKREF if X86_CX8 select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8fcb8ccee44b..f8b3296fe2e1 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -299,7 +299,7 @@ config X86_HAVE_PAE def_bool y depends on MCRUSOE || MEFFICEON || MCYRIXIII || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC7 || MATOM || X86_64 -config X86_CMPXCHG64 +config X86_CX8 def_bool y depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || MGEODEGX1 || MGEODE_LX @@ -313,7 +313,7 @@ config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MK7) - default "5" if X86_32 && X86_CMPXCHG64 + default "5" if X86_32 && X86_CX8 default "4" config X86_DEBUGCTLMSR diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 3674006e3974..8d9e62725202 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -16,7 +16,7 @@ #include #include -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 extern void cmpxchg8b_emu(void); #endif diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h index 6c6e9b9f98a4..797085ecaaa4 100644 --- a/arch/x86/include/asm/atomic64_32.h +++ b/arch/x86/include/asm/atomic64_32.h @@ -48,7 +48,7 @@ static __always_inline s64 arch_atomic64_read_nonatomic(const atomic64_t *v) ATOMIC64_EXPORT(atomic64_##sym) #endif -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 #define __alternative_atomic64(f, g, out, in...) 
\ asm volatile("call %c[func]" \ : ALT_OUTPUT_SP(out) \ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fd1282a783dd..c38d4ed94cb3 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -69,7 +69,7 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp, return __arch_try_cmpxchg64(ptr, oldp, new,); } -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 #define arch_cmpxchg64 __cmpxchg64 diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index e9187ddd3d1f..0068133cb622 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -23,7 +23,7 @@ # define NEED_PAE 0 #endif -#ifdef CONFIG_X86_CMPXCHG64 +#ifdef CONFIG_X86_CX8 # define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31)) #else # define NEED_CX8 0 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 8a59c61624c2..9bbe2819881f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -56,7 +56,7 @@ ifeq ($(CONFIG_X86_32),y) lib-y += string_32.o lib-y += memmove_32.o lib-y += cmpxchg8b_emu.o -ifneq ($(CONFIG_X86_CMPXCHG64),y) +ifneq ($(CONFIG_X86_CX8),y) lib-y += atomic64_386_32.o endif else diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S index 1c96be769adc..d4bb24347ff8 100644 --- a/arch/x86/lib/cmpxchg8b_emu.S +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -7,7 +7,7 @@ .text -#ifndef CONFIG_X86_CMPXCHG64 +#ifndef CONFIG_X86_CX8 /* * Emulate 'cmpxchg8b (%esi)' on UP -- cgit v1.2.3 From cb380909ae3b1ebf14d6a455a4f92d7916d790cb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:30 -0800 Subject: vhost: return task creation error instead of NULL Lets callers distinguish why the vhost task creation failed. No one currently cares why it failed, so no real runtime change from this patch, but that will not be the case for long. Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-2-kbusch@meta.com> Reviewed-by: Mike Christie Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4ac4a1f8b81..18ca1ea6dc24 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7471,7 +7471,7 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); - if (!nx_thread) + if (IS_ERR(nx_thread)) return; vhost_task_start(nx_thread); -- cgit v1.2.3 From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: kvm: retry nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. If a vCPU task receives the signal while its trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge page task. 
Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [implemented the kvm user side] Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-3-kbusch@meta.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18ca1ea6dc24..8160870398b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7460,7 +7460,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7472,12 +7472,13 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm, "kvm-nx-lpage-recovery"); if (IS_ERR(nx_thread)) - return; + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7485,10 +7486,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) -- cgit v1.2.3 From 010c4a461c1dbf3fa75ddea8df018a6128b700c6 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:43 -0800 Subject: x86/speculation: Simplify and make CALL_NOSPEC consistent CALL_NOSPEC macro is used to generate Spectre-v2 mitigation friendly indirect branches. At compile time the macro defaults to indirect branch, and at runtime those can be patched to thunk based mitigations. This approach is opposite of what is done for the rest of the kernel, where the compile time default is to replace indirect calls with retpoline thunk calls. Make CALL_NOSPEC consistent with the rest of the kernel, default to retpoline thunk at compile time when CONFIG_MITIGATION_RETPOLINE is enabled. Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-1-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e8bf78c03d5..1e6b915ce956 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -424,16 +424,11 @@ static inline void call_depth_return_thunk(void) {} * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#ifdef CONFIG_MITIGATION_RETPOLINE +#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#else +#define CALL_NOSPEC "call *%[thunk_target]\n" +#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) -- cgit v1.2.3 From 9af9ad85ac44cb754e526d468c3006b48db5dfd8 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:58 -0800 Subject: x86/speculation: Add a conditional CS prefix to CALL_NOSPEC Retpoline mitigation for spectre-v2 uses thunks for indirect branches. To support this mitigation compilers add a CS prefix with -mindirect-branch-cs-prefix. For an indirect branch in asm, this needs to be added manually. CS prefix is already being added to indirect branches in asm files, but not in inline asm. Add CS prefix to CALL_NOSPEC for inline asm as well. There is no JMP_NOSPEC for inline asm. Reported-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-2-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1e6b915ce956..aee26bb8230f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -198,9 +198,8 @@ .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -420,12 +419,24 @@ static inline void call_depth_return_thunk(void) {} #ifdef CONFIG_X86_64 +/* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ #ifdef CONFIG_MITIGATION_RETPOLINE -#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" #else #define CALL_NOSPEC "call *%[thunk_target]\n" #endif -- cgit v1.2.3 From 8177c6bedb7013cf736137da586cf783922309dd Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:12 +0100 Subject: x86/cacheinfo: Validate CPUID leaf 0x2 EDX output CPUID leaf 0x2 emits one-byte descriptors in its four output registers EAX, EBX, ECX, and EDX. For these descriptors to be valid, the most significant bit (MSB) of each register must be clear. The historical Git commit: 019361a20f016 ("- pre6: Intel: start to add Pentium IV specific stuff (128-byte cacheline etc)...") introduced leaf 0x2 output parsing. It only validated the MSBs of EAX, EBX, and ECX, but left EDX unchecked. Validate EDX's most-significant bit. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250304085152.51092-2-darwi@linutronix.de --- arch/x86/kernel/cpu/cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index e6fa03ed9172..a6c6bccfa8b8 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -808,7 +808,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; -- cgit v1.2.3 From 1881148215c67151b146450fb89ec22fd92337a7 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:13 +0100 Subject: x86/cpu: Validate CPUID leaf 0x2 EDX output CPUID leaf 0x2 emits one-byte descriptors in its four output registers EAX, EBX, ECX, and EDX. For these descriptors to be valid, the most significant bit (MSB) of each register must be clear. Leaf 0x2 parsing at intel.c only validated the MSBs of EAX, EBX, and ECX, but left EDX unchecked. Validate EDX's most-significant bit as well. Fixes: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: stable@kernel.org Cc: "H. Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250304085152.51092-3-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 3dce22f00dc3..2a3716afee63 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -799,7 +799,7 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 3 ; j++) + for (j = 0 ; j < 4 ; j++) if (regs[j] & (1 << 31)) regs[j] = 0; -- cgit v1.2.3 From f6bdaab79ee4228a143ee1b4cb80416d6ffc0c63 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:14 +0100 Subject: x86/cpu: Properly parse CPUID leaf 0x2 TLB descriptor 0x63 CPUID leaf 0x2's one-byte TLB descriptors report the number of entries for specific TLB types, among other properties. Typically, each emitted descriptor implies the same number of entries for its respective TLB type(s). An emitted 0x63 descriptor is an exception: it implies 4 data TLB entries for 1GB pages and 32 data TLB entries for 2MB or 4MB pages. For the TLB descriptor parsing code, the entry count for 1GB pages is encoded at the intel_tlb_table[] mapping, but the 2MB/4MB entry count is totally ignored. Update leaf 0x2's parsing logic to account for 32 data TLB entries for 2MB/4MB pages implied by the 0x63 descriptor. Fixes: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Cc: stable@kernel.org Cc: "H. 
Peter Anvin" Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250304085152.51092-4-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 50 ++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 2a3716afee63..134368a3f4b1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -635,26 +635,37 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 +#define TLB_INST_4K 0x01 +#define TLB_INST_4M 0x02 +#define TLB_INST_2M_4M 0x03 -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 +#define TLB_INST_ALL 0x05 +#define TLB_INST_1G 0x06 -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 +#define TLB_DATA_4K 0x11 +#define TLB_DATA_4M 0x12 +#define TLB_DATA_2M_4M 0x13 +#define TLB_DATA_4K_4M 0x14 -#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G 0x16 +#define TLB_DATA_1G_2M_4M 0x17 -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 +#define TLB_DATA0_4K 0x21 +#define TLB_DATA0_4M 0x22 +#define TLB_DATA0_2M_4M 0x23 -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 +#define STLB_4K 0x41 +#define STLB_4K_2M 0x42 + +/* + * All of leaf 0x2's one-byte TLB descriptors implies the same number of + * entries for their respective TLB types. The 0x63 descriptor is an + * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries + * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for + * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the + * intel_tlb_table[] mapping. 
+ */ +#define TLB_0x63_2M_4M_ENTRIES 32 static const struct _tlb_table intel_tlb_table[] = { { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, @@ -676,7 +687,8 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, + { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative" + " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" }, { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, @@ -776,6 +788,12 @@ static void intel_tlb_lookup(const unsigned char desc) if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; break; + case TLB_DATA_1G_2M_4M: + if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) + tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) + tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + fallthrough; case TLB_DATA_1G: if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; -- cgit v1.2.3 From 091b768604a8df7822aade75dd5bfc5c788154ee Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 3 Mar 2025 10:37:59 +0100 Subject: xen: Kconfig: Drop reference to obsolete configs MCORE2 and MK8 Commit f388f60ca904 ("x86/cpu: Drop configuration options for early 64-bit CPUs") removes the config symbols MCORE2 and MK8. With that, the references to those two config symbols in xen's x86 Kconfig are obsolete. Drop them. Fixes: f388f60ca904 ("x86/cpu: Drop configuration options for early 64-bit CPUs") Signed-off-by: Lukas Bulwahn Signed-off-by: Ingo Molnar Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20250303093759.371445-1-lukas.bulwahn@redhat.com --- arch/x86/xen/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 77e788e928cd..98d8a50d2aed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,7 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) - depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the -- cgit v1.2.3 From 4e32645cd8f97a308300623f81c902747df6b97b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sun, 2 Mar 2025 16:48:51 -0800 Subject: x86/smp: Fix mwait_play_dead() and acpi_processor_ffh_play_dead() noreturn behavior Fix some related issues (done in a single patch to avoid introducing intermediate bisect warnings): 1) The SMP version of mwait_play_dead() doesn't return, but its !SMP counterpart does. Make its calling behavior consistent by resolving the !SMP version to a BUG(). It should never be called anyway, this just enforces that at runtime and enables its callers to be marked as __noreturn. 2) While the SMP definition of mwait_play_dead() is annotated as __noreturn, the declaration isn't. Nor is it listed in tools/objtool/noreturns.h. Fix that. 
3) Similar to #1, the SMP version of acpi_processor_ffh_play_dead() doesn't return but its !SMP counterpart does. Make the !SMP version a BUG(). It should never be called. 4) acpi_processor_ffh_play_dead() doesn't return, but is lacking any __noreturn annotations. Fix that. This fixes the following objtool warnings: vmlinux.o: warning: objtool: acpi_processor_ffh_play_dead+0x67: mwait_play_dead() is missing a __noreturn annotation vmlinux.o: warning: objtool: acpi_idle_play_dead+0x3c: acpi_processor_ffh_play_dead() is missing a __noreturn annotation Fixes: a7dd183f0b38 ("x86/smp: Allow calling mwait_play_dead with an arbitrary hint") Fixes: 541ddf31e300 ("ACPI/processor_idle: Add FFH state handling") Reported-by: Paul E. McKenney Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Tested-by: Paul E. McKenney Link: https://lore.kernel.org/r/e885c6fa9e96a61471b33e48c2162d28b15b14c5.1740962711.git.jpoimboe@kernel.org --- arch/x86/include/asm/smp.h | 4 ++-- arch/x86/kernel/acpi/cstate.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 80f8bfd83fc7..1d3b11eba084 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -114,7 +114,7 @@ void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); void smp_kick_mwait_play_dead(void); -void mwait_play_dead(unsigned int eax_hint); +void __noreturn mwait_play_dead(unsigned int eax_hint); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); @@ -166,7 +166,7 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu) return (struct cpumask *)cpumask_of(0); } -static inline void mwait_play_dead(unsigned int eax_hint) { } +static inline void __noreturn mwait_play_dead(unsigned int eax_hint) { BUG(); } #endif /* CONFIG_SMP */ #ifdef CONFIG_DEBUG_NMI_SELFTEST diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 86c87c01d23d..d25584255ab8 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -206,7 +206,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -void acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) +void __noreturn acpi_processor_ffh_play_dead(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); struct cstate_entry *percpu_entry; -- cgit v1.2.3 From cfceff8526a426948b53445c02bcb98453c7330d Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:43 -0800 Subject: x86/speculation: Simplify and make CALL_NOSPEC consistent CALL_NOSPEC macro is used to generate Spectre-v2 mitigation friendly indirect branches. At compile time the macro defaults to indirect branch, and at runtime those can be patched to thunk based mitigations. This approach is opposite of what is done for the rest of the kernel, where the compile time default is to replace indirect calls with retpoline thunk calls. Make CALL_NOSPEC consistent with the rest of the kernel, default to retpoline thunk at compile time when CONFIG_MITIGATION_RETPOLINE is enabled. 
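For illustration only (this snippet is not part of the patch; the wrapper name, argument passing and clobber list are assumptions chosen for the example), a call site typically feeds CALL_NOSPEC a register operand via THUNK_TARGET() from inline asm:

  /* Hypothetical wrapper: one-argument indirect call via CALL_NOSPEC. */
  static unsigned long nospec_call1(unsigned long (*fn)(unsigned long),
                                    unsigned long arg)
  {
          unsigned long ret;

          /* The argument travels in %rdi, the return value comes back in %rax. */
          asm volatile(CALL_NOSPEC
                       : "=a" (ret), "+D" (arg)
                       : THUNK_TARGET(fn)
                       : "rcx", "rdx", "rsi", "r8", "r9", "r10", "r11",
                         "memory", "cc");
          return ret;
  }

With the change below, a kernel built with CONFIG_MITIGATION_RETPOLINE=y always emits the __x86_indirect_thunk_<reg> call at such a site at compile time, rather than starting from a plain indirect call and patching it at runtime.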
Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-1-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e8bf78c03d5..1e6b915ce956 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -424,16 +424,11 @@ static inline void call_depth_return_thunk(void) {} * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. */ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#ifdef CONFIG_MITIGATION_RETPOLINE +#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#else +#define CALL_NOSPEC "call *%[thunk_target]\n" +#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) -- cgit v1.2.3 From 052040e34c08428a5a388b85787e8531970c0c67 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:58 -0800 Subject: x86/speculation: Add a conditional CS prefix to CALL_NOSPEC Retpoline mitigation for spectre-v2 uses thunks for indirect branches. To support this mitigation compilers add a CS prefix with -mindirect-branch-cs-prefix. For an indirect branch in asm, this needs to be added manually. CS prefix is already being added to indirect branches in asm files, but not in inline asm. Add CS prefix to CALL_NOSPEC for inline asm as well. There is no JMP_NOSPEC for inline asm. Reported-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-2-96599fed0f33@linux.intel.com --- arch/x86/include/asm/nospec-branch.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1e6b915ce956..aee26bb8230f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -198,9 +198,8 @@ .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -420,12 +419,24 @@ static inline void call_depth_return_thunk(void) {} #ifdef CONFIG_X86_64 +/* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_MITIGATION_RETPOLINE is defined. 
*/ #ifdef CONFIG_MITIGATION_RETPOLINE -#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" #else #define CALL_NOSPEC "call *%[thunk_target]\n" #endif -- cgit v1.2.3 From 27c3b452c1a554483ac692702639c826602d1089 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 3 Mar 2025 15:45:37 +0000 Subject: x86/cpu: Remove unnecessary macro indirection related to CPU feature names These macros used to abstract over CONFIG_X86_FEATURE_NAMES, but that was removed in: 7583e8fbdc49 ("x86/cpu: Remove X86_FEATURE_NAMES") Now they are just an unnecessary indirection, remove them. Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250303-setcpuid-taint-louder-v1-1-8d255032cb4c@google.com --- arch/x86/include/asm/cpufeature.h | 5 ----- arch/x86/kernel/cpu/common.c | 12 ++++++------ 2 files changed, 6 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e5fc0038c8f6..e955da397e0e 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -37,13 +37,8 @@ enum cpuid_leafs NR_CPUID_WORDS, }; -#define X86_CAP_FMT_NUM "%d:%d" -#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) - extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; -#define X86_CAP_FMT "%s" -#define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0f32b6ffbf04..b5fdaa6fd4c4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -667,8 +667,8 @@ static void filter_cpuid_features(struct cpuinfo_x86 *c, bool warn) if (!warn) continue; - pr_warn("CPU: CPU feature " X86_CAP_FMT " disabled, no CPUID level 0x%x\n", - x86_cap_flag(df->feature), df->level); + pr_warn("CPU: CPU feature %s disabled, no CPUID level 0x%x\n", + x86_cap_flags[df->feature], df->level); } } @@ -1502,9 +1502,9 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) /* empty-string, i.e., ""-defined feature flags */ if (!x86_cap_flags[bit]) - pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + pr_cont(" %d:%d", bit >> 5, bit & 31); else - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + pr_cont(" %s", x86_cap_flags[bit]); if (set) setup_force_cpu_cap(bit); @@ -1523,9 +1523,9 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) const char *flag; if (bit < 32 * NCAPINTS) - flag = x86_cap_flag(bit); + flag = x86_cap_flags[bit]; else - flag = x86_bug_flag(bit - (32 * NCAPINTS)); + flag = x86_bug_flags[bit - (32 * NCAPINTS)]; if (!flag) continue; -- cgit v1.2.3 From 681955761bf6845241c6d33e2fb222f5e92c8b89 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 3 Mar 2025 15:45:38 +0000 Subject: x86/cpu: Warn louder about the {set,clear}cpuid boot parameters Commit 814165e9fd1f6 ("x86/cpu: Add the 'setcpuid=' boot parameter") recently expanded the user's ability to break their system horribly by overriding effective CPU flags. This was reflected with updates to the documentation to try and make people aware that this is dangerous. 
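As a purely illustrative example (the flag names are arbitrary picks, not a recommendation), booting with something like:

  clearcpuid=avx,avx2 setcpuid=hypervisor

forcibly clears or sets the corresponding feature bits behind the hardware's back, which is exactly the kind of override that should be loudly visible in the logs and reflected in the kernel's taint state.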
To further reduce the risk of users mistaking this for a "real feature", and try to help them figure out why their kernel is tainted if they do use it: - Upgrade the existing printk to pr_warn, to help ensure kernel logs reflect what changes are in effect. - Print an extra warning that tries to be as dramatic as possible, while also highlighting the fact that it tainted the kernel. Suggested-by: Ingo Molnar Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250303-setcpuid-taint-louder-v1-2-8d255032cb4c@google.com --- arch/x86/kernel/cpu/common.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b5fdaa6fd4c4..c1ced31f976d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1479,12 +1479,12 @@ static void detect_nopl(void) #endif } -static inline void parse_set_clear_cpuid(char *arg, bool set) +static inline bool parse_set_clear_cpuid(char *arg, bool set) { char *opt; int taint = 0; - pr_info("%s CPUID bits:", set ? "Force-enabling" : "Clearing"); + pr_warn("%s CPUID bits:", set ? "Force-enabling" : "Clearing"); while (arg) { bool found __maybe_unused = false; @@ -1547,10 +1547,9 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) pr_cont(" (unknown: %s)", opt); } - if (taint) - add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); - pr_cont("\n"); + + return taint; } @@ -1560,6 +1559,7 @@ static inline void parse_set_clear_cpuid(char *arg, bool set) */ static void __init cpu_parse_early_param(void) { + bool cpuid_taint = false; char arg[128]; int arglen; @@ -1594,11 +1594,16 @@ static void __init cpu_parse_early_param(void) arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg)); if (arglen > 0) - parse_set_clear_cpuid(arg, false); + cpuid_taint |= parse_set_clear_cpuid(arg, false); arglen = cmdline_find_option(boot_command_line, "setcpuid", arg, sizeof(arg)); if (arglen > 0) - parse_set_clear_cpuid(arg, true); + cpuid_taint |= parse_set_clear_cpuid(arg, true); + + if (cpuid_taint) { + pr_warn("!!! setcpuid=/clearcpuid= in use, this is for TESTING ONLY, may break things horribly. Tainting kernel.\n"); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } } /* -- cgit v1.2.3 From d0ba9bcf001c7907e4755b0e498f5ff9d1a228ef Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Mon, 3 Mar 2025 15:45:39 +0000 Subject: x86/cpu: Log CPU flag cmdline hacks more verbosely Since using these options is very dangerous, make details as visible as possible: - Instead of a single message for each of the cmdline options, print a separate pr_warn() for each individual flag. - Say explicitly whether the flag is a "feature" or a "bug". Suggested-by: Peter Zijlstra Signed-off-by: Brendan Jackman Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20250303-setcpuid-taint-louder-v1-3-8d255032cb4c@google.com --- arch/x86/kernel/cpu/common.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c1ced31f976d..8eba9ca9c216 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1484,8 +1484,6 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) char *opt; int taint = 0; - pr_warn("%s CPUID bits:", set ? 
"Force-enabling" : "Clearing"); - while (arg) { bool found __maybe_unused = false; unsigned int bit; @@ -1500,16 +1498,19 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) if (!kstrtouint(opt, 10, &bit)) { if (bit < NCAPINTS * 32) { + if (set) { + pr_warn("setcpuid: force-enabling CPU feature flag:"); + setup_force_cpu_cap(bit); + } else { + pr_warn("clearcpuid: force-disabling CPU feature flag:"); + setup_clear_cpu_cap(bit); + } /* empty-string, i.e., ""-defined feature flags */ if (!x86_cap_flags[bit]) - pr_cont(" %d:%d", bit >> 5, bit & 31); + pr_cont(" %d:%d\n", bit >> 5, bit & 31); else - pr_cont(" %s", x86_cap_flags[bit]); + pr_cont(" %s\n", x86_cap_flags[bit]); - if (set) - setup_force_cpu_cap(bit); - else - setup_clear_cpu_cap(bit); taint++; } /* @@ -1521,11 +1522,15 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) for (bit = 0; bit < 32 * (NCAPINTS + NBUGINTS); bit++) { const char *flag; + const char *kind; - if (bit < 32 * NCAPINTS) + if (bit < 32 * NCAPINTS) { flag = x86_cap_flags[bit]; - else + kind = "feature"; + } else { + kind = "bug"; flag = x86_bug_flags[bit - (32 * NCAPINTS)]; + } if (!flag) continue; @@ -1533,22 +1538,24 @@ static inline bool parse_set_clear_cpuid(char *arg, bool set) if (strcmp(flag, opt)) continue; - pr_cont(" %s", opt); - if (set) + if (set) { + pr_warn("setcpuid: force-enabling CPU %s flag: %s\n", + kind, flag); setup_force_cpu_cap(bit); - else + } else { + pr_warn("clearcpuid: force-disabling CPU %s flag: %s\n", + kind, flag); setup_clear_cpu_cap(bit); + } taint++; found = true; break; } if (!found) - pr_cont(" (unknown: %s)", opt); + pr_warn("%s: unknown CPU flag: %s", set ? "setcpuid" : "clearcpuid", opt); } - pr_cont("\n"); - return taint; } -- cgit v1.2.3 From 97c7d5723537de08e076892e07d6089ae9777965 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:15 +0100 Subject: x86/cpuid: Include in uses static_assert() at multiple locations but it does not include the CPP macro's definition at linux/build_bug.h. Include the needed header to make self-sufficient. This gets triggered when cpuid.h is included in new C files, which is to be done in further commits. Fixes: 43d86e3cd9a7 ("x86/cpu: Provide cpuid_read() et al.") Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-5-darwi@linutronix.de --- arch/x86/include/asm/cpuid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h index b2b9b4ef3dae..a92e4b08820a 100644 --- a/arch/x86/include/asm/cpuid.h +++ b/arch/x86/include/asm/cpuid.h @@ -6,6 +6,7 @@ #ifndef _ASM_X86_CPUID_H #define _ASM_X86_CPUID_H +#include #include #include -- cgit v1.2.3 From dec7fdc0b79c2ae0a537343b17f5ba1c6c47e1ca Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:16 +0100 Subject: x86/cpu: Remove unnecessary headers and reorder the rest Remove the headers at intel.c that are no longer required. Alphabetically reorder what remains since more headers will be included in further commits. Suggested-by: Thomas Gleixner Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-6-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c5d833f5bffb..60b58b1a0c69 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1,40 +1,30 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include #include -#include -#include -#include -#include #include -#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +#include +#endif -#include -#include #include +#include +#include #include +#include #include #include -#include -#include -#include -#include +#include #include +#include #include - -#ifdef CONFIG_X86_64 -#include -#endif +#include #include "cpu.h" -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#endif - /* * Processors which have self-snooping capability can handle conflicting * memory type across CPUs by snooping its own cache. However, there exists -- cgit v1.2.3 From cb5f4c76b2a9314c35e00c67c98ccd03542c2634 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:17 +0100 Subject: x86/cpu: Use max() for CPUID leaf 0x2 TLB descriptors parsing The conditional statement "if (x < y) { x = y; }" appears 22 times at the Intel leaf 0x2 descriptors parsing logic. Replace each of such instances with a max() expression to simplify the code. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-7-darwi@linutronix.de --- arch/x86/kernel/cpu/intel.c | 76 +++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 60b58b1a0c69..42a57b85f93b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -700,7 +701,9 @@ static const struct _tlb_table intel_tlb_table[] = { static void intel_tlb_lookup(const unsigned char desc) { + unsigned int entries; unsigned char k; + if (desc == 0) return; @@ -712,81 +715,58 @@ static void intel_tlb_lookup(const unsigned char desc) if (intel_tlb_table[k].tlb_type == 0) return; + entries = intel_tlb_table[k].entries; switch (intel_tlb_table[k].tlb_type) { case STLB_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); break; case STLB_4K_2M: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lld_4k[ENTRIES] = 
max(tlb_lld_4k[ENTRIES], entries); + tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); + tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_INST_ALL: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); break; case TLB_INST_4K: - if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); break; case TLB_INST_4M: - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); break; case TLB_INST_2M_4M: - if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); + tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_DATA_4K_4M: - if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; - if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); break; case TLB_DATA_1G_2M_4M: - if (tlb_lld_2m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) - tlb_lld_2m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; - if (tlb_lld_4m[ENTRIES] < TLB_0x63_2M_4M_ENTRIES) - tlb_lld_4m[ENTRIES] = TLB_0x63_2M_4M_ENTRIES; + tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); + tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); fallthrough; case TLB_DATA_1G: - if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) - tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; + tlb_lld_1g[ENTRIES] = max(tlb_lld_1g[ENTRIES], entries); break; } } -- cgit v1.2.3 From 8b7e54b542103753619a37cbb3216849a934872f Mon Sep 17 00:00:00 2001 From: "Ahmed S. 
Darwish" Date: Tue, 4 Mar 2025 09:51:18 +0100 Subject: x86/cpu: Simplify TLB entry count storage Commit: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") introduced u16 "info" arrays for each TLB type. Since 2012 and each array stores just one type of information: the number of TLB entries for its respective TLB type. Replace such arrays with simple variables. Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-8-darwi@linutronix.de --- arch/x86/include/asm/processor.h | 19 ++++++---------- arch/x86/kernel/cpu/amd.c | 18 +++++++-------- arch/x86/kernel/cpu/common.c | 20 ++++++++--------- arch/x86/kernel/cpu/hygon.c | 16 +++++++------- arch/x86/kernel/cpu/intel.c | 48 ++++++++++++++++++++-------------------- 5 files changed, 57 insertions(+), 64 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c0cd10182e90..0ea227fa027c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -60,18 +60,13 @@ struct vm86; # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif -enum tlb_infos { - ENTRIES, - NR_INFO -}; - -extern u16 __read_mostly tlb_lli_4k[NR_INFO]; -extern u16 __read_mostly tlb_lli_2m[NR_INFO]; -extern u16 __read_mostly tlb_lli_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4k[NR_INFO]; -extern u16 __read_mostly tlb_lld_2m[NR_INFO]; -extern u16 __read_mostly tlb_lld_4m[NR_INFO]; -extern u16 __read_mostly tlb_lld_1g[NR_INFO]; +extern u16 __read_mostly tlb_lli_4k; +extern u16 __read_mostly tlb_lli_2m; +extern u16 __read_mostly tlb_lli_4m; +extern u16 __read_mostly tlb_lld_4k; +extern u16 __read_mostly tlb_lld_2m; +extern u16 __read_mostly tlb_lld_4m; +extern u16 __read_mostly tlb_lld_1g; /* * CPU type and hardware bug flags. Kept separately for each CPU. 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d747515ad013..315766440201 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1105,8 +1105,8 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB @@ -1119,26 +1119,26 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { /* Erratum 658 */ if (c->x86 == 0x15 && c->x86_model <= 0x1f) { - tlb_lli_2m[ENTRIES] = 1024; + tlb_lli_2m = 1024; } else { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev amd_cpu_dev = { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8eba9ca9c216..3a1a957e0c60 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -846,13 +846,13 @@ void cpu_detect_cache_sizes(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; } -u16 __read_mostly tlb_lli_4k[NR_INFO]; -u16 __read_mostly tlb_lli_2m[NR_INFO]; -u16 __read_mostly tlb_lli_4m[NR_INFO]; -u16 __read_mostly tlb_lld_4k[NR_INFO]; -u16 __read_mostly tlb_lld_2m[NR_INFO]; -u16 __read_mostly tlb_lld_4m[NR_INFO]; -u16 __read_mostly tlb_lld_1g[NR_INFO]; +u16 __read_mostly tlb_lli_4k; +u16 __read_mostly tlb_lli_2m; +u16 __read_mostly tlb_lli_4m; +u16 __read_mostly tlb_lld_4k; +u16 __read_mostly tlb_lld_2m; +u16 __read_mostly tlb_lld_4m; +u16 __read_mostly tlb_lld_1g; static void cpu_detect_tlb(struct cpuinfo_x86 *c) { @@ -860,12 +860,10 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c) this_cpu->c_detect_tlb(c); pr_info("Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n", - tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], - tlb_lli_4m[ENTRIES]); + tlb_lli_4k, tlb_lli_2m, tlb_lli_4m); pr_info("Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", - tlb_lld_4k[ENTRIES], tlb_lld_2m[ENTRIES], - tlb_lld_4m[ENTRIES], tlb_lld_1g[ENTRIES]); + tlb_lld_4k, tlb_lld_2m, tlb_lld_4m, tlb_lld_1g); } void get_cpu_vendor(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index c5191b06f9f2..6af4a4a90a52 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -240,26 +240,26 @@ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); - tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; - tlb_lli_4k[ENTRIES] = ebx & mask; + tlb_lld_4k = (ebx >> 16) & mask; + tlb_lli_4k = ebx & mask; /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!((eax >> 16) & mask)) - tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; + tlb_lld_2m = (cpuid_eax(0x80000005) >> 16) & 0xff; else - tlb_lld_2m[ENTRIES] = 
(eax >> 16) & mask; + tlb_lld_2m = (eax >> 16) & mask; /* a 4M entry uses two 2M entries */ - tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + tlb_lld_4m = tlb_lld_2m >> 1; /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ if (!(eax & mask)) { cpuid(0x80000005, &eax, &ebx, &ecx, &edx); - tlb_lli_2m[ENTRIES] = eax & 0xff; + tlb_lli_2m = eax & 0xff; } else - tlb_lli_2m[ENTRIES] = eax & mask; + tlb_lli_2m = eax & mask; - tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + tlb_lli_4m = tlb_lli_2m >> 1; } static const struct cpu_dev hygon_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 42a57b85f93b..61d3fd31baee 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -718,55 +718,55 @@ static void intel_tlb_lookup(const unsigned char desc) entries = intel_tlb_table[k].entries; switch (intel_tlb_table[k].tlb_type) { case STLB_4K: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case STLB_4K_2M: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); - tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); - tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_INST_ALL: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); - tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_4K: - tlb_lli_4k[ENTRIES] = max(tlb_lli_4k[ENTRIES], entries); + tlb_lli_4k = max(tlb_lli_4k, entries); break; case TLB_INST_4M: - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_INST_2M_4M: - tlb_lli_2m[ENTRIES] = max(tlb_lli_2m[ENTRIES], entries); - tlb_lli_4m[ENTRIES] = max(tlb_lli_4m[ENTRIES], entries); + tlb_lli_2m = max(tlb_lli_2m, entries); + tlb_lli_4m = max(tlb_lli_4m, entries); break; case TLB_DATA_4K: case TLB_DATA0_4K: - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); + tlb_lld_4k = max(tlb_lld_4k, entries); break; case TLB_DATA_4M: case TLB_DATA0_4M: - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_2M_4M: case TLB_DATA0_2M_4M: - tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], entries); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lld_2m = max(tlb_lld_2m, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_4K_4M: - tlb_lld_4k[ENTRIES] = max(tlb_lld_4k[ENTRIES], entries); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], entries); + tlb_lld_4k = max(tlb_lld_4k, entries); + tlb_lld_4m = max(tlb_lld_4m, entries); break; case TLB_DATA_1G_2M_4M: - tlb_lld_2m[ENTRIES] = max(tlb_lld_2m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); - tlb_lld_4m[ENTRIES] = max(tlb_lld_4m[ENTRIES], TLB_0x63_2M_4M_ENTRIES); + tlb_lld_2m = max(tlb_lld_2m, TLB_0x63_2M_4M_ENTRIES); + 
tlb_lld_4m = max(tlb_lld_4m, TLB_0x63_2M_4M_ENTRIES); fallthrough; case TLB_DATA_1G: - tlb_lld_1g[ENTRIES] = max(tlb_lld_1g[ENTRIES], entries); + tlb_lld_1g = max(tlb_lld_1g, entries); break; } } -- cgit v1.2.3 From 535d9a82702ee75b0da6e4547f367beeeef184a3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Mar 2025 09:51:19 +0100 Subject: x86/cpu: Get rid of the smp_store_cpu_info() indirection smp_store_cpu_info() is just a wrapper around identify_secondary_cpu() without further value. Move the extra bits from smp_store_cpu_info() into identify_secondary_cpu() and remove the wrapper. [ darwi: Make it compile and fix up the xen/smp_pv.c instance ] Signed-off-by: Thomas Gleixner Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-9-darwi@linutronix.de --- arch/x86/include/asm/processor.h | 2 +- arch/x86/include/asm/smp.h | 2 -- arch/x86/kernel/cpu/common.c | 11 +++++++++-- arch/x86/kernel/smpboot.c | 24 ++---------------------- arch/x86/xen/smp_pv.c | 2 +- 5 files changed, 13 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0ea227fa027c..d5d9a071cddc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -229,7 +229,7 @@ static inline unsigned long long l1tf_pfn_limit(void) void init_cpu_devs(void); void get_cpu_vendor(struct cpuinfo_x86 *c); extern void early_cpu_init(void); -extern void identify_secondary_cpu(struct cpuinfo_x86 *); +extern void identify_secondary_cpu(unsigned int cpu); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 1d3b11eba084..128e06a18e51 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -120,8 +120,6 @@ void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); -void smp_store_cpu_info(int id); - asmlinkage __visible void smp_reboot_interrupt(void); __visible void smp_reschedule_interrupt(struct pt_regs *regs); __visible void smp_call_function_interrupt(struct pt_regs *regs); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3a1a957e0c60..5f81c553e733 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1997,9 +1997,15 @@ static __init void identify_boot_cpu(void) lkgs_init(); } -void identify_secondary_cpu(struct cpuinfo_x86 *c) +void identify_secondary_cpu(unsigned int cpu) { - BUG_ON(c == &boot_cpu_data); + struct cpuinfo_x86 *c = &cpu_data(cpu); + + /* Copy boot_cpu_data only on the first bringup */ + if (!c->initialized) + *c = boot_cpu_data; + c->cpu_index = cpu; + identify_cpu(c); #ifdef CONFIG_X86_32 enable_sep_cpu(); @@ -2010,6 +2016,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) update_gds_msr(); tsx_ap_init(); + c->initialized = true; } void print_cpu_info(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 5746084bafe4..8ecf1bf57103 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -190,7 +190,7 @@ static void ap_starting(void) apic_ap_setup(); /* Save the processor parameters. 
*/ - smp_store_cpu_info(cpuid); + identify_secondary_cpu(cpuid); /* * The topology information must be up to date before @@ -215,7 +215,7 @@ static void ap_calibrate_delay(void) { /* * Calibrate the delay loop and update loops_per_jiffy in cpu_data. - * smp_store_cpu_info() stored a value that is close but not as + * identify_secondary_cpu() stored a value that is close but not as * accurate as the value just calculated. * * As this is invoked after the TSC synchronization check, @@ -315,26 +315,6 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -/* - * The bootstrap kernel entry code has set these up. Save them for - * a given CPU - */ -void smp_store_cpu_info(int id) -{ - struct cpuinfo_x86 *c = &cpu_data(id); - - /* Copy boot_cpu_data only on the first bringup */ - if (!c->initialized) - *c = boot_cpu_data; - c->cpu_index = id; - /* - * During boot time, CPU0 has this setup already. Save the info when - * bringing up an AP. - */ - identify_secondary_cpu(c); - c->initialized = true; -} - static bool topology_same_node(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6863d3da7dec..688ff59318ae 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -70,7 +70,7 @@ static void cpu_bringup(void) xen_enable_syscall(); } cpu = smp_processor_id(); - smp_store_cpu_info(cpu); + identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); -- cgit v1.2.3 From 1f61dfdf16cd3bab383741c2eb43e7f69e9f592f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Mar 2025 09:51:20 +0100 Subject: x86/cpu: Remove unused TLB strings Commit: e0ba94f14f74 ("x86/tlb_info: get last level TLB entry number of CPU") added the TLB table for parsing CPUID(0x4), including strings describing them. The string entry in the table was never used. Convert them to comments. Signed-off-by: Thomas Gleixner Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-10-darwi@linutronix.de --- arch/x86/kernel/cpu/cpu.h | 8 ----- arch/x86/kernel/cpu/intel.c | 80 ++++++++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1beccefbaff9..51deb60a9d26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,14 +33,6 @@ struct cpu_dev { #endif }; -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; - /* unsigned int ways; */ - char info[128]; -}; - #define cpu_dev_register(cpu_devX) \ static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ __section(".x86_cpu_dev.init") = \ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 61d3fd31baee..291c82816797 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -658,44 +658,50 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) */ #define TLB_0x63_2M_4M_ENTRIES 32 +struct _tlb_table { + unsigned char descriptor; + char tlb_type; + unsigned int entries; +}; + static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" }, - { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" }, - { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, - { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, - { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, - { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" }, - { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" }, - { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" }, - { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" }, - { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, - { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, - { 0x63, TLB_DATA_1G_2M_4M, 4, " TLB_DATA 1 GByte pages, 4-way set associative" - " (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here)" }, - { 0x6b, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 8-way associative" }, - { 0x6c, TLB_DATA_2M_4M, 128, " TLB_DATA 2 MByte or 4 MByte pages, 8-way associative" }, - { 0x6d, TLB_DATA_1G, 16, " TLB_DATA 1 GByte pages, fully associative" }, - { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, - { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, - { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, - { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, - { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" 
}, - { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" }, - { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, - { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, - { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, - { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, + { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ + { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ + { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ + { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ + { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ + { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ + { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ + { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ + { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ + { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ + { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ + { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ + { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ + { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ + { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ + { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ + { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + { 0xca, STLB_4K, 
512}, /* STLB 4 KByte pages, 4-way associative */ { 0x00, 0, 0 } }; -- cgit v1.2.3 From b3a756bd72ec8d1ba43334b17115e0ece1144a88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 4 Mar 2025 09:51:22 +0100 Subject: x86/cacheinfo: Remove the P4 trace leftovers for real Commit 851026a2bf54 ("x86/cacheinfo: Remove unused trace variable") removed the switch case for LVL_TRACE but did not get rid of the surrounding gunk. Signed-off-by: Thomas Gleixner Signed-off-by: Ahmed S. Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-12-darwi@linutronix.de --- arch/x86/kernel/cpu/cacheinfo.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index a6c6bccfa8b8..eccffe2ea06c 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -31,7 +31,6 @@ #define LVL_1_DATA 2 #define LVL_2 3 #define LVL_3 4 -#define LVL_TRACE 5 /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -96,10 +95,6 @@ static const struct _cache_table cache_table[] = { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x70, LVL_TRACE, 12 }, /* 8-way set assoc */ - { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ - { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ - { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -787,19 +782,13 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) } } } - /* - * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for - * trace cache - */ - if ((!ci->num_leaves || c->x86 == 15) && c->cpuid_level > 1) { + + /* Don't use CPUID(2) if CPUID(4) is supported. */ + if (!ci->num_leaves && c->cpuid_level > 1) { /* supports eax=2 call */ int j, n; unsigned int regs[4]; unsigned char *dp = (unsigned char *)regs; - int only_trace = 0; - - if (ci->num_leaves && c->x86 == 15) - only_trace = 1; /* Number of times to iterate */ n = cpuid_eax(2) & 0xFF; @@ -820,8 +809,6 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) /* look up this descriptor in the table */ while (cache_table[k].descriptor != 0) { if (cache_table[k].descriptor == des) { - if (only_trace && cache_table[k].cache_type != LVL_TRACE) - break; switch (cache_table[k].cache_type) { case LVL_1_INST: l1i += cache_table[k].size; -- cgit v1.2.3 From 6309ff98f00bad118812f7f250fbbee4867e88d3 Mon Sep 17 00:00:00 2001 From: "Ahmed S. Darwish" Date: Tue, 4 Mar 2025 09:51:23 +0100 Subject: x86/cacheinfo: Remove unnecessary headers and reorder the rest Remove the headers at cacheinfo.c that are no longer required. Alphabetically reorder what remains since more headers will be included in further commits. Signed-off-by: Ahmed S. 
Darwish Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250304085152.51092-13-darwi@linutronix.de --- arch/x86/kernel/cpu/cacheinfo.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index eccffe2ea06c..b3a520959b51 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -8,21 +8,19 @@ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ -#include #include +#include #include #include -#include -#include -#include #include #include +#include -#include -#include #include -#include +#include +#include #include +#include #include #include "cpu.h" -- cgit v1.2.3
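Taken together, the leaf 0x2 changes above all revolve around the same walk over four output registers. As a rough, self-contained user-space sketch of that flow (not kernel code; it assumes an x86 build and GCC/Clang's <cpuid.h>), the validated descriptor walk looks roughly like this:

  #include <cpuid.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t regs[4];
          const uint8_t *desc = (const uint8_t *)regs;
          int i;

          /* EAX=2: one-byte cache/TLB descriptors in EAX, EBX, ECX, EDX. */
          __cpuid(2, regs[0], regs[1], regs[2], regs[3]);

          /* A set bit 31 means that register holds no valid descriptors. */
          for (i = 0; i < 4; i++)
                  if (regs[i] & (1u << 31))
                          regs[i] = 0;

          /* Byte 0 of EAX is the iteration count, not a descriptor. */
          for (i = 1; i < 16; i++)
                  if (desc[i])
                          printf("descriptor 0x%02x\n", desc[i]);

          return 0;
  }

A real parser would then map each non-zero descriptor through a lookup table, as intel_tlb_table[] and cache_table[] do in the patches above.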