From 89079520cef65d6da1e864eab4464effe5396e23 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 11 Apr 2025 10:46:00 +0800 Subject: RISC-V: vDSO: Wire up getrandom() vDSO implementation Hook up the generic vDSO implementation to the generic vDSO getrandom implementation by providing the required __arch_chacha20_blocks_nostack and getrandom_syscall implementations. Also wire up the selftests. The benchmark result: vdso: 25000000 times in 2.466341333 seconds libc: 25000000 times in 41.447720005 seconds syscall: 25000000 times in 41.043926672 seconds vdso: 25000000 x 256 times in 162.286219353 seconds libc: 25000000 x 256 times in 2953.855018685 seconds syscall: 25000000 x 256 times in 2796.268546000 seconds Signed-off-by: Xi Ruoyao Link: https://lore.kernel.org/r/20250411024600.16045-1-xry111@xry111.site Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/vdso/getrandom.h | 30 ++++ arch/riscv/kernel/vdso/Makefile | 12 ++ arch/riscv/kernel/vdso/getrandom.c | 10 ++ arch/riscv/kernel/vdso/vdso.lds.S | 1 + arch/riscv/kernel/vdso/vgetrandom-chacha.S | 244 +++++++++++++++++++++++++++++ 6 files changed, 298 insertions(+) create mode 100644 arch/riscv/include/asm/vdso/getrandom.h create mode 100644 arch/riscv/kernel/vdso/getrandom.c create mode 100644 arch/riscv/kernel/vdso/vgetrandom-chacha.S (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bbec87b79309..fbca724302ab 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -218,6 +218,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..8dc92441702a --- /dev/null +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include + +static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags) +{ + register long ret asm("a0"); + register long nr asm("a7") = __NR_getrandom; + register void *buffer asm("a0") = _buffer; + register size_t len asm("a1") = _len; + register unsigned int flags asm("a2") = _flags; + + asm volatile ("ecall\n" + : "+r" (ret) + : "r" (nr), "r" (buffer), "r" (len), "r" (flags) + : "memory"); + + return ret; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index ad73607abc28..7575ef088adc 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -13,9 +13,17 @@ vdso-syms += flush_icache vdso-syms += hwprobe vdso-syms += sys_hwprobe +ifdef CONFIG_VDSO_GETRANDOM +vdso-syms += getrandom +endif + # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o +ifdef CONFIG_VDSO_GETRANDOM +obj-vdso += vgetrandom-chacha.o +endif + ccflags-y := -fno-stack-protector ccflags-y += -DDISABLE_BRANCH_PROFILING ccflags-y += -fno-builtin @@ -24,6 +32,10 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif +ifneq ($(c-getrandom-y),) + CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y) +endif + CFLAGS_hwprobe.o += -fPIC # Build rules diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c new file mode 100644 index 000000000000..f21922e8cebd --- /dev/null +++ b/arch/riscv/kernel/vdso/getrandom.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. + */ +#include + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 8e86965a8aae..abc69cda0445 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -80,6 +80,7 @@ VERSION #ifndef COMPAT_VDSO __vdso_riscv_hwprobe; #endif + __vdso_getrandom; local: *; }; } diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..d793cadc78a6 --- /dev/null +++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. + * + * Based on arch/loongarch/vdso/vgetrandom-chacha.S. + */ + +#include +#include + +.text + +.macro ROTRI rd rs imm + slliw t0, \rs, 32 - \imm + srliw \rd, \rs, \imm + or \rd, \rd, t0 +.endm + +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s1 + \op \d2, \d2, \s2 + \op \d3, \d3, \s3 +.endm + +/* + * a0: output bytes + * a1: 32-byte key input + * a2: 8-byte counter input/output + * a3: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +#define output a0 +#define key a1 +#define counter a2 +#define nblocks a3 +#define i a4 +#define state0 s0 +#define state1 s1 +#define state2 s2 +#define state3 s3 +#define state4 s4 +#define state5 s5 +#define state6 s6 +#define state7 s7 +#define state8 s8 +#define state9 s9 +#define state10 s10 +#define state11 s11 +#define state12 a5 +#define state13 a6 +#define state14 a7 +#define state15 t1 +#define cnt t2 +#define copy0 t3 +#define copy1 t4 +#define copy2 t5 +#define copy3 t6 + +/* Packs to be used with OP_4REG */ +#define line0 state0, state1, state2, state3 +#define line1 state4, state5, state6, state7 +#define line2 state8, state9, state10, state11 +#define line3 state12, state13, state14, state15 + +#define line1_perm state5, state6, state7, state4 +#define line2_perm state10, state11, state8, state9 +#define line3_perm state15, state12, state13, state14 + +#define copy copy0, copy1, copy2, copy3 + +#define _16 16, 16, 16, 16 +#define _20 20, 20, 20, 20 +#define _24 24, 24, 24, 24 +#define _25 25, 25, 25, 25 + + addi sp, sp, -12*SZREG + REG_S s0, (sp) + REG_S s1, SZREG(sp) + REG_S s2, 2*SZREG(sp) + REG_S s3, 3*SZREG(sp) + REG_S s4, 4*SZREG(sp) + REG_S s5, 5*SZREG(sp) + REG_S s6, 6*SZREG(sp) + REG_S s7, 7*SZREG(sp) + REG_S s8, 8*SZREG(sp) + REG_S s9, 9*SZREG(sp) + REG_S s10, 10*SZREG(sp) + REG_S s11, 11*SZREG(sp) + + ld cnt, (counter) + + li copy0, 0x61707865 + li copy1, 0x3320646e + li copy2, 0x79622d32 + li copy3, 0x6b206574 + +.Lblock: + /* state[0,1,2,3] = "expand 32-byte k" */ + mv state0, copy0 + mv state1, copy1 + mv state2, copy2 + mv state3, copy3 + + /* state[4,5,..,11] = key */ + lw state4, (key) + lw state5, 4(key) + lw state6, 8(key) + lw state7, 12(key) + lw state8, 16(key) + lw state9, 20(key) + lw state10, 24(key) + lw state11, 28(key) + + /* state[12,13] = counter */ + mv state12, cnt + srli state13, cnt, 32 + + /* state[14,15] = 0 */ + mv state14, zero + mv state15, zero + + li i, 10 +.Lpermute: + /* odd round */ + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _16 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _20 + + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _24 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _25 + + /* even round */ + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _16 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _20 + + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _24 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _25 + + addi i, i, -1 + bnez i, .Lpermute + + /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ + OP_4REG addw line0, copy + sw state0, (output) + sw state1, 4(output) + sw state2, 8(output) + sw state3, 12(output) + + /* from now on state[0,1,2,3] are scratch registers */ + + /* state[0,1,2,3] = lo(key) */ + lw state0, (key) + lw state1, 4(key) + lw state2, 8(key) + lw state3, 12(key) + + /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ + OP_4REG addw line1, line0 + sw state4, 16(output) + sw state5, 20(output) + sw state6, 24(output) + sw state7, 28(output) + + /* state[0,1,2,3] = hi(key) */ + lw state0, 16(key) + lw state1, 20(key) + lw state2, 24(key) + lw state3, 28(key) + + /* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */ + OP_4REG addw line2, line0 + sw state8, 32(output) + sw state9, 36(output) + sw state10, 40(output) + sw state11, 44(output) + + /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ + addw state12, state12, cnt + srli state0, cnt, 32 + addw state13, state13, state0 + sw state12, 48(output) + sw state13, 52(output) + sw state14, 56(output) + sw state15, 60(output) + + /* ++counter */ + addi cnt, cnt, 1 + + /* output += 64 */ + addi output, output, 64 + /* --nblocks */ + addi nblocks, nblocks, -1 + bnez nblocks, .Lblock + + /* counter = [cnt_lo, cnt_hi] */ + sd cnt, (counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these + * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and + * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we + * only need to zero state[12,...,15]. + */ + mv state12, zero + mv state13, zero + mv state14, zero + mv state15, zero + + REG_L s0, (sp) + REG_L s1, SZREG(sp) + REG_L s2, 2*SZREG(sp) + REG_L s3, 3*SZREG(sp) + REG_L s4, 4*SZREG(sp) + REG_L s5, 5*SZREG(sp) + REG_L s6, 6*SZREG(sp) + REG_L s7, 7*SZREG(sp) + REG_L s8, 8*SZREG(sp) + REG_L s9, 9*SZREG(sp) + REG_L s10, 10*SZREG(sp) + REG_L s11, 11*SZREG(sp) + addi sp, sp, 12*SZREG + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) -- cgit v1.2.3 From bafa451a96d0f1404aa1a5a267f78767a55fac71 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 15 Apr 2025 21:58:31 +0930 Subject: riscv: defconfig: Remove EXPERT The setting of EXPERT is a leftover from when the riscv defconfig was first added. As mentioned in the EXPERT Kconfig help text it is not intended to be set in the usual case. Upon removal a bunch of intrusive debug-related kernel options are no longer set, which is good. A few may want to come back in the future but let those be advocated for on a case by case basis. NAMESPACES, SYSFS_SYSCALL and MEDIA_SUPPORT_FILTER default on and thus fall out of the defconfig. Set VIDEO_CADENCE_CSI2RX=y to ensure VIDEO_CADENCE_CSI2RX stays enabled. Set DEBUG_KERNEL=y in line with other arch defconfigs. This turns on tracing. Signed-off-by: Joel Stanley Link: https://lore.kernel.org/r/20250415122832.982610-1-joel@jms.id.au Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 3c8e16d71e17..286f490ead37 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -18,12 +18,9 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_BPF=y -CONFIG_NAMESPACES=y CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_ARCH_MICROCHIP=y CONFIG_ARCH_SIFIVE=y @@ -181,6 +178,7 @@ CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_REGULATOR_AXP20X=y CONFIG_REGULATOR_GPIO=y CONFIG_MEDIA_SUPPORT=m +CONFIG_MEDIA_PLATFORM_SUPPORT=y CONFIG_VIDEO_CADENCE_CSI2RX=m CONFIG_DRM=m CONFIG_DRM_RADEON=m @@ -294,25 +292,7 @@ CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_CRYPTO_USER_API_HASH=y CONFIG_CRYPTO_DEV_VIRTIO=y CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_PAGEALLOC=y -CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_DEBUG_VM=y -CONFIG_DEBUG_VM_PGFLAGS=y -CONFIG_DEBUG_MEMORY_INIT=y -CONFIG_DEBUG_PER_CPU_MAPS=y -CONFIG_SOFTLOCKUP_DETECTOR=y -CONFIG_WQ_WATCHDOG=y -CONFIG_DEBUG_RT_MUTEXES=y -CONFIG_DEBUG_SPINLOCK=y -CONFIG_DEBUG_MUTEXES=y -CONFIG_DEBUG_RWSEMS=y -CONFIG_DEBUG_ATOMIC_SLEEP=y -CONFIG_DEBUG_LIST=y -CONFIG_DEBUG_PLIST=y -CONFIG_DEBUG_SG=y -# CONFIG_RCU_TRACE is not set -CONFIG_RCU_EQS_DEBUG=y -# CONFIG_FTRACE is not set # CONFIG_RUNTIME_TESTING_MENU is not set CONFIG_MEMTEST=y -- cgit v1.2.3 From 96e0b355883006554a0bee3697da475971d6bba8 Mon Sep 17 00:00:00 2001 From: Ross Stutterheim Date: Wed, 16 Apr 2025 14:50:06 +0100 Subject: ARM: 9447/1: arm/memremap: fix arch_memremap_can_ram_remap() arm/memremap: fix arch_memremap_can_ram_remap() commit 260364d112bc ("arm[64]/memremap: don't abuse pfn_valid() to ensure presence of linear map") added the definition of arch_memremap_can_ram_remap() for arm[64] specific filtering of what pages can be used from the linear mapping. memblock_is_map_memory() was called with the pfn of the address given to arch_memremap_can_ram_remap(); however, memblock_is_map_memory() expects to be given an address for arm, not a pfn. This results in calls to memremap() returning a newly mapped area when it should return an address in the existing linear mapping. Fix this by removing the address to pfn translation and pass the address directly. Fixes: 260364d112bc ("arm[64]/memremap: don't abuse pfn_valid() to ensure presence of linear map") Signed-off-by: Ross Stutterheim Cc: Mike Rapoport Cc: stable@vger.kernel.org Reviewed-by: Catalin Marinas Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/mm/ioremap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index 748698e91a4b..27e64f782cb3 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -515,7 +515,5 @@ void __init early_ioremap_init(void) bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, unsigned long flags) { - unsigned long pfn = PHYS_PFN(offset); - - return memblock_is_map_memory(pfn); + return memblock_is_map_memory(offset); } -- cgit v1.2.3 From a2c6c1c23bed3506fd87e00c88540ac47832c867 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 7 Apr 2025 10:03:21 +0300 Subject: x86/PCI: Drop 'pci' suffix from intel_mid_pci.c CE4100 PCI specific code has no 'pci' suffix in the filename, intel_mid_pci.c is the only one that duplicates the folder name in its filename, drop that redundancy. While at it, group the respective modules in the Makefile. Signed-off-by: Andy Shevchenko Signed-off-by: Bjorn Helgaas Acked-by: Ingo Molnar Link: https://patch.msgid.link/20250407070321.3761063-1-andriy.shevchenko@linux.intel.com --- arch/x86/pci/Makefile | 6 +- arch/x86/pci/intel_mid.c | 406 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/pci/intel_mid_pci.c | 406 ------------------------------------------- 3 files changed, 409 insertions(+), 409 deletions(-) create mode 100644 arch/x86/pci/intel_mid.c delete mode 100644 arch/x86/pci/intel_mid_pci.c (limited to 'arch') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 4933fb337983..c1efd5b0d198 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -8,13 +8,13 @@ obj-$(CONFIG_PCI_OLPC) += olpc.o obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o -obj-$(CONFIG_X86_INTEL_CE) += ce4100.o obj-$(CONFIG_ACPI) += acpi.o obj-y += legacy.o irq.o -obj-$(CONFIG_X86_NUMACHIP) += numachip.o +obj-$(CONFIG_X86_INTEL_CE) += ce4100.o +obj-$(CONFIG_X86_INTEL_MID) += intel_mid.o -obj-$(CONFIG_X86_INTEL_MID) += intel_mid_pci.o +obj-$(CONFIG_X86_NUMACHIP) += numachip.o obj-y += common.o early.o obj-y += bus_numa.o diff --git a/arch/x86/pci/intel_mid.c b/arch/x86/pci/intel_mid.c new file mode 100644 index 000000000000..b433b1753016 --- /dev/null +++ b/arch/x86/pci/intel_mid.c @@ -0,0 +1,406 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel MID PCI support + * Copyright (c) 2008 Intel Corporation + * Jesse Barnes + * + * Moorestown has an interesting PCI implementation: + * - configuration space is memory mapped (as defined by MCFG) + * - Lincroft devices also have a real, type 1 configuration space + * - Early Lincroft silicon has a type 1 access bug that will cause + * a hang if non-existent devices are accessed + * - some devices have the "fixed BAR" capability, which means + * they can't be relocated or modified; check for that during + * BAR sizing + * + * So, we use the MCFG space for all reads and writes, but also send + * Lincroft writes to type 1 space. But only read/write if the device + * actually exists, otherwise return all 1s for reads and bit bucket + * the writes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define PCIE_CAP_OFFSET 0x100 + +/* Quirks for the listed devices */ +#define PCI_DEVICE_ID_INTEL_MRFLD_MMC 0x1190 +#define PCI_DEVICE_ID_INTEL_MRFLD_HSU 0x1191 + +/* Fixed BAR fields */ +#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ +#define PCI_FIXED_BAR_0_SIZE 0x04 +#define PCI_FIXED_BAR_1_SIZE 0x08 +#define PCI_FIXED_BAR_2_SIZE 0x0c +#define PCI_FIXED_BAR_3_SIZE 0x10 +#define PCI_FIXED_BAR_4_SIZE 0x14 +#define PCI_FIXED_BAR_5_SIZE 0x1c + +static int pci_soc_mode; + +/** + * fixed_bar_cap - return the offset of the fixed BAR cap if found + * @bus: PCI bus + * @devfn: device in question + * + * Look for the fixed BAR cap on @bus and @devfn, returning its offset + * if found or 0 otherwise. + */ +static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) +{ + int pos; + u32 pcie_cap = 0, cap_data; + + pos = PCIE_CAP_OFFSET; + + if (!raw_pci_ext_ops) + return 0; + + while (pos) { + if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos, 4, &pcie_cap)) + return 0; + + if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 || + PCI_EXT_CAP_ID(pcie_cap) == 0xffff) + break; + + if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { + raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, pos + 4, 4, &cap_data); + if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR) + return pos; + } + + pos = PCI_EXT_CAP_NEXT(pcie_cap); + } + + return 0; +} + +static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, + int reg, int len, u32 val, int offset) +{ + u32 size; + unsigned int domain, busnum; + int bar = (reg - PCI_BASE_ADDRESS_0) >> 2; + + domain = pci_domain_nr(bus); + busnum = bus->number; + + if (val == ~0 && len == 4) { + unsigned long decode; + + raw_pci_ext_ops->read(domain, busnum, devfn, + offset + 8 + (bar * 4), 4, &size); + + /* Turn the size into a decode pattern for the sizing code */ + if (size) { + decode = size - 1; + decode |= decode >> 1; + decode |= decode >> 2; + decode |= decode >> 4; + decode |= decode >> 8; + decode |= decode >> 16; + decode++; + decode = ~(decode - 1); + } else { + decode = 0; + } + + /* + * If val is all ones, the core code is trying to size the reg, + * so update the mmconfig space with the real size. + * + * Note: this assumes the fixed size we got is a power of two. + */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4, + decode); + } + + /* This is some other kind of BAR write, so just do it. */ + return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val); +} + +/** + * type1_access_ok - check whether to use type 1 + * @bus: bus number + * @devfn: device & function in question + * @reg: configuration register offset + * + * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at + * all, the we can go ahead with any reads & writes. If it's on a Lincroft, + * but doesn't exist, avoid the access altogether to keep the chip from + * hanging. + */ +static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) +{ + /* + * This is a workaround for A0 LNC bug where PCI status register does + * not have new CAP bit set. can not be written by SW either. + * + * PCI header type in real LNC indicates a single function device, this + * will prevent probing other devices under the same function in PCI + * shim. Therefore, use the header type in shim instead. + */ + if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) + return false; + if (bus == 0 && (devfn == PCI_DEVFN(2, 0) + || devfn == PCI_DEVFN(0, 0) + || devfn == PCI_DEVFN(3, 0))) + return true; + return false; /* Langwell on others */ +} + +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 *value) +{ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, + devfn, where, size, value); +} + +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, + int size, u32 value) +{ + int offset; + + /* + * On MRST, there is no PCI ROM BAR, this will cause a subsequent read + * to ROM BAR return 0 then being ignored. + */ + if (where == PCI_ROM_ADDRESS) + return 0; + + /* + * Devices with fixed BARs need special handling: + * - BAR sizing code will save, write ~0, read size, restore + * - so writes to fixed BARs need special handling + * - other writes to fixed BAR devices should go through mmconfig + */ + offset = fixed_bar_cap(bus, devfn); + if (offset && + (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) { + return pci_device_update_fixed(bus, devfn, where, size, value, + offset); + } + + /* + * On Moorestown update both real & mmconfig space + * Note: early Lincroft silicon can't handle type 1 accesses to + * non-existent devices, so just eat the write in that case. + */ + if (type1_access_ok(bus->number, devfn, where)) + return pci_direct_conf1.write(pci_domain_nr(bus), bus->number, + devfn, where, size, value); + return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn, + where, size, value); +} + +static const struct x86_cpu_id intel_mid_cpu_ids[] = { + X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, NULL), + {} +}; + +static int intel_mid_pci_irq_enable(struct pci_dev *dev) +{ + const struct x86_cpu_id *id; + struct irq_alloc_info info; + bool polarity_low; + u16 model = 0; + int ret; + u8 gsi; + + if (dev->irq_managed && dev->irq > 0) + return 0; + + ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); + if (ret) { + dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret); + return pcibios_err_to_errno(ret); + } + + id = x86_match_cpu(intel_mid_cpu_ids); + if (id) + model = id->model; + + switch (model) { + case VFM_MODEL(INTEL_ATOM_SILVERMONT_MID): + polarity_low = false; + + /* Special treatment for IRQ0 */ + if (gsi == 0) { + /* + * Skip HS UART common registers device since it has + * IRQ0 assigned and not used by the kernel. + */ + if (dev->device == PCI_DEVICE_ID_INTEL_MRFLD_HSU) + return -EBUSY; + /* + * TNG has IRQ0 assigned to eMMC controller. But there + * are also other devices with bogus PCI configuration + * that have IRQ0 assigned. This check ensures that + * eMMC gets it. The rest of devices still could be + * enabled without interrupt line being allocated. + */ + if (dev->device != PCI_DEVICE_ID_INTEL_MRFLD_MMC) + return 0; + } + break; + default: + polarity_low = true; + break; + } + + ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); + + /* + * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to + * IOAPIC RTE entries, so we just enable RTE for the device. + */ + ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); + if (ret < 0) + return ret; + + dev->irq = ret; + dev->irq_managed = 1; + + return 0; +} + +static void intel_mid_pci_irq_disable(struct pci_dev *dev) +{ + if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && + dev->irq > 0) { + mp_unmap_irq(dev->irq); + dev->irq_managed = 0; + } +} + +static const struct pci_ops intel_mid_pci_ops __initconst = { + .read = pci_read, + .write = pci_write, +}; + +/** + * intel_mid_pci_init - installs intel_mid_pci_ops + * + * Moorestown has an interesting PCI implementation (see above). + * Called when the early platform detection installs it. + */ +int __init intel_mid_pci_init(void) +{ + pr_info("Intel MID platform detected, using MID PCI ops\n"); + pci_mmcfg_late_init(); + pcibios_enable_irq = intel_mid_pci_irq_enable; + pcibios_disable_irq = intel_mid_pci_irq_disable; + pci_root_ops = intel_mid_pci_ops; + pci_soc_mode = 1; + /* Continue with standard init */ + acpi_noirq_set(); + return 1; +} + +/* + * Langwell devices are not true PCI devices; they are not subject to 10 ms + * d3 to d0 delay required by PCI spec. + */ +static void pci_d3delay_fixup(struct pci_dev *dev) +{ + /* + * PCI fixups are effectively decided compile time. If we have a dual + * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices. + */ + if (!pci_soc_mode) + return; + /* + * True PCI devices in Lincroft should allow type 1 access, the rest + * are Langwell fake PCI devices. + */ + if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) + return; + dev->d3hot_delay = 0; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); + +static void mid_power_off_one_device(struct pci_dev *dev) +{ + u16 pmcsr; + + /* + * Update current state first, otherwise PCI core enforces PCI_D0 in + * pci_set_power_state() for devices which status was PCI_UNKNOWN. + */ + pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); + dev->current_state = (pci_power_t __force)(pmcsr & PCI_PM_CTRL_STATE_MASK); + + pci_set_power_state(dev, PCI_D3hot); +} + +static void mid_power_off_devices(struct pci_dev *dev) +{ + int id; + + if (!pci_soc_mode) + return; + + id = intel_mid_pwr_get_lss_id(dev); + if (id < 0) + return; + + /* + * This sets only PMCSR bits. The actual power off will happen in + * arch/x86/platform/intel-mid/pwr.c. + */ + mid_power_off_one_device(dev); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, mid_power_off_devices); + +/* + * Langwell devices reside at fixed offsets, don't try to move them. + */ +static void pci_fixed_bar_fixup(struct pci_dev *dev) +{ + unsigned long offset; + u32 size; + int i; + + if (!pci_soc_mode) + return; + + /* Must have extended configuration space */ + if (dev->cfg_size < PCIE_CAP_OFFSET + 4) + return; + + /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ + offset = fixed_bar_cap(dev->bus, dev->devfn); + if (!offset || PCI_DEVFN(2, 0) == dev->devfn || + PCI_DEVFN(2, 2) == dev->devfn) + return; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + pci_read_config_dword(dev, offset + 8 + (i * 4), &size); + dev->resource[i].end = dev->resource[i].start + size - 1; + dev->resource[i].flags |= IORESOURCE_PCI_FIXED; + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup); diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c deleted file mode 100644 index b433b1753016..000000000000 --- a/arch/x86/pci/intel_mid_pci.c +++ /dev/null @@ -1,406 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Intel MID PCI support - * Copyright (c) 2008 Intel Corporation - * Jesse Barnes - * - * Moorestown has an interesting PCI implementation: - * - configuration space is memory mapped (as defined by MCFG) - * - Lincroft devices also have a real, type 1 configuration space - * - Early Lincroft silicon has a type 1 access bug that will cause - * a hang if non-existent devices are accessed - * - some devices have the "fixed BAR" capability, which means - * they can't be relocated or modified; check for that during - * BAR sizing - * - * So, we use the MCFG space for all reads and writes, but also send - * Lincroft writes to type 1 space. But only read/write if the device - * actually exists, otherwise return all 1s for reads and bit bucket - * the writes. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#define PCIE_CAP_OFFSET 0x100 - -/* Quirks for the listed devices */ -#define PCI_DEVICE_ID_INTEL_MRFLD_MMC 0x1190 -#define PCI_DEVICE_ID_INTEL_MRFLD_HSU 0x1191 - -/* Fixed BAR fields */ -#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */ -#define PCI_FIXED_BAR_0_SIZE 0x04 -#define PCI_FIXED_BAR_1_SIZE 0x08 -#define PCI_FIXED_BAR_2_SIZE 0x0c -#define PCI_FIXED_BAR_3_SIZE 0x10 -#define PCI_FIXED_BAR_4_SIZE 0x14 -#define PCI_FIXED_BAR_5_SIZE 0x1c - -static int pci_soc_mode; - -/** - * fixed_bar_cap - return the offset of the fixed BAR cap if found - * @bus: PCI bus - * @devfn: device in question - * - * Look for the fixed BAR cap on @bus and @devfn, returning its offset - * if found or 0 otherwise. - */ -static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn) -{ - int pos; - u32 pcie_cap = 0, cap_data; - - pos = PCIE_CAP_OFFSET; - - if (!raw_pci_ext_ops) - return 0; - - while (pos) { - if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, - devfn, pos, 4, &pcie_cap)) - return 0; - - if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 || - PCI_EXT_CAP_ID(pcie_cap) == 0xffff) - break; - - if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) { - raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, - devfn, pos + 4, 4, &cap_data); - if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR) - return pos; - } - - pos = PCI_EXT_CAP_NEXT(pcie_cap); - } - - return 0; -} - -static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn, - int reg, int len, u32 val, int offset) -{ - u32 size; - unsigned int domain, busnum; - int bar = (reg - PCI_BASE_ADDRESS_0) >> 2; - - domain = pci_domain_nr(bus); - busnum = bus->number; - - if (val == ~0 && len == 4) { - unsigned long decode; - - raw_pci_ext_ops->read(domain, busnum, devfn, - offset + 8 + (bar * 4), 4, &size); - - /* Turn the size into a decode pattern for the sizing code */ - if (size) { - decode = size - 1; - decode |= decode >> 1; - decode |= decode >> 2; - decode |= decode >> 4; - decode |= decode >> 8; - decode |= decode >> 16; - decode++; - decode = ~(decode - 1); - } else { - decode = 0; - } - - /* - * If val is all ones, the core code is trying to size the reg, - * so update the mmconfig space with the real size. - * - * Note: this assumes the fixed size we got is a power of two. - */ - return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4, - decode); - } - - /* This is some other kind of BAR write, so just do it. */ - return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val); -} - -/** - * type1_access_ok - check whether to use type 1 - * @bus: bus number - * @devfn: device & function in question - * @reg: configuration register offset - * - * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at - * all, the we can go ahead with any reads & writes. If it's on a Lincroft, - * but doesn't exist, avoid the access altogether to keep the chip from - * hanging. - */ -static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) -{ - /* - * This is a workaround for A0 LNC bug where PCI status register does - * not have new CAP bit set. can not be written by SW either. - * - * PCI header type in real LNC indicates a single function device, this - * will prevent probing other devices under the same function in PCI - * shim. Therefore, use the header type in shim instead. - */ - if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) - return false; - if (bus == 0 && (devfn == PCI_DEVFN(2, 0) - || devfn == PCI_DEVFN(0, 0) - || devfn == PCI_DEVFN(3, 0))) - return true; - return false; /* Langwell on others */ -} - -static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 *value) -{ - if (type1_access_ok(bus->number, devfn, where)) - return pci_direct_conf1.read(pci_domain_nr(bus), bus->number, - devfn, where, size, value); - return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number, - devfn, where, size, value); -} - -static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, - int size, u32 value) -{ - int offset; - - /* - * On MRST, there is no PCI ROM BAR, this will cause a subsequent read - * to ROM BAR return 0 then being ignored. - */ - if (where == PCI_ROM_ADDRESS) - return 0; - - /* - * Devices with fixed BARs need special handling: - * - BAR sizing code will save, write ~0, read size, restore - * - so writes to fixed BARs need special handling - * - other writes to fixed BAR devices should go through mmconfig - */ - offset = fixed_bar_cap(bus, devfn); - if (offset && - (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) { - return pci_device_update_fixed(bus, devfn, where, size, value, - offset); - } - - /* - * On Moorestown update both real & mmconfig space - * Note: early Lincroft silicon can't handle type 1 accesses to - * non-existent devices, so just eat the write in that case. - */ - if (type1_access_ok(bus->number, devfn, where)) - return pci_direct_conf1.write(pci_domain_nr(bus), bus->number, - devfn, where, size, value); - return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn, - where, size, value); -} - -static const struct x86_cpu_id intel_mid_cpu_ids[] = { - X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, NULL), - {} -}; - -static int intel_mid_pci_irq_enable(struct pci_dev *dev) -{ - const struct x86_cpu_id *id; - struct irq_alloc_info info; - bool polarity_low; - u16 model = 0; - int ret; - u8 gsi; - - if (dev->irq_managed && dev->irq > 0) - return 0; - - ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); - if (ret) { - dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret); - return pcibios_err_to_errno(ret); - } - - id = x86_match_cpu(intel_mid_cpu_ids); - if (id) - model = id->model; - - switch (model) { - case VFM_MODEL(INTEL_ATOM_SILVERMONT_MID): - polarity_low = false; - - /* Special treatment for IRQ0 */ - if (gsi == 0) { - /* - * Skip HS UART common registers device since it has - * IRQ0 assigned and not used by the kernel. - */ - if (dev->device == PCI_DEVICE_ID_INTEL_MRFLD_HSU) - return -EBUSY; - /* - * TNG has IRQ0 assigned to eMMC controller. But there - * are also other devices with bogus PCI configuration - * that have IRQ0 assigned. This check ensures that - * eMMC gets it. The rest of devices still could be - * enabled without interrupt line being allocated. - */ - if (dev->device != PCI_DEVICE_ID_INTEL_MRFLD_MMC) - return 0; - } - break; - default: - polarity_low = true; - break; - } - - ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low); - - /* - * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to - * IOAPIC RTE entries, so we just enable RTE for the device. - */ - ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info); - if (ret < 0) - return ret; - - dev->irq = ret; - dev->irq_managed = 1; - - return 0; -} - -static void intel_mid_pci_irq_disable(struct pci_dev *dev) -{ - if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed && - dev->irq > 0) { - mp_unmap_irq(dev->irq); - dev->irq_managed = 0; - } -} - -static const struct pci_ops intel_mid_pci_ops __initconst = { - .read = pci_read, - .write = pci_write, -}; - -/** - * intel_mid_pci_init - installs intel_mid_pci_ops - * - * Moorestown has an interesting PCI implementation (see above). - * Called when the early platform detection installs it. - */ -int __init intel_mid_pci_init(void) -{ - pr_info("Intel MID platform detected, using MID PCI ops\n"); - pci_mmcfg_late_init(); - pcibios_enable_irq = intel_mid_pci_irq_enable; - pcibios_disable_irq = intel_mid_pci_irq_disable; - pci_root_ops = intel_mid_pci_ops; - pci_soc_mode = 1; - /* Continue with standard init */ - acpi_noirq_set(); - return 1; -} - -/* - * Langwell devices are not true PCI devices; they are not subject to 10 ms - * d3 to d0 delay required by PCI spec. - */ -static void pci_d3delay_fixup(struct pci_dev *dev) -{ - /* - * PCI fixups are effectively decided compile time. If we have a dual - * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices. - */ - if (!pci_soc_mode) - return; - /* - * True PCI devices in Lincroft should allow type 1 access, the rest - * are Langwell fake PCI devices. - */ - if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) - return; - dev->d3hot_delay = 0; -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); - -static void mid_power_off_one_device(struct pci_dev *dev) -{ - u16 pmcsr; - - /* - * Update current state first, otherwise PCI core enforces PCI_D0 in - * pci_set_power_state() for devices which status was PCI_UNKNOWN. - */ - pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr); - dev->current_state = (pci_power_t __force)(pmcsr & PCI_PM_CTRL_STATE_MASK); - - pci_set_power_state(dev, PCI_D3hot); -} - -static void mid_power_off_devices(struct pci_dev *dev) -{ - int id; - - if (!pci_soc_mode) - return; - - id = intel_mid_pwr_get_lss_id(dev); - if (id < 0) - return; - - /* - * This sets only PMCSR bits. The actual power off will happen in - * arch/x86/platform/intel-mid/pwr.c. - */ - mid_power_off_one_device(dev); -} - -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, mid_power_off_devices); - -/* - * Langwell devices reside at fixed offsets, don't try to move them. - */ -static void pci_fixed_bar_fixup(struct pci_dev *dev) -{ - unsigned long offset; - u32 size; - int i; - - if (!pci_soc_mode) - return; - - /* Must have extended configuration space */ - if (dev->cfg_size < PCIE_CAP_OFFSET + 4) - return; - - /* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */ - offset = fixed_bar_cap(dev->bus, dev->devfn); - if (!offset || PCI_DEVFN(2, 0) == dev->devfn || - PCI_DEVFN(2, 2) == dev->devfn) - return; - - for (i = 0; i < PCI_STD_NUM_BARS; i++) { - pci_read_config_dword(dev, offset + 8 + (i * 4), &size); - dev->resource[i].end = dev->resource[i].start + size - 1; - dev->resource[i].flags |= IORESOURCE_PCI_FIXED; - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup); -- cgit v1.2.3 From 61a74ad254628ccd9e88838c3c622885dfb6c588 Mon Sep 17 00:00:00 2001 From: Nylon Chen Date: Fri, 11 Apr 2025 15:38:50 +0800 Subject: riscv: misaligned: fix sleeping function called during misaligned access handling Use copy_from_user_nofault() and copy_to_user_nofault() instead of copy_from/to_user functions in the misaligned access trap handlers. The following bug report was found when executing misaligned memory accesses: BUG: sleeping function called from invalid context at ./include/linux/uaccess.h:162 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 115, name: two preempt_count: 0, expected: 0 CPU: 0 UID: 0 PID: 115 Comm: two Not tainted 6.14.0-rc5 #24 Hardware name: riscv-virtio,qemu (DT) Call Trace: [] dump_backtrace+0x1c/0x24 [] show_stack+0x28/0x34 [] dump_stack_lvl+0x4a/0x68 [] dump_stack+0x14/0x1c [] __might_resched+0xfa/0x104 [] __might_sleep+0x3e/0x62 [] __might_fault+0x1c/0x24 [] _copy_from_user+0x28/0xaa [] handle_misaligned_store+0x204/0x254 [] do_trap_store_misaligned+0x24/0xee [] handle_exception+0x146/0x152 Fixes: b686ecdeacf6 ("riscv: misaligned: Restrict user access to kernel memory") Fixes: 441381506ba7 ("riscv: misaligned: remove CONFIG_RISCV_M_MODE specific code") Signed-off-by: Zong Li Signed-off-by: Nylon Chen Link: https://lore.kernel.org/r/20250411073850.3699180-3-nylon.chen@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps_misaligned.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 4354c87c0376..0173ed80818c 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -441,7 +441,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) val.data_u64 = 0; if (user_mode(regs)) { - if (copy_from_user(&val, (u8 __user *)addr, len)) + if (copy_from_user_nofault(&val, (u8 __user *)addr, len)) return -1; } else { memcpy(&val, (u8 *)addr, len); @@ -539,7 +539,7 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs) return -EOPNOTSUPP; if (user_mode(regs)) { - if (copy_to_user((u8 __user *)addr, &val, len)) + if (copy_to_user_nofault((u8 __user *)addr, &val, len)) return -1; } else { memcpy((u8 *)addr, &val, len); -- cgit v1.2.3 From 7b30b1b04e0d04e56e848c3b9c2952c30de05af9 Mon Sep 17 00:00:00 2001 From: Nylon Chen Date: Fri, 11 Apr 2025 15:38:49 +0800 Subject: riscv: misaligned: Add handling for ZCB instructions Add support for the Zcb extension's compressed half-word instructions (C.LHU, C.LH, and C.SH) in the RISC-V misaligned access trap handler. Signed-off-by: Zong Li Signed-off-by: Nylon Chen Link: https://lore.kernel.org/r/20250411073850.3699180-2-nylon.chen@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps_misaligned.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch') diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 0173ed80818c..7443632445d4 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -88,6 +88,13 @@ #define INSN_MATCH_C_FSWSP 0xe002 #define INSN_MASK_C_FSWSP 0xe003 +#define INSN_MATCH_C_LHU 0x8400 +#define INSN_MASK_C_LHU 0xfc43 +#define INSN_MATCH_C_LH 0x8440 +#define INSN_MASK_C_LH 0xfc43 +#define INSN_MATCH_C_SH 0x8c00 +#define INSN_MASK_C_SH 0xfc43 + #define INSN_LEN(insn) ((((insn) & 0x3) < 0x3) ? 2 : 4) #if defined(CONFIG_64BIT) @@ -431,6 +438,13 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) fp = 1; len = 4; #endif + } else if ((insn & INSN_MASK_C_LHU) == INSN_MATCH_C_LHU) { + len = 2; + insn = RVC_RS2S(insn) << SH_RD; + } else if ((insn & INSN_MASK_C_LH) == INSN_MATCH_C_LH) { + len = 2; + shift = 8 * (sizeof(ulong) - len); + insn = RVC_RS2S(insn) << SH_RD; } else { regs->epc = epc; return -1; @@ -530,6 +544,9 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs) len = 4; val.data_ulong = GET_F32_RS2C(insn, regs); #endif + } else if ((insn & INSN_MASK_C_SH) == INSN_MATCH_C_SH) { + len = 2; + val.data_ulong = GET_RS2S(insn, regs); } else { regs->epc = epc; return -1; -- cgit v1.2.3 From 55ba5375ba1c45404a917aed4f772da9a82fe723 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 7 Apr 2025 09:25:07 +0200 Subject: MIPS: rb532: gpio: use new line value setter callbacks struct gpio_chip now has callbacks for setting line values that return an integer, allowing to indicate failures. Convert the driver to using them. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Signed-off-by: Thomas Bogendoerfer --- arch/mips/rb532/gpio.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c index ea6ebfea4a67..0e47cd59b6cb 100644 --- a/arch/mips/rb532/gpio.c +++ b/arch/mips/rb532/gpio.c @@ -105,13 +105,15 @@ static int rb532_gpio_get(struct gpio_chip *chip, unsigned offset) /* * Set output GPIO level */ -static void rb532_gpio_set(struct gpio_chip *chip, - unsigned offset, int value) +static int rb532_gpio_set(struct gpio_chip *chip, unsigned int offset, + int value) { struct rb532_gpio_chip *gpch; gpch = gpiochip_get_data(chip); rb532_set_bit(value, offset, gpch->regbase + GPIOD); + + return 0; } /* @@ -162,7 +164,7 @@ static struct rb532_gpio_chip rb532_gpio_chip[] = { .direction_input = rb532_gpio_direction_input, .direction_output = rb532_gpio_direction_output, .get = rb532_gpio_get, - .set = rb532_gpio_set, + .set_rv = rb532_gpio_set, .to_irq = rb532_gpio_to_irq, .base = 0, .ngpio = 32, -- cgit v1.2.3 From 64f3322bea9404d2dd1e8bcdc5fb025044e48d96 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 7 Apr 2025 09:25:08 +0200 Subject: MIPS: bcm63xx: gpio: use new line value setter callbacks struct gpio_chip now has callbacks for setting line values that return an integer, allowing to indicate failures. Convert the driver to using them. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Signed-off-by: Thomas Bogendoerfer --- arch/mips/bcm63xx/gpio.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/bcm63xx/gpio.c b/arch/mips/bcm63xx/gpio.c index 5c4a233db55f..e7a53cd0dec5 100644 --- a/arch/mips/bcm63xx/gpio.c +++ b/arch/mips/bcm63xx/gpio.c @@ -35,8 +35,7 @@ static void bcm63xx_gpio_out_low_reg_init(void) static DEFINE_SPINLOCK(bcm63xx_gpio_lock); static u32 gpio_out_low, gpio_out_high; -static void bcm63xx_gpio_set(struct gpio_chip *chip, - unsigned gpio, int val) +static int bcm63xx_gpio_set(struct gpio_chip *chip, unsigned int gpio, int val) { u32 reg; u32 mask; @@ -62,6 +61,8 @@ static void bcm63xx_gpio_set(struct gpio_chip *chip, *v &= ~mask; bcm_gpio_writel(*v, reg); spin_unlock_irqrestore(&bcm63xx_gpio_lock, flags); + + return 0; } static int bcm63xx_gpio_get(struct gpio_chip *chip, unsigned gpio) @@ -130,7 +131,7 @@ static struct gpio_chip bcm63xx_gpio_chip = { .direction_input = bcm63xx_gpio_direction_input, .direction_output = bcm63xx_gpio_direction_output, .get = bcm63xx_gpio_get, - .set = bcm63xx_gpio_set, + .set_rv = bcm63xx_gpio_set, .base = 0, }; -- cgit v1.2.3 From 68bdc4dc1130ec5a3d9fdc1b4d90cdc9e39a8792 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 7 Apr 2025 09:25:09 +0200 Subject: MIPS: alchemy: gpio: use new line value setter callbacks struct gpio_chip now has callbacks for setting line values that return an integer, allowing to indicate failures. Convert the driver to using them. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Signed-off-by: Thomas Bogendoerfer --- arch/mips/alchemy/common/gpiolib.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/mips/alchemy/common/gpiolib.c b/arch/mips/alchemy/common/gpiolib.c index 1b16daaa86ae..411f70ceb762 100644 --- a/arch/mips/alchemy/common/gpiolib.c +++ b/arch/mips/alchemy/common/gpiolib.c @@ -119,9 +119,11 @@ static int alchemy_gpic_get(struct gpio_chip *chip, unsigned int off) return !!au1300_gpio_get_value(off + AU1300_GPIO_BASE); } -static void alchemy_gpic_set(struct gpio_chip *chip, unsigned int off, int v) +static int alchemy_gpic_set(struct gpio_chip *chip, unsigned int off, int v) { au1300_gpio_set_value(off + AU1300_GPIO_BASE, v); + + return 0; } static int alchemy_gpic_dir_input(struct gpio_chip *chip, unsigned int off) @@ -145,7 +147,7 @@ static struct gpio_chip au1300_gpiochip = { .direction_input = alchemy_gpic_dir_input, .direction_output = alchemy_gpic_dir_output, .get = alchemy_gpic_get, - .set = alchemy_gpic_set, + .set_rv = alchemy_gpic_set, .to_irq = alchemy_gpic_gpio_to_irq, .base = AU1300_GPIO_BASE, .ngpio = AU1300_GPIO_NUM, -- cgit v1.2.3 From 37022f745b58436e7d3c3c924c78d78a139b00e7 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 7 Apr 2025 09:25:10 +0200 Subject: MIPS: txx9: gpio: use new line value setter callbacks struct gpio_chip now has callbacks for setting line values that return an integer, allowing to indicate failures. Convert the drivers to using them. Signed-off-by: Bartosz Golaszewski Reviewed-by: Linus Walleij Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/gpio_txx9.c | 8 +++++--- arch/mips/txx9/generic/setup.c | 8 +++++--- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/gpio_txx9.c b/arch/mips/kernel/gpio_txx9.c index 8c083612df9d..027fb57d0d79 100644 --- a/arch/mips/kernel/gpio_txx9.c +++ b/arch/mips/kernel/gpio_txx9.c @@ -32,14 +32,16 @@ static void txx9_gpio_set_raw(unsigned int offset, int value) __raw_writel(val, &txx9_pioptr->dout); } -static void txx9_gpio_set(struct gpio_chip *chip, unsigned int offset, - int value) +static int txx9_gpio_set(struct gpio_chip *chip, unsigned int offset, + int value) { unsigned long flags; spin_lock_irqsave(&txx9_gpio_lock, flags); txx9_gpio_set_raw(offset, value); mmiowb(); spin_unlock_irqrestore(&txx9_gpio_lock, flags); + + return 0; } static int txx9_gpio_dir_in(struct gpio_chip *chip, unsigned int offset) @@ -68,7 +70,7 @@ static int txx9_gpio_dir_out(struct gpio_chip *chip, unsigned int offset, static struct gpio_chip txx9_gpio_chip = { .get = txx9_gpio_get, - .set = txx9_gpio_set, + .set_rv = txx9_gpio_set, .direction_input = txx9_gpio_dir_in, .direction_output = txx9_gpio_dir_out, .label = "TXx9", diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c index 1e67fecd466e..0586ca7668b4 100644 --- a/arch/mips/txx9/generic/setup.c +++ b/arch/mips/txx9/generic/setup.c @@ -603,8 +603,8 @@ static int txx9_iocled_get(struct gpio_chip *chip, unsigned int offset) return !!(data->cur_val & (1 << offset)); } -static void txx9_iocled_set(struct gpio_chip *chip, unsigned int offset, - int value) +static int txx9_iocled_set(struct gpio_chip *chip, unsigned int offset, + int value) { struct txx9_iocled_data *data = gpiochip_get_data(chip); unsigned long flags; @@ -616,6 +616,8 @@ static void txx9_iocled_set(struct gpio_chip *chip, unsigned int offset, writeb(data->cur_val, data->mmioaddr); mmiowb(); spin_unlock_irqrestore(&txx9_iocled_lock, flags); + + return 0; } static int txx9_iocled_dir_in(struct gpio_chip *chip, unsigned int offset) @@ -653,7 +655,7 @@ void __init txx9_iocled_init(unsigned long baseaddr, if (!iocled->mmioaddr) goto out_free; iocled->chip.get = txx9_iocled_get; - iocled->chip.set = txx9_iocled_set; + iocled->chip.set_rv = txx9_iocled_set; iocled->chip.direction_input = txx9_iocled_dir_in; iocled->chip.direction_output = txx9_iocled_dir_out; iocled->chip.label = "iocled"; -- cgit v1.2.3 From 3b61b6a369d9d81ffaa8a87045782352d08a91aa Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Wed, 16 Apr 2025 08:54:25 +1200 Subject: mips: dts: realtek: Add MDIO controller Add a device tree node for the MDIO controller on the RTL9300 chips. Signed-off-by: Chris Packham Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/realtek/rtl930x.dtsi | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch') diff --git a/arch/mips/boot/dts/realtek/rtl930x.dtsi b/arch/mips/boot/dts/realtek/rtl930x.dtsi index f2e57ea3a60c..101bab72a95f 100644 --- a/arch/mips/boot/dts/realtek/rtl930x.dtsi +++ b/arch/mips/boot/dts/realtek/rtl930x.dtsi @@ -69,6 +69,39 @@ #size-cells = <0>; status = "disabled"; }; + + mdio_controller: mdio-controller@ca00 { + compatible = "realtek,rtl9301-mdio"; + reg = <0xca00 0x200>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + + mdio0: mdio-bus@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + mdio1: mdio-bus@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + mdio2: mdio-bus@2 { + reg = <2>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + mdio3: mdio-bus@3 { + reg = <3>; + #address-cells = <1>; + #size-cells = <0>; + status = "disabled"; + }; + }; }; soc: soc@18000000 { -- cgit v1.2.3 From 6d223b8ffcd1593d032b71875def2daa71c53111 Mon Sep 17 00:00:00 2001 From: WangYuli Date: Wed, 16 Apr 2025 11:45:48 +0800 Subject: MIPS: Loongson64: Add missing '#interrupt-cells' for loongson64c_ls7a MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to commit 98a9e2ac3755 ("MIPS: Loongson64: DTS: Fix msi node for ls7a"). Fix follow warnings: arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts:28.31-36.4: Warning (interrupt_provider): /bus@10000000/msi-controller@2ff00000: Missing '#interrupt-cells' in interrupt provider arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dtb: Warning (interrupt_map): Failed prerequisite 'interrupt_provider' Fixes: 24af105962c8 ("MIPS: Loongson64: DeviceTree for LS7A PCH") Tested-by: WangYuli Signed-off-by: WangYuli Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts b/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts index c7ea4f1c0bb2..6c277ab83d4b 100644 --- a/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts +++ b/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts @@ -29,6 +29,7 @@ compatible = "loongson,pch-msi-1.0"; reg = <0 0x2ff00000 0 0x8>; interrupt-controller; + #interrupt-cells = <1>; msi-controller; loongson,msi-base-vec = <64>; loongson,msi-num-vecs = <64>; -- cgit v1.2.3 From 0f4ae7c6ecb89bfda026d210dcf8216fb67d2333 Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Sat, 29 Mar 2025 08:39:03 -0700 Subject: mips: Add -std= flag specified in KBUILD_CFLAGS to vdso CFLAGS GCC 15 changed the default C standard dialect from gnu17 to gnu23, which should not have impacted the kernel because it explicitly requests the gnu11 standard in the main Makefile. However, mips/vdso code uses its own CFLAGS without a '-std=' value, which break with this dialect change because of the kernel's own definitions of bool, false, and true conflicting with the C23 reserved keywords. include/linux/stddef.h:11:9: error: cannot use keyword 'false' as enumeration constant 11 | false = 0, | ^~~~~ include/linux/stddef.h:11:9: note: 'false' is a keyword with '-std=c23' onwards include/linux/types.h:35:33: error: 'bool' cannot be defined via 'typedef' 35 | typedef _Bool bool; | ^~~~ include/linux/types.h:35:33: note: 'bool' is a keyword with '-std=c23' onwards Add -std as specified in KBUILD_CFLAGS to the decompressor and purgatory CFLAGS to eliminate these errors and make the C standard version of these areas match the rest of the kernel. Signed-off-by: Khem Raj Cc: stable@vger.kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/vdso/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index fb4c493aaffa..69d4593f64fe 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -27,6 +27,7 @@ endif # offsets. cflags-vdso := $(ccflags-vdso) \ $(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \ + $(filter -std=%,$(KBUILD_CFLAGS)) \ -O3 -g -fPIC -fno-strict-aliasing -fno-common -fno-builtin -G 0 \ -mrelax-pic-calls $(call cc-option, -mexplicit-relocs) \ -fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \ -- cgit v1.2.3 From 76c43eb507bc1162850fdae6cc44790d1c9a83ea Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Sun, 13 Apr 2025 21:12:32 +0200 Subject: MIPS: SMP: Implement parallel CPU bring up for EyeQ Added support for starting CPUs in parallel on EyeQ to speed up boot time. On EyeQ5, booting 8 CPUs is now ~90ms faster. On EyeQ6, booting 32 CPUs is now ~650ms faster. Signed-off-by: Gregory CLEMENT Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 2 ++ arch/mips/include/asm/topology.h | 3 +++ arch/mips/kernel/smp-cps.c | 2 ++ arch/mips/kernel/smp.c | 18 ++++++++++++++++++ 4 files changed, 25 insertions(+) (limited to 'arch') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index fc0772c1bad4..e0e6ce2592b4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -617,6 +617,7 @@ config EYEQ select USB_UHCI_BIG_ENDIAN_DESC if CPU_BIG_ENDIAN select USB_UHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select USE_OF + select HOTPLUG_PARALLEL if SMP help Select this to build a kernel supporting EyeQ SoC from Mobileye. @@ -2287,6 +2288,7 @@ config MIPS_CPS select MIPS_CM select MIPS_CPS_PM if HOTPLUG_CPU select SMP + select HOTPLUG_SMT if HOTPLUG_PARALLEL select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU select SYNC_R4K if (CEVT_R4K || CSRC_R4K) select SYS_SUPPORTS_HOTPLUG_CPU diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h index 0673d2d0f2e6..5158c802eb65 100644 --- a/arch/mips/include/asm/topology.h +++ b/arch/mips/include/asm/topology.h @@ -16,6 +16,9 @@ #define topology_core_id(cpu) (cpu_core(&cpu_data[cpu])) #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) #define topology_sibling_cpumask(cpu) (&cpu_sibling_map[cpu]) + +extern struct cpumask __cpu_primary_thread_mask; +#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask) #endif #endif /* __ASM_TOPOLOGY_H */ diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index e85bd087467e..02bbd7ecd1b9 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -236,6 +236,7 @@ static void __init cps_smp_setup(void) /* Use the number of VPEs in cluster 0 core 0 for smp_num_siblings */ if (!cl && !c) smp_num_siblings = core_vpes; + cpumask_set_cpu(nvpes, &__cpu_primary_thread_mask); for (v = 0; v < min_t(int, core_vpes, NR_CPUS - nvpes); v++) { cpu_set_cluster(&cpu_data[nvpes + v], cl); @@ -364,6 +365,7 @@ static void __init cps_prepare_cpus(unsigned int max_cpus) cl = cpu_cluster(¤t_cpu_data); c = cpu_core(¤t_cpu_data); cluster_bootcfg = &mips_cps_cluster_bootcfg[cl]; + cpu_smt_set_num_threads(core_vpes, core_vpes); core_bootcfg = &cluster_bootcfg->core_config[c]; bitmap_set(cluster_bootcfg->core_power, cpu_core(¤t_cpu_data), 1); atomic_set(&core_bootcfg->vpe_mask, 1 << cpu_vpe_id(¤t_cpu_data)); diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 39e193cad2b9..1726744f2b2e 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -56,8 +56,10 @@ EXPORT_SYMBOL(cpu_sibling_map); cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); +#ifndef CONFIG_HOTPLUG_PARALLEL static DECLARE_COMPLETION(cpu_starting); static DECLARE_COMPLETION(cpu_running); +#endif /* * A logical cpu mask containing only one VPE per core to @@ -74,6 +76,8 @@ static cpumask_t cpu_core_setup_map; cpumask_t cpu_coherent_mask; +struct cpumask __cpu_primary_thread_mask __read_mostly; + unsigned int smp_max_threads __initdata = UINT_MAX; static int __init early_nosmt(char *s) @@ -374,10 +378,15 @@ asmlinkage void start_secondary(void) set_cpu_core_map(cpu); cpumask_set_cpu(cpu, &cpu_coherent_mask); +#ifdef CONFIG_HOTPLUG_PARALLEL + cpuhp_ap_sync_alive(); +#endif notify_cpu_starting(cpu); +#ifndef CONFIG_HOTPLUG_PARALLEL /* Notify boot CPU that we're starting & ready to sync counters */ complete(&cpu_starting); +#endif synchronise_count_slave(cpu); @@ -386,11 +395,13 @@ asmlinkage void start_secondary(void) calculate_cpu_foreign_map(); +#ifndef CONFIG_HOTPLUG_PARALLEL /* * Notify boot CPU that we're up & online and it can safely return * from __cpu_up */ complete(&cpu_running); +#endif /* * irq will be enabled in ->smp_finish(), enabling it too early @@ -447,6 +458,12 @@ void __init smp_prepare_boot_cpu(void) set_cpu_online(0, true); } +#ifdef CONFIG_HOTPLUG_PARALLEL +int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle) +{ + return mp_ops->boot_secondary(cpu, tidle); +} +#else int __cpu_up(unsigned int cpu, struct task_struct *tidle) { int err; @@ -466,6 +483,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) wait_for_completion(&cpu_running); return 0; } +#endif #ifdef CONFIG_PROFILING /* Not really SMP stuff ... */ -- cgit v1.2.3 From cd956a5cb48aaccfa63291b4efd289d2f1e5b025 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 19 Apr 2025 12:27:44 +0200 Subject: mips: ptrace: Improve code formatting and indentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use tabs instead of spaces in regs_query_register_offset() and syscall_trace_leave(), and properly indent multiple getters. Signed-off-by: Thorsten Blum Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/ptrace.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index f7107479c7fa..b890d64d352c 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -922,11 +922,13 @@ static const struct pt_regs_offset regoffset_table[] = { */ int regs_query_register_offset(const char *name) { - const struct pt_regs_offset *roff; - for (roff = regoffset_table; roff->name != NULL; roff++) - if (!strcmp(roff->name, name)) - return roff->offset; - return -EINVAL; + const struct pt_regs_offset *roff; + + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + + return -EINVAL; } #if defined(CONFIG_32BIT) || defined(CONFIG_MIPS32_O32) @@ -937,7 +939,7 @@ static const struct user_regset mips_regsets[] = { .n = ELF_NGREG, .size = sizeof(unsigned int), .align = sizeof(unsigned int), - .regset_get = gpr32_get, + .regset_get = gpr32_get, .set = gpr32_set, }, [REGSET_DSP] = { @@ -945,7 +947,7 @@ static const struct user_regset mips_regsets[] = { .n = NUM_DSP_REGS + 1, .size = sizeof(u32), .align = sizeof(u32), - .regset_get = dsp32_get, + .regset_get = dsp32_get, .set = dsp32_set, .active = dsp_active, }, @@ -955,7 +957,7 @@ static const struct user_regset mips_regsets[] = { .n = ELF_NFPREG, .size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t), - .regset_get = fpr_get, + .regset_get = fpr_get, .set = fpr_set, }, [REGSET_FP_MODE] = { @@ -963,7 +965,7 @@ static const struct user_regset mips_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .regset_get = fp_mode_get, + .regset_get = fp_mode_get, .set = fp_mode_set, }, #endif @@ -973,7 +975,7 @@ static const struct user_regset mips_regsets[] = { .n = NUM_FPU_REGS + 1, .size = 16, .align = 16, - .regset_get = msa_get, + .regset_get = msa_get, .set = msa_set, }, #endif @@ -997,7 +999,7 @@ static const struct user_regset mips64_regsets[] = { .n = ELF_NGREG, .size = sizeof(unsigned long), .align = sizeof(unsigned long), - .regset_get = gpr64_get, + .regset_get = gpr64_get, .set = gpr64_set, }, [REGSET_DSP] = { @@ -1005,7 +1007,7 @@ static const struct user_regset mips64_regsets[] = { .n = NUM_DSP_REGS + 1, .size = sizeof(u64), .align = sizeof(u64), - .regset_get = dsp64_get, + .regset_get = dsp64_get, .set = dsp64_set, .active = dsp_active, }, @@ -1015,7 +1017,7 @@ static const struct user_regset mips64_regsets[] = { .n = 1, .size = sizeof(int), .align = sizeof(int), - .regset_get = fp_mode_get, + .regset_get = fp_mode_get, .set = fp_mode_set, }, [REGSET_FPR] = { @@ -1023,7 +1025,7 @@ static const struct user_regset mips64_regsets[] = { .n = ELF_NFPREG, .size = sizeof(elf_fpreg_t), .align = sizeof(elf_fpreg_t), - .regset_get = fpr_get, + .regset_get = fpr_get, .set = fpr_set, }, #endif @@ -1033,7 +1035,7 @@ static const struct user_regset mips64_regsets[] = { .n = NUM_FPU_REGS + 1, .size = 16, .align = 16, - .regset_get = msa_get, + .regset_get = msa_get, .set = msa_set, }, #endif @@ -1351,7 +1353,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs) */ asmlinkage void syscall_trace_leave(struct pt_regs *regs) { - /* + /* * We may come here right after calling schedule_user() * or do_notify_resume(), in which case we can be in RCU * user mode. -- cgit v1.2.3 From 9f6d908adabc11e5b407743696dbb333894b022e Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 22 Apr 2025 09:42:55 +0200 Subject: MIPS: BCM63XX: Replace strcpy() with strscpy() in board_prom_init() strcpy() is deprecated; use strscpy() instead. Link: https://github.com/KSPP/linux/issues/88 Cc: linux-hardening@vger.kernel.org Signed-off-by: Thorsten Blum Signed-off-by: Thomas Bogendoerfer --- arch/mips/bcm63xx/boards/board_bcm963xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/bcm63xx/boards/board_bcm963xx.c b/arch/mips/bcm63xx/boards/board_bcm963xx.c index 9cc8fbf218a5..c5617b889b1c 100644 --- a/arch/mips/bcm63xx/boards/board_bcm963xx.c +++ b/arch/mips/bcm63xx/boards/board_bcm963xx.c @@ -764,7 +764,7 @@ void __init board_prom_init(void) snprintf(cfe_version, 12, "%s", (char *) &cfe[4]); } } else { - strcpy(cfe_version, "unknown"); + strscpy(cfe_version, "unknown"); } pr_info("CFE version: %s\n", cfe_version); -- cgit v1.2.3 From 3b3704261e851e25983860e4c352f1f73786f4ab Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 25 Apr 2025 09:46:48 +0200 Subject: MIPS: Replace strcpy() with strscpy() in vpe_elfload() strcpy() is deprecated; use strscpy() instead. Link: https://github.com/KSPP/linux/issues/88 Signed-off-by: Thorsten Blum Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/vpe.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c index 737d0d4fdcd3..2b67c44adab9 100644 --- a/arch/mips/kernel/vpe.c +++ b/arch/mips/kernel/vpe.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -582,7 +583,7 @@ static int vpe_elfload(struct vpe *v) struct module mod; /* so we can re-use the relocations code */ memset(&mod, 0, sizeof(struct module)); - strcpy(mod.name, "VPE loader"); + strscpy(mod.name, "VPE loader"); hdr = (Elf_Ehdr *) v->pbuffer; len = v->plen; -- cgit v1.2.3 From 9456e2c60171ecffc0ea26347418e6bedb64ee78 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 26 Mar 2025 15:01:12 +0800 Subject: um: xterm: Add Wayland support Under Wayland, we should check WAYLAND_DISPLAY instead. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250326070113.401857-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/xterm.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index e4316c7981e8..f607af738eac 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -97,12 +97,9 @@ static int xterm_open(int input, int output, int primary, void *d, if (access(argv[4], X_OK) < 0) argv[4] = "port-helper"; - /* - * Check that DISPLAY is set, this doesn't guarantee the xterm - * will work but w/o it we can be pretty sure it won't. - */ - if (getenv("DISPLAY") == NULL) { - printk(UM_KERN_ERR "xterm_open: $DISPLAY not set.\n"); + /* Ensure we are running on Xorg or Wayland. */ + if (!getenv("DISPLAY") && !getenv("WAYLAND_DISPLAY")) { + printk(UM_KERN_ERR "xterm_open : neither $DISPLAY nor $WAYLAND_DISPLAY is set.\n"); return -ENODEV; } -- cgit v1.2.3 From 22361369c2e0dc01ec7c3b42b582a37ca05a0973 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 26 Mar 2025 15:01:13 +0800 Subject: um: xterm: Update options for gnome-terminal The -x option is deprecated and might be removed in a future release of gnome-terminal. Let's recommend using -- instead. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250326070113.401857-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/xterm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c index f607af738eac..d05918e422f9 100644 --- a/arch/um/drivers/xterm.c +++ b/arch/um/drivers/xterm.c @@ -81,7 +81,7 @@ __uml_setup("xterm=", xterm_setup, " ' command arg1 arg2 ...'.\n" " The default values are 'xterm=" CONFIG_XTERM_CHAN_DEFAULT_EMULATOR ",-T,-e'.\n" -" Values for gnome-terminal are 'xterm=gnome-terminal,-t,-x'.\n\n" +" Values for gnome-terminal are 'xterm=gnome-terminal,-t,--'.\n\n" ); static int xterm_open(int input, int output, int primary, void *d, -- cgit v1.2.3 From 674d03f6bd6b0f8327f1a4920ff5893557facfbd Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Wed, 26 Mar 2025 19:05:00 +0000 Subject: um: Add cmpxchg8b_emu and checksum functions to asm-prototypes.h With CONFIG_GENDWARFKSYMS, um builds fail due to missing prototypes in asm/asm-prototypes.h. Add declarations for cmpxchg8b_emu and the exported checksum functions, including csum_partial_copy_generic as it's also exported. Cc: Masahiro Yamada Cc: linux-kbuild@vger.kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503251216.lE4t9Ikj-lkp@intel.com/ Signed-off-by: Sami Tolvanen Link: https://patch.msgid.link/20250326190500.847236-2-samitolvanen@google.com Signed-off-by: Johannes Berg --- arch/um/include/asm/asm-prototypes.h | 5 +++++ arch/x86/um/asm/checksum.h | 3 +++ 2 files changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/um/include/asm/asm-prototypes.h b/arch/um/include/asm/asm-prototypes.h index 5898a26daa0d..408b31d59127 100644 --- a/arch/um/include/asm/asm-prototypes.h +++ b/arch/um/include/asm/asm-prototypes.h @@ -1 +1,6 @@ #include +#include + +#ifdef CONFIG_UML_X86 +extern void cmpxchg8b_emu(void); +#endif diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h index b07824500363..ddc144657efa 100644 --- a/arch/x86/um/asm/checksum.h +++ b/arch/x86/um/asm/checksum.h @@ -20,6 +20,9 @@ */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); +/* Do not call this directly. Declared for export type visibility. */ +extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); + /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum -- cgit v1.2.3 From 82c8e1280c74f80fbacb9efbba6dd7b23446e536 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 31 Mar 2025 16:31:50 +0800 Subject: um: Remove duplicate arch.h header ./arch/um/kernel/trap.c: arch.h is included more than once. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=19877 Signed-off-by: Jiapeng Chong Link: https://patch.msgid.link/20250331083150.72598-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Johannes Berg --- arch/um/kernel/trap.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ce073150dc20..39820bde49e6 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -16,7 +16,6 @@ #include #include #include -#include /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by -- cgit v1.2.3 From 49caacf1004d1e1fc40cfab165f104d051867c6e Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 8 Apr 2025 09:45:23 +0200 Subject: um: do not send SIGALRM to userspace in time-travel mode We send a SIGALRM to userspace processes to interrupt them. Really, doing so is only needed if they are actually executing at the time (to ensure we return to kernelspace). Unfortunately, we do not have that information readily available. We can however be sure that this is never the case when we are in time-travel mode with infinite CPU. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250408074524.300153-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/time.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index 1394568c0210..ae0fa2173778 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -856,11 +856,16 @@ static struct clock_event_device timer_clockevent = { static irqreturn_t um_timer(int irq, void *dev) { - if (get_current()->mm != NULL) - { - /* userspace - relay signal, results in correct userspace timers */ + /* + * Interrupt the (possibly) running userspace process, technically this + * should only happen if userspace is currently executing. + * With infinite CPU time-travel, we can only get here when userspace + * is not executing. Do not notify there and avoid spurious scheduling. + */ + if (time_travel_mode != TT_MODE_INFCPU && + time_travel_mode != TT_MODE_EXTERNAL && + get_current()->mm) os_alarm_process(get_current()->mm->context.id.pid); - } (*timer_clockevent.event_handler)(&timer_clockevent); -- cgit v1.2.3 From 6767e8784cd2e8b386a62330ea6864949d983a3e Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 8 Apr 2025 09:45:24 +0200 Subject: um: use proper care when taking mmap lock during segfault Segfaults can occur at times where the mmap lock cannot be taken. If that happens the segfault handler may not be able to take the mmap lock. Fix the code to use the same approach as most other architectures. Unfortunately, this requires copying code from mm/memory.c and modifying it slightly as UML does not have exception tables. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250408074524.300153-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/trap.c | 129 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 117 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 39820bde49e6..8a18f4054793 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -17,6 +17,122 @@ #include #include +/* + * NOTE: UML does not have exception tables. As such, this is almost a copy + * of the code in mm/memory.c, only adjusting the logic to simply check whether + * we are coming from the kernel instead of doing an additional lookup in the + * exception table. + * We can do this simplification because we never get here if the exception was + * fixable. + */ +static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + if (likely(mmap_read_trylock(mm))) + return true; + + if (!is_user) + return false; + + return !mmap_read_lock_killable(mm); +} + +static inline bool mmap_upgrade_trylock(struct mm_struct *mm) +{ + /* + * We don't have this operation yet. + * + * It should be easy enough to do: it's basically a + * atomic_long_try_cmpxchg_acquire() + * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but + * it also needs the proper lockdep magic etc. + */ + return false; +} + +static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user) +{ + mmap_read_unlock(mm); + if (!is_user) + return false; + + return !mmap_write_lock_killable(mm); +} + +/* + * Helper for page fault handling. + * + * This is kind of equivalend to "mmap_read_lock()" followed + * by "find_extend_vma()", except it's a lot more careful about + * the locking (and will drop the lock on failure). + * + * For example, if we have a kernel bug that causes a page + * fault, we don't want to just use mmap_read_lock() to get + * the mm lock, because that would deadlock if the bug were + * to happen while we're holding the mm lock for writing. + * + * So this checks the exception tables on kernel faults in + * order to only do this all for instructions that are actually + * expected to fault. + * + * We can also actually take the mm lock for writing if we + * need to extend the vma, which helps the VM layer a lot. + */ +static struct vm_area_struct * +um_lock_mm_and_find_vma(struct mm_struct *mm, + unsigned long addr, bool is_user) +{ + struct vm_area_struct *vma; + + if (!get_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (likely(vma && (vma->vm_start <= addr))) + return vma; + + /* + * Well, dang. We might still be successful, but only + * if we can extend a vma to do so. + */ + if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) { + mmap_read_unlock(mm); + return NULL; + } + + /* + * We can try to upgrade the mmap lock atomically, + * in which case we can continue to use the vma + * we already looked up. + * + * Otherwise we'll have to drop the mmap lock and + * re-take it, and also look up the vma again, + * re-checking it. + */ + if (!mmap_upgrade_trylock(mm)) { + if (!upgrade_mmap_lock_carefully(mm, is_user)) + return NULL; + + vma = find_vma(mm, addr); + if (!vma) + goto fail; + if (vma->vm_start <= addr) + goto success; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto fail; + } + + if (expand_stack_locked(vma, addr)) + goto fail; + +success: + mmap_write_downgrade(mm); + return vma; + +fail: + mmap_write_unlock(mm); + return NULL; +} + /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by * segv(). @@ -43,21 +159,10 @@ int handle_page_fault(unsigned long address, unsigned long ip, if (is_user) flags |= FAULT_FLAG_USER; retry: - mmap_read_lock(mm); - vma = find_vma(mm, address); - if (!vma) - goto out; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - if (is_user && !ARCH_IS_STACKGROW(address)) - goto out; - vma = expand_stack(mm, address); + vma = um_lock_mm_and_find_vma(mm, address, is_user); if (!vma) goto out_nosemaphore; -good_area: *code_out = SEGV_ACCERR; if (is_write) { if (!(vma->vm_flags & VM_WRITE)) -- cgit v1.2.3 From a0e2cb6a90634f3dc80f16e882a683ee5761b0b0 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Sun, 13 Apr 2025 23:44:21 +0800 Subject: um: Add VFIO-based virtual PCI driver Implement a new virtual PCI driver based on the VFIO framework. This driver allows users to pass through PCI devices to UML via VFIO. Currently, only MSI-X capable devices are supported, and it is assumed that drivers will use MSI-X. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250413154421.517878-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/Kconfig | 8 + arch/um/drivers/Makefile | 2 + arch/um/drivers/vfio_kern.c | 642 ++++++++++++++++++++++++++++++++++++++++++++ arch/um/drivers/vfio_user.c | 327 ++++++++++++++++++++++ arch/um/drivers/vfio_user.h | 44 +++ 5 files changed, 1023 insertions(+) create mode 100644 arch/um/drivers/vfio_kern.c create mode 100644 arch/um/drivers/vfio_user.c create mode 100644 arch/um/drivers/vfio_user.h (limited to 'arch') diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 9cb196070614..d7bb447ff958 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -367,3 +367,11 @@ config UML_PCI_OVER_VIRTIO_DEVICE_ID There's no official device ID assigned (yet), set the one you wish to use for experimentation here. The default of -1 is not valid and will cause the driver to fail at probe. + +config UML_PCI_OVER_VFIO + bool "Enable VFIO-based PCI passthrough" + select UML_PCI + help + This driver provides support for VFIO-based PCI passthrough. + Currently, only MSI-X capable devices are supported, and it + is assumed that drivers will use MSI-X. diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 0a5820343ad3..336be56b8975 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -19,6 +19,7 @@ port-objs := port_kern.o port_user.o harddog-objs := harddog_kern.o harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o rtc-objs := rtc_kern.o rtc_user.o +vfio_uml-objs := vfio_kern.o vfio_user.o LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) @@ -62,6 +63,7 @@ obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o obj-$(CONFIG_UML_RTC) += rtc.o obj-$(CONFIG_UML_PCI) += virt-pci.o obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o +obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o # pcap_user.o must be added explicitly. USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c new file mode 100644 index 000000000000..b51fc9888ae1 --- /dev/null +++ b/arch/um/drivers/vfio_kern.c @@ -0,0 +1,642 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie + */ + +#define pr_fmt(fmt) "vfio-uml: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "virt-pci.h" +#include "vfio_user.h" + +#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev) + +struct uml_vfio_intr_ctx { + struct uml_vfio_device *dev; + int irq; +}; + +struct uml_vfio_device { + const char *name; + int group; + + struct um_pci_device pdev; + struct uml_vfio_user_device udev; + struct uml_vfio_intr_ctx *intr_ctx; + + int msix_cap; + int msix_bar; + int msix_offset; + int msix_size; + u32 *msix_data; + + struct list_head list; +}; + +struct uml_vfio_group { + int id; + int fd; + int users; + struct list_head list; +}; + +static struct { + int fd; + int users; +} uml_vfio_container = { .fd = -1 }; +static DEFINE_MUTEX(uml_vfio_container_mtx); + +static LIST_HEAD(uml_vfio_groups); +static DEFINE_MUTEX(uml_vfio_groups_mtx); + +static LIST_HEAD(uml_vfio_devices); + +static int uml_vfio_set_container(int group_fd) +{ + int err; + + guard(mutex)(¨_vfio_container_mtx); + + err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd); + if (err) + return err; + + uml_vfio_container.users++; + if (uml_vfio_container.users > 1) + return 0; + + err = uml_vfio_user_setup_iommu(uml_vfio_container.fd); + if (err) { + uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd); + uml_vfio_container.users--; + } + return err; +} + +static void uml_vfio_unset_container(int group_fd) +{ + guard(mutex)(¨_vfio_container_mtx); + + uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd); + uml_vfio_container.users--; +} + +static int uml_vfio_open_group(int group_id) +{ + struct uml_vfio_group *group; + int err; + + guard(mutex)(¨_vfio_groups_mtx); + + list_for_each_entry(group, ¨_vfio_groups, list) { + if (group->id == group_id) { + group->users++; + return group->fd; + } + } + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return -ENOMEM; + + group->fd = uml_vfio_user_open_group(group_id); + if (group->fd < 0) { + err = group->fd; + goto free_group; + } + + err = uml_vfio_set_container(group->fd); + if (err) + goto close_group; + + group->id = group_id; + group->users = 1; + + list_add(&group->list, ¨_vfio_groups); + + return group->fd; + +close_group: + os_close_file(group->fd); +free_group: + kfree(group); + return err; +} + +static int uml_vfio_release_group(int group_fd) +{ + struct uml_vfio_group *group; + + guard(mutex)(¨_vfio_groups_mtx); + + list_for_each_entry(group, ¨_vfio_groups, list) { + if (group->fd == group_fd) { + group->users--; + if (group->users == 0) { + uml_vfio_unset_container(group_fd); + os_close_file(group_fd); + list_del(&group->list); + kfree(group); + } + return 0; + } + } + + return -ENOENT; +} + +static irqreturn_t uml_vfio_interrupt(int unused, void *opaque) +{ + struct uml_vfio_intr_ctx *ctx = opaque; + struct uml_vfio_device *dev = ctx->dev; + int index = ctx - dev->intr_ctx; + int irqfd = dev->udev.irqfd[index]; + int irq = dev->msix_data[index]; + uint64_t v; + int r; + + do { + r = os_read_file(irqfd, &v, sizeof(v)); + if (r == sizeof(v)) + generic_handle_irq(irq); + } while (r == sizeof(v) || r == -EINTR); + WARN(r != -EAGAIN, "read returned %d\n", r); + + return IRQ_HANDLED; +} + +static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index) +{ + struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index]; + int err, irqfd; + + if (ctx->irq >= 0) + return 0; + + irqfd = uml_vfio_user_activate_irq(&dev->udev, index); + if (irqfd < 0) + return irqfd; + + ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ, + uml_vfio_interrupt, 0, + "vfio-uml", ctx); + if (ctx->irq < 0) { + err = ctx->irq; + goto deactivate; + } + + err = add_sigio_fd(irqfd); + if (err) + goto free_irq; + + return 0; + +free_irq: + um_free_irq(ctx->irq, ctx); + ctx->irq = -1; +deactivate: + uml_vfio_user_deactivate_irq(&dev->udev, index); + return err; +} + +static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index) +{ + struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index]; + + if (ctx->irq >= 0) { + ignore_sigio_fd(dev->udev.irqfd[index]); + um_free_irq(ctx->irq, ctx); + uml_vfio_user_deactivate_irq(&dev->udev, index); + ctx->irq = -1; + } + return 0; +} + +static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + /* + * Here, we handle only the operations we care about, + * ignoring the rest. + */ + if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) { + switch (val & ~PCI_MSIX_FLAGS_QSIZE) { + case PCI_MSIX_FLAGS_ENABLE: + case 0: + return uml_vfio_user_update_irqs(&dev->udev); + } + } + return 0; +} + +static int uml_vfio_update_msix_table(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + int index; + + /* + * Here, we handle only the operations we care about, + * ignoring the rest. + */ + offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA; + + if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0) + return 0; + + index = offset / PCI_MSIX_ENTRY_SIZE; + if (index >= dev->udev.irq_count) + return -EINVAL; + + dev->msix_data[index] = val; + + return val ? uml_vfio_activate_irq(dev, index) : + uml_vfio_deactivate_irq(dev, index); +} + +static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev, + unsigned int offset, int size) +{ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + return __uml_vfio_cfgspace_read(dev, offset, size); +} + +static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev, + unsigned int offset, int size, + unsigned long val) +{ + u8 data[8]; + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size)); +} + +static void uml_vfio_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF && + offset + size > dev->msix_cap) + WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val)); + + __uml_vfio_cfgspace_write(dev, offset, size, val); +} + +static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar, + void *buffer, unsigned int offset, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + memset(buffer, 0xff, size); + uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size); +} + +static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + u8 data[8]; + + uml_vfio_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar, + unsigned int offset, const void *buffer, + int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + + uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size); +} + +static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + u8 data[8]; + + if (bar == dev->msix_bar && offset + size > dev->msix_offset && + offset < dev->msix_offset + dev->msix_size) + WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val)); + + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + uml_vfio_bar_copy_to(pdev, bar, offset, data, size); +} + +static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct uml_vfio_device *dev = to_vdev(pdev); + int i; + + for (i = 0; i < size; i++) + uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1); +} + +static const struct um_pci_ops uml_vfio_um_pci_ops = { + .cfgspace_read = uml_vfio_cfgspace_read, + .cfgspace_write = uml_vfio_cfgspace_write, + .bar_read = uml_vfio_bar_read, + .bar_write = uml_vfio_bar_write, + .bar_copy_from = uml_vfio_bar_copy_from, + .bar_copy_to = uml_vfio_bar_copy_to, + .bar_set = uml_vfio_bar_set, +}; + +static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap) +{ + u8 id, pos; + u16 ent; + int ttl = 48; /* PCI_FIND_CAP_TTL */ + + pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos)); + + while (pos && ttl--) { + ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent)); + + id = ent & 0xff; + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = ent >> 8; + } + + return 0; +} + +static int uml_vfio_read_msix_table(struct uml_vfio_device *dev) +{ + unsigned int off; + u16 flags; + u32 tbl; + + off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX); + if (!off) + return -ENOTSUPP; + + dev->msix_cap = off; + + tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl)); + flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags)); + + dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR; + dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET; + dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE; + + dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL); + if (!dev->msix_data) + return -ENOMEM; + + return 0; +} + +static void uml_vfio_open_device(struct uml_vfio_device *dev) +{ + struct uml_vfio_intr_ctx *ctx; + int err, group_id, i; + + group_id = uml_vfio_user_get_group_id(dev->name); + if (group_id < 0) { + pr_err("Failed to get group id (%s), error %d\n", + dev->name, group_id); + goto free_dev; + } + + dev->group = uml_vfio_open_group(group_id); + if (dev->group < 0) { + pr_err("Failed to open group %d (%s), error %d\n", + group_id, dev->name, dev->group); + goto free_dev; + } + + err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name); + if (err) { + pr_err("Failed to setup device (%s), error %d\n", + dev->name, err); + goto release_group; + } + + err = uml_vfio_read_msix_table(dev); + if (err) { + pr_err("Failed to read MSI-X table (%s), error %d\n", + dev->name, err); + goto teardown_udev; + } + + dev->intr_ctx = kmalloc_array(dev->udev.irq_count, + sizeof(struct uml_vfio_intr_ctx), + GFP_KERNEL); + if (!dev->intr_ctx) { + pr_err("Failed to allocate interrupt context (%s)\n", + dev->name); + goto free_msix; + } + + for (i = 0; i < dev->udev.irq_count; i++) { + ctx = &dev->intr_ctx[i]; + ctx->dev = dev; + ctx->irq = -1; + } + + dev->pdev.ops = ¨_vfio_um_pci_ops; + + err = um_pci_device_register(&dev->pdev); + if (err) { + pr_err("Failed to register UM PCI device (%s), error %d\n", + dev->name, err); + goto free_intr_ctx; + } + + return; + +free_intr_ctx: + kfree(dev->intr_ctx); +free_msix: + kfree(dev->msix_data); +teardown_udev: + uml_vfio_user_teardown_device(&dev->udev); +release_group: + uml_vfio_release_group(dev->group); +free_dev: + list_del(&dev->list); + kfree(dev->name); + kfree(dev); +} + +static void uml_vfio_release_device(struct uml_vfio_device *dev) +{ + int i; + + for (i = 0; i < dev->udev.irq_count; i++) + uml_vfio_deactivate_irq(dev, i); + uml_vfio_user_update_irqs(&dev->udev); + + um_pci_device_unregister(&dev->pdev); + kfree(dev->intr_ctx); + kfree(dev->msix_data); + uml_vfio_user_teardown_device(&dev->udev); + uml_vfio_release_group(dev->group); + list_del(&dev->list); + kfree(dev->name); + kfree(dev); +} + +static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp) +{ + struct uml_vfio_device *dev; + int fd; + + if (uml_vfio_container.fd < 0) { + fd = uml_vfio_user_open_container(); + if (fd < 0) + return fd; + uml_vfio_container.fd = fd; + } + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->name = kstrdup(device, GFP_KERNEL); + if (!dev->name) { + kfree(dev); + return -ENOMEM; + } + + list_add_tail(&dev->list, ¨_vfio_devices); + return 0; +} + +static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp) +{ + return 0; +} + +static const struct kernel_param_ops uml_vfio_cmdline_param_ops = { + .set = uml_vfio_cmdline_set, + .get = uml_vfio_cmdline_get, +}; + +device_param_cb(device, ¨_vfio_cmdline_param_ops, NULL, 0400); +__uml_help(uml_vfio_cmdline_param_ops, +"vfio_uml.device=\n" +" Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n" +" capable devices are supported, and it is assumed that drivers will\n" +" use MSI-X. This parameter can be specified multiple times to pass\n" +" through multiple PCI devices to UML.\n\n" +); + +static int __init uml_vfio_init(void) +{ + struct uml_vfio_device *dev, *n; + + sigio_broken(); + + /* If the opening fails, the device will be released. */ + list_for_each_entry_safe(dev, n, ¨_vfio_devices, list) + uml_vfio_open_device(dev); + + return 0; +} +late_initcall(uml_vfio_init); + +static void __exit uml_vfio_exit(void) +{ + struct uml_vfio_device *dev, *n; + + list_for_each_entry_safe(dev, n, ¨_vfio_devices, list) + uml_vfio_release_device(dev); + + if (uml_vfio_container.fd >= 0) + os_close_file(uml_vfio_container.fd); +} +module_exit(uml_vfio_exit); diff --git a/arch/um/drivers/vfio_user.c b/arch/um/drivers/vfio_user.c new file mode 100644 index 000000000000..6a45d8e14582 --- /dev/null +++ b/arch/um/drivers/vfio_user.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Ant Group + * Author: Tiwei Bie + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfio_user.h" + +int uml_vfio_user_open_container(void) +{ + int r, fd; + + fd = open("/dev/vfio/vfio", O_RDWR); + if (fd < 0) + return -errno; + + r = ioctl(fd, VFIO_GET_API_VERSION); + if (r != VFIO_API_VERSION) { + r = r < 0 ? -errno : -EINVAL; + goto error; + } + + r = ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU); + if (r <= 0) { + r = r < 0 ? -errno : -EINVAL; + goto error; + } + + return fd; + +error: + close(fd); + return r; +} + +int uml_vfio_user_setup_iommu(int container) +{ + /* + * This is a bit tricky. See the big comment in + * vhost_user_set_mem_table() in virtio_uml.c. + */ + unsigned long reserved = uml_reserved - uml_physmem; + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = uml_reserved, + .iova = reserved, + .size = physmem_size - reserved, + }; + + if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0) + return -errno; + + if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map) < 0) + return -errno; + + return 0; +} + +int uml_vfio_user_get_group_id(const char *device) +{ + char *path, *buf, *end; + const char *name; + int r; + + path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); + if (!path) + return -ENOMEM; + + sprintf(path, "/sys/bus/pci/devices/%s/iommu_group", device); + + buf = uml_kmalloc(PATH_MAX + 1, UM_GFP_KERNEL); + if (!buf) { + r = -ENOMEM; + goto free_path; + } + + r = readlink(path, buf, PATH_MAX); + if (r < 0) { + r = -errno; + goto free_buf; + } + buf[r] = '\0'; + + name = basename(buf); + + r = strtoul(name, &end, 10); + if (*end != '\0' || end == name) { + r = -EINVAL; + goto free_buf; + } + +free_buf: + kfree(buf); +free_path: + kfree(path); + return r; +} + +int uml_vfio_user_open_group(int group_id) +{ + char *path; + int fd; + + path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL); + if (!path) + return -ENOMEM; + + sprintf(path, "/dev/vfio/%d", group_id); + + fd = open(path, O_RDWR); + if (fd < 0) { + fd = -errno; + goto out; + } + +out: + kfree(path); + return fd; +} + +int uml_vfio_user_set_container(int container, int group) +{ + if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) < 0) + return -errno; + return 0; +} + +int uml_vfio_user_unset_container(int container, int group) +{ + if (ioctl(group, VFIO_GROUP_UNSET_CONTAINER, &container) < 0) + return -errno; + return 0; +} + +static int vfio_set_irqs(int device, int start, int count, int *irqfd) +{ + struct vfio_irq_set *irq_set; + int argsz = sizeof(*irq_set) + sizeof(*irqfd) * count; + int err = 0; + + irq_set = uml_kmalloc(argsz, UM_GFP_KERNEL); + if (!irq_set) + return -ENOMEM; + + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = start; + irq_set->count = count; + memcpy(irq_set->data, irqfd, sizeof(*irqfd) * count); + + if (ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set) < 0) { + err = -errno; + goto out; + } + +out: + kfree(irq_set); + return err; +} + +int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, + int group, const char *device) +{ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + int err, i; + + dev->device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, device); + if (dev->device < 0) + return -errno; + + if (ioctl(dev->device, VFIO_DEVICE_GET_INFO, &device_info) < 0) { + err = -errno; + goto close_device; + } + + dev->num_regions = device_info.num_regions; + if (dev->num_regions > VFIO_PCI_CONFIG_REGION_INDEX + 1) + dev->num_regions = VFIO_PCI_CONFIG_REGION_INDEX + 1; + + dev->region = uml_kmalloc(sizeof(*dev->region) * dev->num_regions, + UM_GFP_KERNEL); + if (!dev->region) { + err = -ENOMEM; + goto close_device; + } + + for (i = 0; i < dev->num_regions; i++) { + struct vfio_region_info region = { + .argsz = sizeof(region), + .index = i, + }; + if (ioctl(dev->device, VFIO_DEVICE_GET_REGION_INFO, ®ion) < 0) { + err = -errno; + goto free_region; + } + dev->region[i].size = region.size; + dev->region[i].offset = region.offset; + } + + /* Only MSI-X is supported currently. */ + irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX; + if (ioctl(dev->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) { + err = -errno; + goto free_region; + } + + dev->irq_count = irq_info.count; + + dev->irqfd = uml_kmalloc(sizeof(int) * dev->irq_count, UM_GFP_KERNEL); + if (!dev->irqfd) { + err = -ENOMEM; + goto free_region; + } + + memset(dev->irqfd, -1, sizeof(int) * dev->irq_count); + + err = vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd); + if (err) + goto free_irqfd; + + return 0; + +free_irqfd: + kfree(dev->irqfd); +free_region: + kfree(dev->region); +close_device: + close(dev->device); + return err; +} + +void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev) +{ + kfree(dev->irqfd); + kfree(dev->region); + close(dev->device); +} + +int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index) +{ + int irqfd; + + irqfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (irqfd < 0) + return -errno; + + dev->irqfd[index] = irqfd; + return irqfd; +} + +void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index) +{ + close(dev->irqfd[index]); + dev->irqfd[index] = -1; +} + +int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev) +{ + return vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd); +} + +static int vfio_region_read(struct uml_vfio_user_device *dev, unsigned int index, + uint64_t offset, void *buf, uint64_t size) +{ + if (index >= dev->num_regions || offset + size > dev->region[index].size) + return -EINVAL; + + if (pread(dev->device, buf, size, dev->region[index].offset + offset) < 0) + return -errno; + + return 0; +} + +static int vfio_region_write(struct uml_vfio_user_device *dev, unsigned int index, + uint64_t offset, const void *buf, uint64_t size) +{ + if (index >= dev->num_regions || offset + size > dev->region[index].size) + return -EINVAL; + + if (pwrite(dev->device, buf, size, dev->region[index].offset + offset) < 0) + return -errno; + + return 0; +} + +int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, + unsigned int offset, void *buf, int size) +{ + return vfio_region_read(dev, VFIO_PCI_CONFIG_REGION_INDEX, + offset, buf, size); +} + +int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, + unsigned int offset, const void *buf, int size) +{ + return vfio_region_write(dev, VFIO_PCI_CONFIG_REGION_INDEX, + offset, buf, size); +} + +int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, void *buf, int size) +{ + return vfio_region_read(dev, bar, offset, buf, size); +} + +int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, const void *buf, int size) +{ + return vfio_region_write(dev, bar, offset, buf, size); +} diff --git a/arch/um/drivers/vfio_user.h b/arch/um/drivers/vfio_user.h new file mode 100644 index 000000000000..75535e05059b --- /dev/null +++ b/arch/um/drivers/vfio_user.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VFIO_USER_H +#define __UM_VFIO_USER_H + +struct uml_vfio_user_device { + int device; + + struct { + uint64_t size; + uint64_t offset; + } *region; + int num_regions; + + int32_t *irqfd; + int irq_count; +}; + +int uml_vfio_user_open_container(void); +int uml_vfio_user_setup_iommu(int container); + +int uml_vfio_user_get_group_id(const char *device); +int uml_vfio_user_open_group(int group_id); +int uml_vfio_user_set_container(int container, int group); +int uml_vfio_user_unset_container(int container, int group); + +int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev, + int group, const char *device); +void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev); + +int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index); +void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index); +int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev); + +int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev, + unsigned int offset, void *buf, int size); +int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev, + unsigned int offset, const void *buf, int size); + +int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, void *buf, int size); +int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar, + unsigned int offset, const void *buf, int size); + +#endif /* __UM_VFIO_USER_H */ -- cgit v1.2.3 From 7633b8b1e793e3441448c12c271fbecc53397fa9 Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Tue, 15 Apr 2025 12:47:13 +0200 Subject: irqdomain: um: use irq_domain_create_linear() helper um_pci_init() open-codes what the irq_domain_create_linear() helper does already. Use the helper instead of open-coding it. This needs retval checking modification. Signed-off-by: Jiri Slaby (SUSE) Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: linux-um@lists.infradead.org Link: https://patch.msgid.link/20250415104713.106819-1-jirislaby@kernel.org Signed-off-by: Johannes Berg --- arch/um/drivers/virt-pci.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index b83b5a765d4e..0fe207ca4b72 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -538,11 +538,6 @@ void um_pci_platform_device_unregister(struct um_pci_device *dev) static int __init um_pci_init(void) { - struct irq_domain_info inner_domain_info = { - .size = MAX_MSI_VECTORS, - .hwirq_max = MAX_MSI_VECTORS, - .ops = &um_pci_inner_domain_ops, - }; int err, i; WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource, @@ -564,10 +559,10 @@ static int __init um_pci_init(void) goto free; } - inner_domain_info.fwnode = um_pci_fwnode; - um_pci_inner_domain = irq_domain_instantiate(&inner_domain_info); - if (IS_ERR(um_pci_inner_domain)) { - err = PTR_ERR(um_pci_inner_domain); + um_pci_inner_domain = irq_domain_create_linear(um_pci_fwnode, MAX_MSI_VECTORS, + &um_pci_inner_domain_ops, NULL); + if (!um_pci_inner_domain) { + err = -ENOMEM; goto free; } @@ -602,7 +597,7 @@ static int __init um_pci_init(void) return 0; free: - if (!IS_ERR_OR_NULL(um_pci_inner_domain)) + if (um_pci_inner_domain) irq_domain_remove(um_pci_inner_domain); if (um_pci_fwnode) irq_domain_free_fwnode(um_pci_fwnode); -- cgit v1.2.3 From 9c88156b2c81dc317297494e879f814e6574330a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 18 Apr 2025 10:33:58 +0200 Subject: um/asm: Rename rep_nop() to native_pause() Rename rep_nop() function to what it really does. No functional change intended. Signed-off-by: Uros Bizjak Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: David Laight Link: https://patch.msgid.link/20250418083436.133148-1-ubizjak@gmail.com Signed-off-by: Johannes Berg --- arch/x86/um/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 478710384b34..d50549e0089c 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -22,7 +22,7 @@ #include /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static __always_inline void rep_nop(void) +static __always_inline void native_pause(void) { __asm__ __volatile__("rep;nop": : :"memory"); } @@ -33,7 +33,7 @@ static __always_inline void cpu_relax(void) time_travel_mode == TT_MODE_EXTERNAL) time_travel_ndelay(1); else - rep_nop(); + native_pause(); } #define task_pt_regs(t) (&(t)->thread.regs) -- cgit v1.2.3 From 304c9f7f8f439083c56846c4433fbab7467eb01e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 18 Apr 2025 10:33:59 +0200 Subject: um/asm: Replace "REP; NOP" with PAUSE mnemonic Current minimum required version of binutils is 2.25, which supports PAUSE instruction mnemonic. Replace "REP; NOP" with this proper mnemonic. No functional change intended. Signed-off-by: Uros Bizjak Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: David Laight Link: https://patch.msgid.link/20250418083436.133148-2-ubizjak@gmail.com Signed-off-by: Johannes Berg --- arch/x86/um/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index d50549e0089c..e222d2ae28fd 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -21,10 +21,10 @@ #include -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +/* PAUSE is a good thing to insert into busy-wait loops. */ static __always_inline void native_pause(void) { - __asm__ __volatile__("rep;nop": : :"memory"); + __asm__ __volatile__("pause": : :"memory"); } static __always_inline void cpu_relax(void) -- cgit v1.2.3 From 65eaac591b752042006d3a79c0cfba47e2a9aaac Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Sat, 3 May 2025 13:17:08 +0800 Subject: um: Remove obsolete legacy network transports These legacy network transports were marked as obsolete in commit 40814b98a570 ("um: Mark non-vector net transports as obsolete"). More than five years have passed since then. Remove these network transports to reduce the maintenance burden. Suggested-by: Anton Ivanov Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20250503051710.3286595-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/configs/i386_defconfig | 6 - arch/um/configs/x86_64_defconfig | 6 - arch/um/drivers/Kconfig | 170 --------------------- arch/um/drivers/Makefile | 18 +-- arch/um/drivers/daemon.h | 29 ---- arch/um/drivers/daemon_kern.c | 95 ------------ arch/um/drivers/daemon_user.c | 194 ------------------------ arch/um/drivers/slip.h | 21 --- arch/um/drivers/slip_common.c | 55 ------- arch/um/drivers/slip_common.h | 106 ------------- arch/um/drivers/slip_kern.c | 93 ------------ arch/um/drivers/slip_user.c | 252 ------------------------------- arch/um/drivers/slirp.h | 34 ----- arch/um/drivers/slirp_kern.c | 120 --------------- arch/um/drivers/slirp_user.c | 124 --------------- arch/um/drivers/umcast.h | 27 ---- arch/um/drivers/umcast_kern.c | 188 ----------------------- arch/um/drivers/umcast_user.c | 184 ---------------------- arch/um/drivers/vde.h | 32 ---- arch/um/drivers/vde_kern.c | 129 ---------------- arch/um/drivers/vde_user.c | 125 --------------- arch/um/include/shared/os.h | 1 - arch/um/os-Linux/Makefile | 2 +- arch/um/os-Linux/drivers/Makefile | 13 -- arch/um/os-Linux/drivers/etap.h | 21 --- arch/um/os-Linux/drivers/ethertap_kern.c | 100 ------------ arch/um/os-Linux/drivers/ethertap_user.c | 248 ------------------------------ arch/um/os-Linux/drivers/tuntap.h | 21 --- arch/um/os-Linux/drivers/tuntap_kern.c | 86 ----------- arch/um/os-Linux/drivers/tuntap_user.c | 215 -------------------------- arch/um/os-Linux/file.c | 15 -- 31 files changed, 2 insertions(+), 2728 deletions(-) delete mode 100644 arch/um/drivers/daemon.h delete mode 100644 arch/um/drivers/daemon_kern.c delete mode 100644 arch/um/drivers/daemon_user.c delete mode 100644 arch/um/drivers/slip.h delete mode 100644 arch/um/drivers/slip_common.c delete mode 100644 arch/um/drivers/slip_common.h delete mode 100644 arch/um/drivers/slip_kern.c delete mode 100644 arch/um/drivers/slip_user.c delete mode 100644 arch/um/drivers/slirp.h delete mode 100644 arch/um/drivers/slirp_kern.c delete mode 100644 arch/um/drivers/slirp_user.c delete mode 100644 arch/um/drivers/umcast.h delete mode 100644 arch/um/drivers/umcast_kern.c delete mode 100644 arch/um/drivers/umcast_user.c delete mode 100644 arch/um/drivers/vde.h delete mode 100644 arch/um/drivers/vde_kern.c delete mode 100644 arch/um/drivers/vde_user.c delete mode 100644 arch/um/os-Linux/drivers/Makefile delete mode 100644 arch/um/os-Linux/drivers/etap.h delete mode 100644 arch/um/os-Linux/drivers/ethertap_kern.c delete mode 100644 arch/um/os-Linux/drivers/ethertap_user.c delete mode 100644 arch/um/os-Linux/drivers/tuntap.h delete mode 100644 arch/um/os-Linux/drivers/tuntap_kern.c delete mode 100644 arch/um/os-Linux/drivers/tuntap_user.c (limited to 'arch') diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index 1ffa088739f4..7c8999f18f2d 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -53,12 +53,6 @@ CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_UML_NET=y -CONFIG_UML_NET_ETHERTAP=y -CONFIG_UML_NET_TUNTAP=y -CONFIG_UML_NET_SLIP=y -CONFIG_UML_NET_DAEMON=y -CONFIG_UML_NET_MCAST=y -CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 03b10d3f6816..9858d989777e 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -52,12 +52,6 @@ CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set CONFIG_UML_NET=y -CONFIG_UML_NET_ETHERTAP=y -CONFIG_UML_NET_TUNTAP=y -CONFIG_UML_NET_SLIP=y -CONFIG_UML_NET_DAEMON=y -CONFIG_UML_NET_MCAST=y -CONFIG_UML_NET_SLIRP=y CONFIG_EXT4_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index d7bb447ff958..bc68c3e341f5 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -143,99 +143,6 @@ config UML_NET enable at least one of the following transport options to actually make use of UML networking. -config UML_NET_ETHERTAP - bool "Ethertap transport (obsolete)" - depends on UML_NET - help - The Ethertap User-Mode Linux network transport allows a single - running UML to exchange packets with its host over one of the - host's Ethertap devices, such as /dev/tap0. Additional running - UMLs can use additional Ethertap devices, one per running UML. - While the UML believes it's on a (multi-device, broadcast) virtual - Ethernet network, it's in fact communicating over a point-to-point - link with the host. - - To use this, your host kernel must have support for Ethertap - devices. Also, if your host kernel is 2.4.x, it must have - CONFIG_NETLINK_DEV configured as Y or M. - - For more information, see - That site - has examples of the UML command line to use to enable Ethertap - networking. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_TUNTAP - bool "TUN/TAP transport (obsolete)" - depends on UML_NET - help - The UML TUN/TAP network transport allows a UML instance to exchange - packets with the host over a TUN/TAP device. This option will only - work with a 2.4 host, unless you've applied the TUN/TAP patch to - your 2.2 host kernel. - - To use this transport, your host kernel must have support for TUN/TAP - devices, either built-in or as a module. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_SLIP - bool "SLIP transport (obsolete)" - depends on UML_NET - help - The slip User-Mode Linux network transport allows a running UML to - network with its host over a point-to-point link. Unlike Ethertap, - which can carry any Ethernet frame (and hence even non-IP packets), - the slip transport can only carry IP packets. - - To use this, your host must support slip devices. - - For more information, see - . - has examples of the UML command line to use to enable slip - networking, and details of a few quirks with it. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_DAEMON - bool "Daemon transport (obsolete)" - depends on UML_NET - help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other, but not to - the host. - - To use this form of networking, you'll need to run the UML - networking daemon on the host. - - For more information, see - That site - has examples of the UML command line to use to enable Daemon - networking. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_DAEMON_DEFAULT_SOCK - string "Default socket for daemon transport" - default "/tmp/uml.ctl" - depends on UML_NET_DAEMON - help - This option allows setting the default socket for the daemon - transport, normally it defaults to /tmp/uml.ctl. - config UML_NET_VECTOR bool "Vector I/O high performance network devices" depends on UML_NET @@ -248,83 +155,6 @@ config UML_NET_VECTOR with up to 4 times higher network throughput than the UML network drivers. -config UML_NET_VDE - bool "VDE transport (obsolete)" - depends on UML_NET - depends on !MODVERSIONS - select MAY_HAVE_RUNTIME_DEPS - help - This User-Mode Linux network transport allows one or more running - UMLs on a single host to communicate with each other and also - with the rest of the world using Virtual Distributed Ethernet, - an improved fork of uml_switch. - - You must have libvdeplug installed in order to build the vde - transport into UML. - - To use this form of networking, you will need to run vde_switch - on the host. - - For more information, see - That site has a good overview of what VDE is and also examples - of the UML command line to use to enable VDE networking. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_MCAST - bool "Multicast transport (obsolete)" - depends on UML_NET - help - This Multicast User-Mode Linux network transport allows multiple - UMLs (even ones running on different host machines!) to talk to - each other over a virtual ethernet network. However, it requires - at least one UML with one of the other transports to act as a - bridge if any of them need to be able to talk to their hosts or any - other IP machines. - - To use this, your host kernel(s) must support IP Multicasting. - - For more information, see - That site - has examples of the UML command line to use to enable Multicast - networking, and notes about the security of this approach. - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - -config UML_NET_SLIRP - bool "SLiRP transport (obsolete)" - depends on UML_NET - help - The SLiRP User-Mode Linux network transport allows a running UML - to network by invoking a program that can handle SLIP encapsulated - packets. This is commonly (but not limited to) the application - known as SLiRP, a program that can re-socket IP packets back onto - he host on which it is run. Only IP packets are supported, - unlike other network transports that can handle all Ethernet - frames. In general, slirp allows the UML the same IP connectivity - to the outside world that the host user is permitted, and unlike - other transports, SLiRP works without the need of root level - privileges, setuid binaries, or SLIP devices on the host. This - also means not every type of connection is possible, but most - situations can be accommodated with carefully crafted slirp - commands that can be passed along as part of the network device's - setup string. The effect of this transport on the UML is similar - that of a host behind a firewall that masquerades all network - connections passing through it (but is less secure). - - NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please - migrate to UML_NET_VECTOR. - - If unsure, say N. - - Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp" - endmenu config VIRTIO_UML diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 336be56b8975..6dfc2e9c0ce8 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -6,11 +6,7 @@ # pcap is broken in 2.5 because kbuild doesn't allow pcap.a to be linked # in to pcap.o -slip-objs := slip_kern.o slip_user.o -slirp-objs := slirp_kern.o slirp_user.o -daemon-objs := daemon_kern.o daemon_user.o vector-objs := vector_kern.o vector_user.o vector_transports.o -umcast-objs := umcast_kern.o umcast_user.o net-objs := net_kern.o net_user.o mconsole-objs := mconsole_kern.o mconsole_user.o hostaudio-objs := hostaudio_kern.o @@ -21,13 +17,6 @@ harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o rtc-objs := rtc_kern.o rtc_user.o vfio_uml-objs := vfio_kern.o vfio_user.o -LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a) - -targets := vde_kern.o vde_user.o - -$(obj)/vde.o: $(obj)/vde_kern.o $(obj)/vde_user.o - $(LD) -r -dp -o $@ $^ $(ld_flags) - #XXX: The call below does not work because the flags are added before the # object name, so nothing from the library gets linked. #$(call if_changed,ld) @@ -39,12 +28,7 @@ obj-y := stdio_console.o fd.o chan_kern.o chan_user.o line.o obj-$(CONFIG_SSL) += ssl.o obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o -obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o -obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o -obj-$(CONFIG_UML_NET_DAEMON) += daemon.o obj-$(CONFIG_UML_NET_VECTOR) += vector.o -obj-$(CONFIG_UML_NET_VDE) += vde.o -obj-$(CONFIG_UML_NET_MCAST) += umcast.o obj-$(CONFIG_UML_NET) += net.o obj-$(CONFIG_MCONSOLE) += mconsole.o obj-$(CONFIG_MMAPPER) += mmapper_kern.o @@ -66,7 +50,7 @@ obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o # pcap_user.o must be added explicitly. -USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o +USER_OBJS := fd.o null.o pty.o tty.o xterm.o vector_user.o CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH) CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"' diff --git a/arch/um/drivers/daemon.h b/arch/um/drivers/daemon.h deleted file mode 100644 index 1509cc7eb907..000000000000 --- a/arch/um/drivers/daemon.h +++ /dev/null @@ -1,29 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __DAEMON_H__ -#define __DAEMON_H__ - -#include - -#define SWITCH_VERSION 3 - -struct daemon_data { - char *sock_type; - char *ctl_sock; - void *ctl_addr; - void *data_addr; - void *local_addr; - int fd; - int control; - void *dev; -}; - -extern const struct net_user_info daemon_user_info; - -extern int daemon_user_write(int fd, void *buf, int len, - struct daemon_data *pri); - -#endif diff --git a/arch/um/drivers/daemon_kern.c b/arch/um/drivers/daemon_kern.c deleted file mode 100644 index afde1e82c056..000000000000 --- a/arch/um/drivers/daemon_kern.c +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include -#include -#include -#include "daemon.h" - -struct daemon_init { - char *sock_type; - char *ctl_sock; -}; - -static void daemon_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct daemon_data *dpri; - struct daemon_init *init = data; - - pri = netdev_priv(dev); - dpri = (struct daemon_data *) pri->user; - dpri->sock_type = init->sock_type; - dpri->ctl_sock = init->ctl_sock; - dpri->fd = -1; - dpri->control = -1; - dpri->dev = dev; - /* We will free this pointer. If it contains crap we're burned. */ - dpri->ctl_addr = NULL; - dpri->data_addr = NULL; - dpri->local_addr = NULL; - - printk("daemon backend (uml_switch version %d) - %s:%s", - SWITCH_VERSION, dpri->sock_type, dpri->ctl_sock); - printk("\n"); -} - -static int daemon_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int daemon_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return daemon_user_write(fd, skb->data, skb->len, - (struct daemon_data *) &lp->user); -} - -static const struct net_kern_info daemon_kern_info = { - .init = daemon_init, - .protocol = eth_protocol, - .read = daemon_read, - .write = daemon_write, -}; - -static int daemon_setup(char *str, char **mac_out, void *data) -{ - struct daemon_init *init = data; - char *remain; - - *init = ((struct daemon_init) - { .sock_type = "unix", - .ctl_sock = CONFIG_UML_NET_DAEMON_DEFAULT_SOCK }); - - remain = split_if_spec(str, mac_out, &init->sock_type, &init->ctl_sock, - NULL); - if (remain != NULL) - printk(KERN_WARNING "daemon_setup : Ignoring data socket " - "specification\n"); - - return 1; -} - -static struct transport daemon_transport = { - .list = LIST_HEAD_INIT(daemon_transport.list), - .name = "daemon", - .setup = daemon_setup, - .user = &daemon_user_info, - .kern = &daemon_kern_info, - .private_size = sizeof(struct daemon_data), - .setup_size = sizeof(struct daemon_init), -}; - -static int register_daemon(void) -{ - register_transport(&daemon_transport); - return 0; -} - -late_initcall(register_daemon); diff --git a/arch/um/drivers/daemon_user.c b/arch/um/drivers/daemon_user.c deleted file mode 100644 index 785baedc3555..000000000000 --- a/arch/um/drivers/daemon_user.c +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "daemon.h" -#include -#include -#include - -enum request_type { REQ_NEW_CONTROL }; - -#define SWITCH_MAGIC 0xfeedface - -struct request_v3 { - uint32_t magic; - uint32_t version; - enum request_type type; - struct sockaddr_un sock; -}; - -static struct sockaddr_un *new_addr(void *name, int len) -{ - struct sockaddr_un *sun; - - sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); - if (sun == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un " - "failed\n"); - return NULL; - } - sun->sun_family = AF_UNIX; - memcpy(sun->sun_path, name, len); - return sun; -} - -static int connect_to_switch(struct daemon_data *pri) -{ - struct sockaddr_un *ctl_addr = pri->ctl_addr; - struct sockaddr_un *local_addr = pri->local_addr; - struct sockaddr_un *sun; - struct request_v3 req; - int fd, n, err; - - pri->control = socket(AF_UNIX, SOCK_STREAM, 0); - if (pri->control < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : control socket failed, " - "errno = %d\n", -err); - return err; - } - - if (connect(pri->control, (struct sockaddr *) ctl_addr, - sizeof(*ctl_addr)) < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : control connect failed, " - "errno = %d\n", -err); - goto out; - } - - fd = socket(AF_UNIX, SOCK_DGRAM, 0); - if (fd < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : data socket failed, " - "errno = %d\n", -err); - goto out; - } - if (bind(fd, (struct sockaddr *) local_addr, sizeof(*local_addr)) < 0) { - err = -errno; - printk(UM_KERN_ERR "daemon_open : data bind failed, " - "errno = %d\n", -err); - goto out_close; - } - - sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL); - if (sun == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un " - "failed\n"); - err = -ENOMEM; - goto out_close; - } - - req.magic = SWITCH_MAGIC; - req.version = SWITCH_VERSION; - req.type = REQ_NEW_CONTROL; - req.sock = *local_addr; - n = write(pri->control, &req, sizeof(req)); - if (n != sizeof(req)) { - printk(UM_KERN_ERR "daemon_open : control setup request " - "failed, err = %d\n", -errno); - err = -ENOTCONN; - goto out_free; - } - - n = read(pri->control, sun, sizeof(*sun)); - if (n != sizeof(*sun)) { - printk(UM_KERN_ERR "daemon_open : read of data socket failed, " - "err = %d\n", -errno); - err = -ENOTCONN; - goto out_free; - } - - pri->data_addr = sun; - return fd; - - out_free: - kfree(sun); - out_close: - close(fd); - out: - close(pri->control); - return err; -} - -static int daemon_user_init(void *data, void *dev) -{ - struct daemon_data *pri = data; - struct timeval tv; - struct { - char zero; - int pid; - int usecs; - } name; - - if (!strcmp(pri->sock_type, "unix")) - pri->ctl_addr = new_addr(pri->ctl_sock, - strlen(pri->ctl_sock) + 1); - name.zero = 0; - name.pid = os_getpid(); - gettimeofday(&tv, NULL); - name.usecs = tv.tv_usec; - pri->local_addr = new_addr(&name, sizeof(name)); - pri->dev = dev; - pri->fd = connect_to_switch(pri); - if (pri->fd < 0) { - kfree(pri->local_addr); - pri->local_addr = NULL; - return pri->fd; - } - - return 0; -} - -static int daemon_open(void *data) -{ - struct daemon_data *pri = data; - return pri->fd; -} - -static void daemon_remove(void *data) -{ - struct daemon_data *pri = data; - - close(pri->fd); - pri->fd = -1; - close(pri->control); - pri->control = -1; - - kfree(pri->data_addr); - pri->data_addr = NULL; - kfree(pri->ctl_addr); - pri->ctl_addr = NULL; - kfree(pri->local_addr); - pri->local_addr = NULL; -} - -int daemon_user_write(int fd, void *buf, int len, struct daemon_data *pri) -{ - struct sockaddr_un *data_addr = pri->data_addr; - - return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)); -} - -const struct net_user_info daemon_user_info = { - .init = daemon_user_init, - .open = daemon_open, - .close = NULL, - .remove = daemon_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/slip.h b/arch/um/drivers/slip.h deleted file mode 100644 index 0f3b7ca99465..000000000000 --- a/arch/um/drivers/slip.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SLIP_H -#define __UM_SLIP_H - -#include "slip_common.h" - -struct slip_data { - void *dev; - char name[sizeof("slnnnnn\0")]; - char *addr; - char *gate_addr; - int slave; - struct slip_proto slip; -}; - -extern const struct net_user_info slip_user_info; - -extern int slip_user_read(int fd, void *buf, int len, struct slip_data *pri); -extern int slip_user_write(int fd, void *buf, int len, struct slip_data *pri); - -#endif diff --git a/arch/um/drivers/slip_common.c b/arch/um/drivers/slip_common.c deleted file mode 100644 index 20fe4f42743d..000000000000 --- a/arch/um/drivers/slip_common.c +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include "slip_common.h" -#include - -int slip_proto_read(int fd, void *buf, int len, struct slip_proto *slip) -{ - int i, n, size, start; - - if(slip->more > 0){ - i = 0; - while(i < slip->more){ - size = slip_unesc(slip->ibuf[i++], slip->ibuf, - &slip->pos, &slip->esc); - if(size){ - memcpy(buf, slip->ibuf, size); - memmove(slip->ibuf, &slip->ibuf[i], - slip->more - i); - slip->more = slip->more - i; - return size; - } - } - slip->more = 0; - } - - n = net_read(fd, &slip->ibuf[slip->pos], - sizeof(slip->ibuf) - slip->pos); - if(n <= 0) - return n; - - start = slip->pos; - for(i = 0; i < n; i++){ - size = slip_unesc(slip->ibuf[start + i], slip->ibuf,&slip->pos, - &slip->esc); - if(size){ - memcpy(buf, slip->ibuf, size); - memmove(slip->ibuf, &slip->ibuf[start+i+1], - n - (i + 1)); - slip->more = n - (i + 1); - return size; - } - } - return 0; -} - -int slip_proto_write(int fd, void *buf, int len, struct slip_proto *slip) -{ - int actual, n; - - actual = slip_esc(buf, slip->obuf, len); - n = net_write(fd, slip->obuf, actual); - if(n < 0) - return n; - else return len; -} diff --git a/arch/um/drivers/slip_common.h b/arch/um/drivers/slip_common.h deleted file mode 100644 index d3798b5caf7f..000000000000 --- a/arch/um/drivers/slip_common.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SLIP_COMMON_H -#define __UM_SLIP_COMMON_H - -#define BUF_SIZE 1500 - /* two bytes each for a (pathological) max packet of escaped chars + * - * terminating END char + initial END char */ -#define ENC_BUF_SIZE (2 * BUF_SIZE + 2) - -/* SLIP protocol characters. */ -#define SLIP_END 0300 /* indicates end of frame */ -#define SLIP_ESC 0333 /* indicates byte stuffing */ -#define SLIP_ESC_END 0334 /* ESC ESC_END means END 'data' */ -#define SLIP_ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ - -static inline int slip_unesc(unsigned char c, unsigned char *buf, int *pos, - int *esc) -{ - int ret; - - switch(c){ - case SLIP_END: - *esc = 0; - ret=*pos; - *pos=0; - return(ret); - case SLIP_ESC: - *esc = 1; - return(0); - case SLIP_ESC_ESC: - if(*esc){ - *esc = 0; - c = SLIP_ESC; - } - break; - case SLIP_ESC_END: - if(*esc){ - *esc = 0; - c = SLIP_END; - } - break; - } - buf[(*pos)++] = c; - return(0); -} - -static inline int slip_esc(unsigned char *s, unsigned char *d, int len) -{ - unsigned char *ptr = d; - unsigned char c; - - /* - * Send an initial END character to flush out any - * data that may have accumulated in the receiver - * due to line noise. - */ - - *ptr++ = SLIP_END; - - /* - * For each byte in the packet, send the appropriate - * character sequence, according to the SLIP protocol. - */ - - while (len-- > 0) { - switch(c = *s++) { - case SLIP_END: - *ptr++ = SLIP_ESC; - *ptr++ = SLIP_ESC_END; - break; - case SLIP_ESC: - *ptr++ = SLIP_ESC; - *ptr++ = SLIP_ESC_ESC; - break; - default: - *ptr++ = c; - break; - } - } - *ptr++ = SLIP_END; - return (ptr - d); -} - -struct slip_proto { - unsigned char ibuf[ENC_BUF_SIZE]; - unsigned char obuf[ENC_BUF_SIZE]; - int more; /* more data: do not read fd until ibuf has been drained */ - int pos; - int esc; -}; - -static inline void slip_proto_init(struct slip_proto * slip) -{ - memset(slip->ibuf, 0, sizeof(slip->ibuf)); - memset(slip->obuf, 0, sizeof(slip->obuf)); - slip->more = 0; - slip->pos = 0; - slip->esc = 0; -} - -extern int slip_proto_read(int fd, void *buf, int len, - struct slip_proto *slip); -extern int slip_proto_write(int fd, void *buf, int len, - struct slip_proto *slip); - -#endif diff --git a/arch/um/drivers/slip_kern.c b/arch/um/drivers/slip_kern.c deleted file mode 100644 index c58ccdcc16d6..000000000000 --- a/arch/um/drivers/slip_kern.c +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include "slip.h" - -struct slip_init { - char *gate_addr; -}; - -static void slip_init(struct net_device *dev, void *data) -{ - struct uml_net_private *private; - struct slip_data *spri; - struct slip_init *init = data; - - private = netdev_priv(dev); - spri = (struct slip_data *) private->user; - - memset(spri->name, 0, sizeof(spri->name)); - spri->addr = NULL; - spri->gate_addr = init->gate_addr; - spri->slave = -1; - spri->dev = dev; - - slip_proto_init(&spri->slip); - - dev->hard_header_len = 0; - dev->header_ops = NULL; - dev->addr_len = 0; - dev->type = ARPHRD_SLIP; - dev->tx_queue_len = 256; - dev->flags = IFF_NOARP; - printk("SLIP backend - SLIP IP = %s\n", spri->gate_addr); -} - -static unsigned short slip_protocol(struct sk_buff *skbuff) -{ - return htons(ETH_P_IP); -} - -static int slip_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slip_user_read(fd, skb_mac_header(skb), skb->dev->mtu, - (struct slip_data *) &lp->user); -} - -static int slip_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slip_user_write(fd, skb->data, skb->len, - (struct slip_data *) &lp->user); -} - -static const struct net_kern_info slip_kern_info = { - .init = slip_init, - .protocol = slip_protocol, - .read = slip_read, - .write = slip_write, -}; - -static int slip_setup(char *str, char **mac_out, void *data) -{ - struct slip_init *init = data; - - *init = ((struct slip_init) { .gate_addr = NULL }); - - if (str[0] != '\0') - init->gate_addr = str; - return 1; -} - -static struct transport slip_transport = { - .list = LIST_HEAD_INIT(slip_transport.list), - .name = "slip", - .setup = slip_setup, - .user = &slip_user_info, - .kern = &slip_kern_info, - .private_size = sizeof(struct slip_data), - .setup_size = sizeof(struct slip_init), -}; - -static int register_slip(void) -{ - register_transport(&slip_transport); - return 0; -} - -late_initcall(register_slip); diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c deleted file mode 100644 index 7334019c9e60..000000000000 --- a/arch/um/drivers/slip_user.c +++ /dev/null @@ -1,252 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "slip.h" -#include - -static int slip_user_init(void *data, void *dev) -{ - struct slip_data *pri = data; - - pri->dev = dev; - return 0; -} - -static int set_up_tty(int fd) -{ - int i; - struct termios tios; - - if (tcgetattr(fd, &tios) < 0) { - printk(UM_KERN_ERR "could not get initial terminal " - "attributes\n"); - return -1; - } - - tios.c_cflag = CS8 | CREAD | HUPCL | CLOCAL; - tios.c_iflag = IGNBRK | IGNPAR; - tios.c_oflag = 0; - tios.c_lflag = 0; - for (i = 0; i < NCCS; i++) - tios.c_cc[i] = 0; - tios.c_cc[VMIN] = 1; - tios.c_cc[VTIME] = 0; - - cfsetospeed(&tios, B38400); - cfsetispeed(&tios, B38400); - - if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) { - printk(UM_KERN_ERR "failed to set terminal attributes\n"); - return -1; - } - return 0; -} - -struct slip_pre_exec_data { - int stdin_fd; - int stdout_fd; - int close_me; -}; - -static void slip_pre_exec(void *arg) -{ - struct slip_pre_exec_data *data = arg; - - if (data->stdin_fd >= 0) - dup2(data->stdin_fd, 0); - dup2(data->stdout_fd, 1); - if (data->close_me >= 0) - close(data->close_me); -} - -static int slip_tramp(char **argv, int fd) -{ - struct slip_pre_exec_data pe_data; - char *output; - int pid, fds[2], err, output_len; - - err = os_pipe(fds, 1, 0); - if (err < 0) { - printk(UM_KERN_ERR "slip_tramp : pipe failed, err = %d\n", - -err); - goto out; - } - - err = 0; - pe_data.stdin_fd = fd; - pe_data.stdout_fd = fds[1]; - pe_data.close_me = fds[0]; - err = run_helper(slip_pre_exec, &pe_data, argv); - if (err < 0) - goto out_close; - pid = err; - - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - if (output == NULL) { - printk(UM_KERN_ERR "slip_tramp : failed to allocate output " - "buffer\n"); - os_kill_process(pid, 1); - err = -ENOMEM; - goto out_close; - } - - close(fds[1]); - read_output(fds[0], output, output_len); - printk("%s", output); - - err = helper_wait(pid); - close(fds[0]); - - kfree(output); - return err; - -out_close: - close(fds[0]); - close(fds[1]); -out: - return err; -} - -static int slip_open(void *data) -{ - struct slip_data *pri = data; - char version_buf[sizeof("nnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *argv[] = { "uml_net", version_buf, "slip", "up", gate_buf, - NULL }; - int sfd, mfd, err; - - err = get_pty(); - if (err < 0) { - printk(UM_KERN_ERR "slip-open : Failed to open pty, err = %d\n", - -err); - goto out; - } - mfd = err; - - err = open(ptsname(mfd), O_RDWR, 0); - if (err < 0) { - printk(UM_KERN_ERR "Couldn't open tty for slip line, " - "err = %d\n", -err); - goto out_close; - } - sfd = err; - - err = set_up_tty(sfd); - if (err) - goto out_close2; - - pri->slave = sfd; - pri->slip.pos = 0; - pri->slip.esc = 0; - if (pri->gate_addr != NULL) { - sprintf(version_buf, "%d", UML_NET_VERSION); - strcpy(gate_buf, pri->gate_addr); - - err = slip_tramp(argv, sfd); - - if (err < 0) { - printk(UM_KERN_ERR "slip_tramp failed - err = %d\n", - -err); - goto out_close2; - } - err = os_get_ifname(pri->slave, pri->name); - if (err < 0) { - printk(UM_KERN_ERR "get_ifname failed, err = %d\n", - -err); - goto out_close2; - } - iter_addresses(pri->dev, open_addr, pri->name); - } - else { - err = os_set_slip(sfd); - if (err < 0) { - printk(UM_KERN_ERR "Failed to set slip discipline " - "encapsulation - err = %d\n", -err); - goto out_close2; - } - } - return mfd; -out_close2: - close(sfd); -out_close: - close(mfd); -out: - return err; -} - -static void slip_close(int fd, void *data) -{ - struct slip_data *pri = data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "slip", "down", pri->name, - NULL }; - int err; - - if (pri->gate_addr != NULL) - iter_addresses(pri->dev, close_addr, pri->name); - - sprintf(version_buf, "%d", UML_NET_VERSION); - - err = slip_tramp(argv, pri->slave); - - if (err != 0) - printk(UM_KERN_ERR "slip_tramp failed - errno = %d\n", -err); - close(fd); - close(pri->slave); - pri->slave = -1; -} - -int slip_user_read(int fd, void *buf, int len, struct slip_data *pri) -{ - return slip_proto_read(fd, buf, len, &pri->slip); -} - -int slip_user_write(int fd, void *buf, int len, struct slip_data *pri) -{ - return slip_proto_write(fd, buf, len, &pri->slip); -} - -static void slip_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct slip_data *pri = data; - - if (pri->slave < 0) - return; - open_addr(addr, netmask, pri->name); -} - -static void slip_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct slip_data *pri = data; - - if (pri->slave < 0) - return; - close_addr(addr, netmask, pri->name); -} - -const struct net_user_info slip_user_info = { - .init = slip_user_init, - .open = slip_open, - .close = slip_close, - .remove = NULL, - .add_address = slip_add_addr, - .delete_address = slip_del_addr, - .mtu = BUF_SIZE, - .max_packet = BUF_SIZE, -}; diff --git a/arch/um/drivers/slirp.h b/arch/um/drivers/slirp.h deleted file mode 100644 index 4aef2b88249a..000000000000 --- a/arch/um/drivers/slirp.h +++ /dev/null @@ -1,34 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_SLIRP_H -#define __UM_SLIRP_H - -#include "slip_common.h" - -#define SLIRP_MAX_ARGS 100 -/* - * XXX this next definition is here because I don't understand why this - * initializer doesn't work in slirp_kern.c: - * - * argv : { init->argv[ 0 ... SLIRP_MAX_ARGS-1 ] }, - * - * or why I can't typecast like this: - * - * argv : (char* [SLIRP_MAX_ARGS])(init->argv), - */ -struct arg_list_dummy_wrapper { char *argv[SLIRP_MAX_ARGS]; }; - -struct slirp_data { - void *dev; - struct arg_list_dummy_wrapper argw; - int pid; - int slave; - struct slip_proto slip; -}; - -extern const struct net_user_info slirp_user_info; - -extern int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri); -extern int slirp_user_write(int fd, void *buf, int len, - struct slirp_data *pri); - -#endif diff --git a/arch/um/drivers/slirp_kern.c b/arch/um/drivers/slirp_kern.c deleted file mode 100644 index 0a6151ee9572..000000000000 --- a/arch/um/drivers/slirp_kern.c +++ /dev/null @@ -1,120 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include -#include "slirp.h" - -struct slirp_init { - struct arg_list_dummy_wrapper argw; /* XXX should be simpler... */ -}; - -static void slirp_init(struct net_device *dev, void *data) -{ - struct uml_net_private *private; - struct slirp_data *spri; - struct slirp_init *init = data; - int i; - - private = netdev_priv(dev); - spri = (struct slirp_data *) private->user; - - spri->argw = init->argw; - spri->pid = -1; - spri->slave = -1; - spri->dev = dev; - - slip_proto_init(&spri->slip); - - dev->hard_header_len = 0; - dev->header_ops = NULL; - dev->addr_len = 0; - dev->type = ARPHRD_SLIP; - dev->tx_queue_len = 256; - dev->flags = IFF_NOARP; - printk("SLIRP backend - command line:"); - for (i = 0; spri->argw.argv[i] != NULL; i++) - printk(" '%s'",spri->argw.argv[i]); - printk("\n"); -} - -static unsigned short slirp_protocol(struct sk_buff *skbuff) -{ - return htons(ETH_P_IP); -} - -static int slirp_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slirp_user_read(fd, skb_mac_header(skb), skb->dev->mtu, - (struct slirp_data *) &lp->user); -} - -static int slirp_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return slirp_user_write(fd, skb->data, skb->len, - (struct slirp_data *) &lp->user); -} - -const struct net_kern_info slirp_kern_info = { - .init = slirp_init, - .protocol = slirp_protocol, - .read = slirp_read, - .write = slirp_write, -}; - -static int slirp_setup(char *str, char **mac_out, void *data) -{ - struct slirp_init *init = data; - int i=0; - - *init = ((struct slirp_init) { .argw = { { "slirp", NULL } } }); - - str = split_if_spec(str, mac_out, NULL); - - if (str == NULL) /* no command line given after MAC addr */ - return 1; - - do { - if (i >= SLIRP_MAX_ARGS - 1) { - printk(KERN_WARNING "slirp_setup: truncating slirp " - "arguments\n"); - break; - } - init->argw.argv[i++] = str; - while(*str && *str!=',') { - if (*str == '_') - *str=' '; - str++; - } - if (*str != ',') - break; - *str++ = '\0'; - } while (1); - - init->argw.argv[i] = NULL; - return 1; -} - -static struct transport slirp_transport = { - .list = LIST_HEAD_INIT(slirp_transport.list), - .name = "slirp", - .setup = slirp_setup, - .user = &slirp_user_info, - .kern = &slirp_kern_info, - .private_size = sizeof(struct slirp_data), - .setup_size = sizeof(struct slirp_init), -}; - -static int register_slirp(void) -{ - register_transport(&slirp_transport); - return 0; -} - -late_initcall(register_slirp); diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c deleted file mode 100644 index 97228aa080cb..000000000000 --- a/arch/um/drivers/slirp_user.c +++ /dev/null @@ -1,124 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include -#include "slirp.h" - -static int slirp_user_init(void *data, void *dev) -{ - struct slirp_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct slirp_pre_exec_data { - int stdin_fd; - int stdout_fd; -}; - -static void slirp_pre_exec(void *arg) -{ - struct slirp_pre_exec_data *data = arg; - - if (data->stdin_fd != -1) - dup2(data->stdin_fd, 0); - if (data->stdout_fd != -1) - dup2(data->stdout_fd, 1); -} - -static int slirp_tramp(char **argv, int fd) -{ - struct slirp_pre_exec_data pe_data; - int pid; - - pe_data.stdin_fd = fd; - pe_data.stdout_fd = fd; - pid = run_helper(slirp_pre_exec, &pe_data, argv); - - return pid; -} - -static int slirp_open(void *data) -{ - struct slirp_data *pri = data; - int fds[2], err; - - err = os_pipe(fds, 1, 1); - if (err) - return err; - - err = slirp_tramp(pri->argw.argv, fds[1]); - if (err < 0) { - printk(UM_KERN_ERR "slirp_tramp failed - errno = %d\n", -err); - goto out; - } - - pri->slave = fds[1]; - pri->slip.pos = 0; - pri->slip.esc = 0; - pri->pid = err; - - return fds[0]; -out: - close(fds[0]); - close(fds[1]); - return err; -} - -static void slirp_close(int fd, void *data) -{ - struct slirp_data *pri = data; - int err; - - close(fd); - close(pri->slave); - - pri->slave = -1; - - if (pri->pid<1) { - printk(UM_KERN_ERR "slirp_close: no child process to shut " - "down\n"); - return; - } - -#if 0 - if (kill(pri->pid, SIGHUP)<0) { - printk(UM_KERN_ERR "slirp_close: sending hangup to %d failed " - "(%d)\n", pri->pid, errno); - } -#endif - err = helper_wait(pri->pid); - if (err < 0) - return; - - pri->pid = -1; -} - -int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri) -{ - return slip_proto_read(fd, buf, len, &pri->slip); -} - -int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri) -{ - return slip_proto_write(fd, buf, len, &pri->slip); -} - -const struct net_user_info slirp_user_info = { - .init = slirp_user_init, - .open = slirp_open, - .close = slirp_close, - .remove = NULL, - .add_address = NULL, - .delete_address = NULL, - .mtu = BUF_SIZE, - .max_packet = BUF_SIZE, -}; diff --git a/arch/um/drivers/umcast.h b/arch/um/drivers/umcast.h deleted file mode 100644 index fe39bee1e3bd..000000000000 --- a/arch/um/drivers/umcast.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __DRIVERS_UMCAST_H -#define __DRIVERS_UMCAST_H - -#include - -struct umcast_data { - char *addr; - unsigned short lport; - unsigned short rport; - void *listen_addr; - void *remote_addr; - int ttl; - int unicast; - void *dev; -}; - -extern const struct net_user_info umcast_user_info; - -extern int umcast_user_write(int fd, void *buf, int len, - struct umcast_data *pri); - -#endif diff --git a/arch/um/drivers/umcast_kern.c b/arch/um/drivers/umcast_kern.c deleted file mode 100644 index 595a54f2b9c6..000000000000 --- a/arch/um/drivers/umcast_kern.c +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * user-mode-linux networking multicast transport - * Copyright (C) 2001 by Harald Welte - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * - * based on the existing uml-networking code, which is - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * - */ - -#include -#include -#include "umcast.h" -#include - -struct umcast_init { - char *addr; - int lport; - int rport; - int ttl; - bool unicast; -}; - -static void umcast_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct umcast_data *dpri; - struct umcast_init *init = data; - - pri = netdev_priv(dev); - dpri = (struct umcast_data *) pri->user; - dpri->addr = init->addr; - dpri->lport = init->lport; - dpri->rport = init->rport; - dpri->unicast = init->unicast; - dpri->ttl = init->ttl; - dpri->dev = dev; - - if (dpri->unicast) { - printk(KERN_INFO "ucast backend address: %s:%u listen port: " - "%u\n", dpri->addr, dpri->rport, dpri->lport); - } else { - printk(KERN_INFO "mcast backend multicast address: %s:%u, " - "TTL:%u\n", dpri->addr, dpri->lport, dpri->ttl); - } -} - -static int umcast_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int umcast_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return umcast_user_write(fd, skb->data, skb->len, - (struct umcast_data *) &lp->user); -} - -static const struct net_kern_info umcast_kern_info = { - .init = umcast_init, - .protocol = eth_protocol, - .read = umcast_read, - .write = umcast_write, -}; - -static int mcast_setup(char *str, char **mac_out, void *data) -{ - struct umcast_init *init = data; - char *port_str = NULL, *ttl_str = NULL, *remain; - char *last; - - *init = ((struct umcast_init) - { .addr = "239.192.168.1", - .lport = 1102, - .ttl = 1 }); - - remain = split_if_spec(str, mac_out, &init->addr, &port_str, &ttl_str, - NULL); - if (remain != NULL) { - printk(KERN_ERR "mcast_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (port_str != NULL) { - init->lport = simple_strtoul(port_str, &last, 10); - if ((*last != '\0') || (last == port_str)) { - printk(KERN_ERR "mcast_setup - Bad port : '%s'\n", - port_str); - return 0; - } - } - - if (ttl_str != NULL) { - init->ttl = simple_strtoul(ttl_str, &last, 10); - if ((*last != '\0') || (last == ttl_str)) { - printk(KERN_ERR "mcast_setup - Bad ttl : '%s'\n", - ttl_str); - return 0; - } - } - - init->unicast = false; - init->rport = init->lport; - - printk(KERN_INFO "Configured mcast device: %s:%u-%u\n", init->addr, - init->lport, init->ttl); - - return 1; -} - -static int ucast_setup(char *str, char **mac_out, void *data) -{ - struct umcast_init *init = data; - char *lport_str = NULL, *rport_str = NULL, *remain; - char *last; - - *init = ((struct umcast_init) - { .addr = "", - .lport = 1102, - .rport = 1102 }); - - remain = split_if_spec(str, mac_out, &init->addr, - &lport_str, &rport_str, NULL); - if (remain != NULL) { - printk(KERN_ERR "ucast_setup - Extra garbage on " - "specification : '%s'\n", remain); - return 0; - } - - if (lport_str != NULL) { - init->lport = simple_strtoul(lport_str, &last, 10); - if ((*last != '\0') || (last == lport_str)) { - printk(KERN_ERR "ucast_setup - Bad listen port : " - "'%s'\n", lport_str); - return 0; - } - } - - if (rport_str != NULL) { - init->rport = simple_strtoul(rport_str, &last, 10); - if ((*last != '\0') || (last == rport_str)) { - printk(KERN_ERR "ucast_setup - Bad remote port : " - "'%s'\n", rport_str); - return 0; - } - } - - init->unicast = true; - - printk(KERN_INFO "Configured ucast device: :%u -> %s:%u\n", - init->lport, init->addr, init->rport); - - return 1; -} - -static struct transport mcast_transport = { - .list = LIST_HEAD_INIT(mcast_transport.list), - .name = "mcast", - .setup = mcast_setup, - .user = &umcast_user_info, - .kern = &umcast_kern_info, - .private_size = sizeof(struct umcast_data), - .setup_size = sizeof(struct umcast_init), -}; - -static struct transport ucast_transport = { - .list = LIST_HEAD_INIT(ucast_transport.list), - .name = "ucast", - .setup = ucast_setup, - .user = &umcast_user_info, - .kern = &umcast_kern_info, - .private_size = sizeof(struct umcast_data), - .setup_size = sizeof(struct umcast_init), -}; - -static int register_umcast(void) -{ - register_transport(&mcast_transport); - register_transport(&ucast_transport); - return 0; -} - -late_initcall(register_umcast); diff --git a/arch/um/drivers/umcast_user.c b/arch/um/drivers/umcast_user.c deleted file mode 100644 index b50b13cff04e..000000000000 --- a/arch/um/drivers/umcast_user.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * user-mode-linux networking multicast transport - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by Harald Welte - * - * based on the existing uml-networking code, which is - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - * - * - */ - -#include -#include -#include -#include "umcast.h" -#include -#include - -static struct sockaddr_in *new_addr(char *addr, unsigned short port) -{ - struct sockaddr_in *sin; - - sin = uml_kmalloc(sizeof(struct sockaddr_in), UM_GFP_KERNEL); - if (sin == NULL) { - printk(UM_KERN_ERR "new_addr: allocation of sockaddr_in " - "failed\n"); - return NULL; - } - sin->sin_family = AF_INET; - if (addr) - sin->sin_addr.s_addr = in_aton(addr); - else - sin->sin_addr.s_addr = INADDR_ANY; - sin->sin_port = htons(port); - return sin; -} - -static int umcast_user_init(void *data, void *dev) -{ - struct umcast_data *pri = data; - - pri->remote_addr = new_addr(pri->addr, pri->rport); - if (pri->unicast) - pri->listen_addr = new_addr(NULL, pri->lport); - else - pri->listen_addr = pri->remote_addr; - pri->dev = dev; - return 0; -} - -static void umcast_remove(void *data) -{ - struct umcast_data *pri = data; - - kfree(pri->listen_addr); - if (pri->unicast) - kfree(pri->remote_addr); - pri->listen_addr = pri->remote_addr = NULL; -} - -static int umcast_open(void *data) -{ - struct umcast_data *pri = data; - struct sockaddr_in *lsin = pri->listen_addr; - struct sockaddr_in *rsin = pri->remote_addr; - struct ip_mreq mreq; - int fd, yes = 1, err = -EINVAL; - - - if ((!pri->unicast && lsin->sin_addr.s_addr == 0) || - (rsin->sin_addr.s_addr == 0) || - (lsin->sin_port == 0) || (rsin->sin_port == 0)) - goto out; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - - if (fd < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open : data socket failed, " - "errno = %d\n", errno); - goto out; - } - - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: SO_REUSEADDR failed, " - "errno = %d\n", errno); - goto out_close; - } - - if (!pri->unicast) { - /* set ttl according to config */ - if (setsockopt(fd, SOL_IP, IP_MULTICAST_TTL, &pri->ttl, - sizeof(pri->ttl)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_TTL " - "failed, error = %d\n", errno); - goto out_close; - } - - /* set LOOP, so data does get fed back to local sockets */ - if (setsockopt(fd, SOL_IP, IP_MULTICAST_LOOP, - &yes, sizeof(yes)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_LOOP " - "failed, error = %d\n", errno); - goto out_close; - } - } - - /* bind socket to the address */ - if (bind(fd, (struct sockaddr *) lsin, sizeof(*lsin)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open : data bind failed, " - "errno = %d\n", errno); - goto out_close; - } - - if (!pri->unicast) { - /* subscribe to the multicast group */ - mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr; - mreq.imr_interface.s_addr = 0; - if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP, - &mreq, sizeof(mreq)) < 0) { - err = -errno; - printk(UM_KERN_ERR "umcast_open: IP_ADD_MEMBERSHIP " - "failed, error = %d\n", errno); - printk(UM_KERN_ERR "There appears not to be a " - "multicast-capable network interface on the " - "host.\n"); - printk(UM_KERN_ERR "eth0 should be configured in order " - "to use the multicast transport.\n"); - goto out_close; - } - } - - return fd; - - out_close: - close(fd); - out: - return err; -} - -static void umcast_close(int fd, void *data) -{ - struct umcast_data *pri = data; - - if (!pri->unicast) { - struct ip_mreq mreq; - struct sockaddr_in *lsin = pri->listen_addr; - - mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr; - mreq.imr_interface.s_addr = 0; - if (setsockopt(fd, SOL_IP, IP_DROP_MEMBERSHIP, - &mreq, sizeof(mreq)) < 0) { - printk(UM_KERN_ERR "umcast_close: IP_DROP_MEMBERSHIP " - "failed, error = %d\n", errno); - } - } - - close(fd); -} - -int umcast_user_write(int fd, void *buf, int len, struct umcast_data *pri) -{ - struct sockaddr_in *data_addr = pri->remote_addr; - - return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr)); -} - -const struct net_user_info umcast_user_info = { - .init = umcast_user_init, - .open = umcast_open, - .close = umcast_close, - .remove = umcast_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/drivers/vde.h b/arch/um/drivers/vde.h deleted file mode 100644 index cab0379e6142..000000000000 --- a/arch/um/drivers/vde.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - */ - -#ifndef __UM_VDE_H__ -#define __UM_VDE_H__ - -struct vde_data { - char *vde_switch; - char *descr; - void *args; - void *conn; - void *dev; -}; - -struct vde_init { - char *vde_switch; - char *descr; - int port; - char *group; - int mode; -}; - -extern const struct net_user_info vde_user_info; - -extern void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init); - -extern int vde_user_read(void *conn, void *buf, int len); -extern int vde_user_write(void *conn, void *buf, int len); - -#endif diff --git a/arch/um/drivers/vde_kern.c b/arch/um/drivers/vde_kern.c deleted file mode 100644 index bc6f22cbfb35..000000000000 --- a/arch/um/drivers/vde_kern.c +++ /dev/null @@ -1,129 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - * - * Transport usage: - * ethN=vde,,,,,, - * - */ - -#include -#include -#include -#include -#include "vde.h" - -static void vde_init(struct net_device *dev, void *data) -{ - struct vde_init *init = data; - struct uml_net_private *pri; - struct vde_data *vpri; - - pri = netdev_priv(dev); - vpri = (struct vde_data *) pri->user; - - vpri->vde_switch = init->vde_switch; - vpri->descr = init->descr ? init->descr : "UML vde_transport"; - vpri->args = NULL; - vpri->conn = NULL; - vpri->dev = dev; - - printk("vde backend - %s, ", vpri->vde_switch ? - vpri->vde_switch : "(default socket)"); - - vde_init_libstuff(vpri, init); - - printk("\n"); -} - -static int vde_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - struct vde_data *pri = (struct vde_data *) &lp->user; - - if (pri->conn != NULL) - return vde_user_read(pri->conn, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); - - printk(KERN_ERR "vde_read - we have no VDECONN to read from"); - return -EBADF; -} - -static int vde_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - struct vde_data *pri = (struct vde_data *) &lp->user; - - if (pri->conn != NULL) - return vde_user_write((void *)pri->conn, skb->data, - skb->len); - - printk(KERN_ERR "vde_write - we have no VDECONN to write to"); - return -EBADF; -} - -static const struct net_kern_info vde_kern_info = { - .init = vde_init, - .protocol = eth_protocol, - .read = vde_read, - .write = vde_write, -}; - -static int vde_setup(char *str, char **mac_out, void *data) -{ - struct vde_init *init = data; - char *remain, *port_str = NULL, *mode_str = NULL, *last; - - *init = ((struct vde_init) - { .vde_switch = NULL, - .descr = NULL, - .port = 0, - .group = NULL, - .mode = 0 }); - - remain = split_if_spec(str, &init->vde_switch, mac_out, &port_str, - &init->group, &mode_str, &init->descr, NULL); - - if (remain != NULL) - printk(KERN_WARNING "vde_setup - Ignoring extra data :" - "'%s'\n", remain); - - if (port_str != NULL) { - init->port = simple_strtoul(port_str, &last, 10); - if ((*last != '\0') || (last == port_str)) { - printk(KERN_ERR "vde_setup - Bad port : '%s'\n", - port_str); - return 0; - } - } - - if (mode_str != NULL) { - init->mode = simple_strtoul(mode_str, &last, 8); - if ((*last != '\0') || (last == mode_str)) { - printk(KERN_ERR "vde_setup - Bad mode : '%s'\n", - mode_str); - return 0; - } - } - - printk(KERN_INFO "Configured vde device: %s\n", init->vde_switch ? - init->vde_switch : "(default socket)"); - - return 1; -} - -static struct transport vde_transport = { - .list = LIST_HEAD_INIT(vde_transport.list), - .name = "vde", - .setup = vde_setup, - .user = &vde_user_info, - .kern = &vde_kern_info, - .private_size = sizeof(struct vde_data), - .setup_size = sizeof(struct vde_init), -}; - -static int register_vde(void) -{ - register_transport(&vde_transport); - return 0; -} - -late_initcall(register_vde); diff --git a/arch/um/drivers/vde_user.c b/arch/um/drivers/vde_user.c deleted file mode 100644 index bc7dc4e1e486..000000000000 --- a/arch/um/drivers/vde_user.c +++ /dev/null @@ -1,125 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org). - */ - -#include -#include -#include -#include -#include -#include "vde.h" - -static int vde_user_init(void *data, void *dev) -{ - struct vde_data *pri = data; - VDECONN *conn = NULL; - int err = -EINVAL; - - pri->dev = dev; - - conn = vde_open(pri->vde_switch, pri->descr, pri->args); - - if (conn == NULL) { - err = -errno; - printk(UM_KERN_ERR "vde_user_init: vde_open failed, " - "errno = %d\n", errno); - return err; - } - - printk(UM_KERN_INFO "vde backend - connection opened\n"); - - pri->conn = conn; - - return 0; -} - -static int vde_user_open(void *data) -{ - struct vde_data *pri = data; - - if (pri->conn != NULL) - return vde_datafd(pri->conn); - - printk(UM_KERN_WARNING "vde_open - we have no VDECONN to open"); - return -EINVAL; -} - -static void vde_remove(void *data) -{ - struct vde_data *pri = data; - - if (pri->conn != NULL) { - printk(UM_KERN_INFO "vde backend - closing connection\n"); - vde_close(pri->conn); - pri->conn = NULL; - kfree(pri->args); - pri->args = NULL; - return; - } - - printk(UM_KERN_WARNING "vde_remove - we have no VDECONN to remove"); -} - -const struct net_user_info vde_user_info = { - .init = vde_user_init, - .open = vde_user_open, - .close = NULL, - .remove = vde_remove, - .add_address = NULL, - .delete_address = NULL, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; - -void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init) -{ - struct vde_open_args *args; - - vpri->args = uml_kmalloc(sizeof(struct vde_open_args), UM_GFP_KERNEL); - if (vpri->args == NULL) { - printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args " - "allocation failed"); - return; - } - - args = vpri->args; - - args->port = init->port; - args->group = init->group; - args->mode = init->mode ? init->mode : 0700; - - args->port ? printk("port %d", args->port) : - printk("undefined port"); -} - -int vde_user_read(void *conn, void *buf, int len) -{ - VDECONN *vconn = conn; - int rv; - - if (vconn == NULL) - return 0; - - rv = vde_recv(vconn, buf, len, 0); - if (rv < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (rv == 0) - return -ENOTCONN; - - return rv; -} - -int vde_user_write(void *conn, void *buf, int len) -{ - VDECONN *vconn = conn; - - if (vconn == NULL) - return 0; - - return vde_send(vconn, buf, len, 0); -} - diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 152a60080d5b..e63a74a5ff19 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -143,7 +143,6 @@ extern int os_access(const char *file, int mode); extern int os_set_exec_close(int fd); extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg); extern int os_get_ifname(int fd, char *namebuf); -extern int os_set_slip(int fd); extern int os_mode_fd(int fd, int mode); extern int os_seek_file(int fd, unsigned long long offset); diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile index 049dfa5bc9c6..fae836713487 100644 --- a/arch/um/os-Linux/Makefile +++ b/arch/um/os-Linux/Makefile @@ -8,7 +8,7 @@ KCOV_INSTRUMENT := n obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \ registers.o sigio.o signal.o start_up.o time.o tty.o \ - umid.o user_syms.o util.o drivers/ skas/ + umid.o user_syms.o util.o skas/ CFLAGS_signal.o += -Wframe-larger-than=4096 diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile deleted file mode 100644 index cf2d75bb1884..000000000000 --- a/arch/um/os-Linux/drivers/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com) -# - -ethertap-objs := ethertap_kern.o ethertap_user.o -tuntap-objs := tuntap_kern.o tuntap_user.o - -obj-y = -obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o -obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o - -include $(srctree)/arch/um/scripts/Makefile.rules diff --git a/arch/um/os-Linux/drivers/etap.h b/arch/um/os-Linux/drivers/etap.h deleted file mode 100644 index a475259f90e1..000000000000 --- a/arch/um/os-Linux/drivers/etap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __DRIVERS_ETAP_H -#define __DRIVERS_ETAP_H - -#include - -struct ethertap_data { - char *dev_name; - char *gate_addr; - int data_fd; - int control_fd; - void *dev; -}; - -extern const struct net_user_info ethertap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c deleted file mode 100644 index 5e5ee40680ce..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_kern.c +++ /dev/null @@ -1,100 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include -#include -#include "etap.h" -#include - -struct ethertap_init { - char *dev_name; - char *gate_addr; -}; - -static void etap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct ethertap_data *epri; - struct ethertap_init *init = data; - - pri = netdev_priv(dev); - epri = (struct ethertap_data *) pri->user; - epri->dev_name = init->dev_name; - epri->gate_addr = init->gate_addr; - epri->data_fd = -1; - epri->control_fd = -1; - epri->dev = dev; - - printk(KERN_INFO "ethertap backend - %s", epri->dev_name); - if (epri->gate_addr != NULL) - printk(KERN_CONT ", IP = %s", epri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int etap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - int len; - - len = net_recvfrom(fd, skb_mac_header(skb), - skb->dev->mtu + 2 + ETH_HEADER_ETHERTAP); - if (len <= 0) - return(len); - - skb_pull(skb, 2); - len -= 2; - return len; -} - -static int etap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - skb_push(skb, 2); - return net_send(fd, skb->data, skb->len); -} - -const struct net_kern_info ethertap_kern_info = { - .init = etap_init, - .protocol = eth_protocol, - .read = etap_read, - .write = etap_write, -}; - -static int ethertap_setup(char *str, char **mac_out, void *data) -{ - struct ethertap_init *init = data; - - *init = ((struct ethertap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "ethertap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - if (init->dev_name == NULL) { - printk(KERN_ERR "ethertap_setup : Missing tap device name\n"); - return 0; - } - - return 1; -} - -static struct transport ethertap_transport = { - .list = LIST_HEAD_INIT(ethertap_transport.list), - .name = "ethertap", - .setup = ethertap_setup, - .user = ðertap_user_info, - .kern = ðertap_kern_info, - .private_size = sizeof(struct ethertap_data), - .setup_size = sizeof(struct ethertap_init), -}; - -static int register_ethertap(void) -{ - register_transport(ðertap_transport); - return 0; -} - -late_initcall(register_ethertap); diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c deleted file mode 100644 index bdf215c0eca7..000000000000 --- a/arch/um/os-Linux/drivers/ethertap_user.c +++ /dev/null @@ -1,248 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include -#include -#include -#include -#include -#include -#include "etap.h" -#include -#include -#include - -#define MAX_PACKET ETH_MAX_PACKET - -static int etap_user_init(void *data, void *dev) -{ - struct ethertap_data *pri = data; - - pri->dev = dev; - return 0; -} - -struct addr_change { - enum { ADD_ADDR, DEL_ADDR } what; - unsigned char addr[4]; - unsigned char netmask[4]; -}; - -static void etap_change(int op, unsigned char *addr, unsigned char *netmask, - int fd) -{ - struct addr_change change; - char *output; - int n; - - change.what = op; - memcpy(change.addr, addr, sizeof(change.addr)); - memcpy(change.netmask, netmask, sizeof(change.netmask)); - CATCH_EINTR(n = write(fd, &change, sizeof(change))); - if (n != sizeof(change)) { - printk(UM_KERN_ERR "etap_change - request failed, err = %d\n", - errno); - return; - } - - output = uml_kmalloc(UM_KERN_PAGE_SIZE, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "etap_change : Failed to allocate output " - "buffer\n"); - read_output(fd, output, UM_KERN_PAGE_SIZE); - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -static void etap_open_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(ADD_ADDR, addr, netmask, *((int *) arg)); -} - -static void etap_close_addr(unsigned char *addr, unsigned char *netmask, - void *arg) -{ - etap_change(DEL_ADDR, addr, netmask, *((int *) arg)); -} - -struct etap_pre_exec_data { - int control_remote; - int control_me; - int data_me; -}; - -static void etap_pre_exec(void *arg) -{ - struct etap_pre_exec_data *data = arg; - - dup2(data->control_remote, 1); - close(data->data_me); - close(data->control_me); -} - -static int etap_tramp(char *dev, char *gate, int control_me, - int control_remote, int data_me, int data_remote) -{ - struct etap_pre_exec_data pe_data; - int pid, err, n; - char version_buf[sizeof("nnnnn\0")]; - char data_fd_buf[sizeof("nnnnnn\0")]; - char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")]; - char *setup_args[] = { "uml_net", version_buf, "ethertap", dev, - data_fd_buf, gate_buf, NULL }; - char *nosetup_args[] = { "uml_net", version_buf, "ethertap", - dev, data_fd_buf, NULL }; - char **args, c; - - sprintf(data_fd_buf, "%d", data_remote); - sprintf(version_buf, "%d", UML_NET_VERSION); - if (gate != NULL) { - strscpy(gate_buf, gate); - args = setup_args; - } - else args = nosetup_args; - - err = 0; - pe_data.control_remote = control_remote; - pe_data.control_me = control_me; - pe_data.data_me = data_me; - pid = run_helper(etap_pre_exec, &pe_data, args); - - if (pid < 0) - err = pid; - close(data_remote); - close(control_remote); - CATCH_EINTR(n = read(control_me, &c, sizeof(c))); - if (n != sizeof(c)) { - err = -errno; - printk(UM_KERN_ERR "etap_tramp : read of status failed, " - "err = %d\n", -err); - return err; - } - if (c != 1) { - printk(UM_KERN_ERR "etap_tramp : uml_net failed\n"); - err = helper_wait(pid); - } - return err; -} - -static int etap_open(void *data) -{ - struct ethertap_data *pri = data; - char *output; - int data_fds[2], control_fds[2], err, output_len; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err) - return err; - - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, data_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - data socketpair failed - " - "err = %d\n", errno); - return err; - } - - err = socketpair(AF_UNIX, SOCK_STREAM, 0, control_fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "etap_open - control socketpair failed - " - "err = %d\n", errno); - goto out_close_data; - } - - err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0], - control_fds[1], data_fds[0], data_fds[1]); - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - read_output(control_fds[0], output, output_len); - - if (output == NULL) - printk(UM_KERN_ERR "etap_open : failed to allocate output " - "buffer\n"); - else { - printk("%s", output); - kfree(output); - } - - if (err < 0) { - printk(UM_KERN_ERR "etap_tramp failed - err = %d\n", -err); - goto out_close_control; - } - - pri->data_fd = data_fds[0]; - pri->control_fd = control_fds[0]; - iter_addresses(pri->dev, etap_open_addr, &pri->control_fd); - return data_fds[0]; - -out_close_control: - close(control_fds[0]); - close(control_fds[1]); -out_close_data: - close(data_fds[0]); - close(data_fds[1]); - return err; -} - -static void etap_close(int fd, void *data) -{ - struct ethertap_data *pri = data; - - iter_addresses(pri->dev, etap_close_addr, &pri->control_fd); - close(fd); - - if (shutdown(pri->data_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown data socket failed, " - "errno = %d\n", errno); - - if (shutdown(pri->control_fd, SHUT_RDWR) < 0) - printk(UM_KERN_ERR "etap_close - shutdown control socket " - "failed, errno = %d\n", errno); - - close(pri->data_fd); - pri->data_fd = -1; - close(pri->control_fd); - pri->control_fd = -1; -} - -static void etap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if (pri->control_fd == -1) - return; - etap_open_addr(addr, netmask, &pri->control_fd); -} - -static void etap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct ethertap_data *pri = data; - - if (pri->control_fd == -1) - return; - - etap_close_addr(addr, netmask, &pri->control_fd); -} - -const struct net_user_info ethertap_user_info = { - .init = etap_user_init, - .open = etap_open, - .close = etap_close, - .remove = NULL, - .add_address = etap_add_addr, - .delete_address = etap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_ETHERTAP, -}; diff --git a/arch/um/os-Linux/drivers/tuntap.h b/arch/um/os-Linux/drivers/tuntap.h deleted file mode 100644 index e364e42abfc5..000000000000 --- a/arch/um/os-Linux/drivers/tuntap.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __UM_TUNTAP_H -#define __UM_TUNTAP_H - -#include - -struct tuntap_data { - char *dev_name; - int fixed_config; - char *gate_addr; - int fd; - void *dev; -}; - -extern const struct net_user_info tuntap_user_info; - -#endif diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c deleted file mode 100644 index ff022d9cf0dd..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_kern.c +++ /dev/null @@ -1,86 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include "tuntap.h" - -struct tuntap_init { - char *dev_name; - char *gate_addr; -}; - -static void tuntap_init(struct net_device *dev, void *data) -{ - struct uml_net_private *pri; - struct tuntap_data *tpri; - struct tuntap_init *init = data; - - pri = netdev_priv(dev); - tpri = (struct tuntap_data *) pri->user; - tpri->dev_name = init->dev_name; - tpri->fixed_config = (init->dev_name != NULL); - tpri->gate_addr = init->gate_addr; - tpri->fd = -1; - tpri->dev = dev; - - printk(KERN_INFO "TUN/TAP backend - "); - if (tpri->gate_addr != NULL) - printk(KERN_CONT "IP = %s", tpri->gate_addr); - printk(KERN_CONT "\n"); -} - -static int tuntap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_read(fd, skb_mac_header(skb), - skb->dev->mtu + ETH_HEADER_OTHER); -} - -static int tuntap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp) -{ - return net_write(fd, skb->data, skb->len); -} - -const struct net_kern_info tuntap_kern_info = { - .init = tuntap_init, - .protocol = eth_protocol, - .read = tuntap_read, - .write = tuntap_write, -}; - -static int tuntap_setup(char *str, char **mac_out, void *data) -{ - struct tuntap_init *init = data; - - *init = ((struct tuntap_init) - { .dev_name = NULL, - .gate_addr = NULL }); - if (tap_setup_common(str, "tuntap", &init->dev_name, mac_out, - &init->gate_addr)) - return 0; - - return 1; -} - -static struct transport tuntap_transport = { - .list = LIST_HEAD_INIT(tuntap_transport.list), - .name = "tuntap", - .setup = tuntap_setup, - .user = &tuntap_user_info, - .kern = &tuntap_kern_info, - .private_size = sizeof(struct tuntap_data), - .setup_size = sizeof(struct tuntap_init), -}; - -static int register_tuntap(void) -{ - register_transport(&tuntap_transport); - return 0; -} - -late_initcall(register_tuntap); diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c deleted file mode 100644 index 91f0e27ca3a6..000000000000 --- a/arch/um/os-Linux/drivers/tuntap_user.c +++ /dev/null @@ -1,215 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "tuntap.h" - -static int tuntap_user_init(void *data, void *dev) -{ - struct tuntap_data *pri = data; - - pri->dev = dev; - return 0; -} - -static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - tap_check_ips(pri->gate_addr, addr); - if ((pri->fd == -1) || pri->fixed_config) - return; - open_addr(addr, netmask, pri->dev_name); -} - -static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask, - void *data) -{ - struct tuntap_data *pri = data; - - if ((pri->fd == -1) || pri->fixed_config) - return; - close_addr(addr, netmask, pri->dev_name); -} - -struct tuntap_pre_exec_data { - int stdout_fd; - int close_me; -}; - -static void tuntap_pre_exec(void *arg) -{ - struct tuntap_pre_exec_data *data = arg; - - dup2(data->stdout_fd, 1); - close(data->close_me); -} - -static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote, - char *buffer, int buffer_len, int *used_out) -{ - struct tuntap_pre_exec_data data; - char version_buf[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate, - NULL }; - char buf[CMSG_SPACE(sizeof(*fd_out))]; - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int pid, n, err; - - sprintf(version_buf, "%d", UML_NET_VERSION); - - data.stdout_fd = remote; - data.close_me = me; - - pid = run_helper(tuntap_pre_exec, &data, argv); - - if (pid < 0) - return pid; - - close(remote); - - msg.msg_name = NULL; - msg.msg_namelen = 0; - if (buffer != NULL) { - iov = ((struct iovec) { buffer, buffer_len }); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - } - else { - msg.msg_iov = NULL; - msg.msg_iovlen = 0; - } - msg.msg_control = buf; - msg.msg_controllen = sizeof(buf); - msg.msg_flags = 0; - n = recvmsg(me, &msg, 0); - *used_out = n; - if (n < 0) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open_tramp : recvmsg failed - " - "errno = %d\n", errno); - return err; - } - helper_wait(pid); - - cmsg = CMSG_FIRSTHDR(&msg); - if (cmsg == NULL) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "message\n"); - return -EINVAL; - } - if ((cmsg->cmsg_level != SOL_SOCKET) || - (cmsg->cmsg_type != SCM_RIGHTS)) { - printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a " - "descriptor\n"); - return -EINVAL; - } - *fd_out = ((int *) CMSG_DATA(cmsg))[0]; - os_set_exec_close(*fd_out); - return 0; -} - -static int tuntap_open(void *data) -{ - struct ifreq ifr; - struct tuntap_data *pri = data; - char *output, *buffer; - int err, fds[2], len, used; - - err = tap_open_common(pri->dev, pri->gate_addr); - if (err < 0) - return err; - - if (pri->fixed_config) { - pri->fd = os_open_file("/dev/net/tun", - of_cloexec(of_rdwr(OPENFLAGS())), 0); - if (pri->fd < 0) { - printk(UM_KERN_ERR "Failed to open /dev/net/tun, " - "err = %d\n", -pri->fd); - return pri->fd; - } - memset(&ifr, 0, sizeof(ifr)); - ifr.ifr_flags = IFF_TAP | IFF_NO_PI; - strscpy(ifr.ifr_name, pri->dev_name); - if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) { - err = -errno; - printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n", - errno); - close(pri->fd); - return err; - } - } - else { - err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds); - if (err) { - err = -errno; - printk(UM_KERN_ERR "tuntap_open : socketpair failed - " - "errno = %d\n", errno); - return err; - } - - buffer = get_output_buffer(&len); - if (buffer != NULL) - len--; - used = 0; - - err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0], - fds[1], buffer, len, &used); - - output = buffer; - if (err < 0) { - printk("%s", output); - free_output_buffer(buffer); - printk(UM_KERN_ERR "tuntap_open_tramp failed - " - "err = %d\n", -err); - return err; - } - - pri->dev_name = uml_strdup(buffer); - output += IFNAMSIZ; - printk("%s", output); - free_output_buffer(buffer); - - close(fds[0]); - iter_addresses(pri->dev, open_addr, pri->dev_name); - } - - return pri->fd; -} - -static void tuntap_close(int fd, void *data) -{ - struct tuntap_data *pri = data; - - if (!pri->fixed_config) - iter_addresses(pri->dev, close_addr, pri->dev_name); - close(fd); - pri->fd = -1; -} - -const struct net_user_info tuntap_user_info = { - .init = tuntap_user_init, - .open = tuntap_open, - .close = tuntap_close, - .remove = NULL, - .add_address = tuntap_add_addr, - .delete_address = tuntap_del_addr, - .mtu = ETH_MAX_PACKET, - .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER, -}; diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c index a0d01c68ce3e..617886d1fb1e 100644 --- a/arch/um/os-Linux/file.c +++ b/arch/um/os-Linux/file.c @@ -106,21 +106,6 @@ int os_get_ifname(int fd, char* namebuf) return 0; } -int os_set_slip(int fd) -{ - int disc, sencap; - - disc = N_SLIP; - if (ioctl(fd, TIOCSETD, &disc) < 0) - return -errno; - - sencap = 0; - if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0) - return -errno; - - return 0; -} - int os_mode_fd(int fd, int mode) { int err; -- cgit v1.2.3 From b555cb66583e99158cfef8e91c025252cefae55b Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Sat, 3 May 2025 13:17:09 +0800 Subject: um: vector: Eliminate the dependency on uml_net The only dependency on uml_net (i.e., the legacy network transport infrastructure) is the call to uml_net_setup_etheraddr(). Implement it inside vector to eliminate the uml_net dependency completely. It will allow us to remove uml_net in the next step. Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20250503051710.3286595-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/Kconfig | 1 - arch/um/drivers/vector_kern.c | 52 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index bc68c3e341f5..1bd4e053be04 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -145,7 +145,6 @@ config UML_NET config UML_NET_VECTOR bool "Vector I/O high performance network devices" - depends on UML_NET select MAY_HAVE_RUNTIME_DEPS help This User-Mode Linux network driver uses multi-message send diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index b97bb52dd562..0d5c96897fa0 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include "mconsole_kern.h" #include "vector_user.h" @@ -1539,7 +1538,56 @@ static void vector_timer_expire(struct timer_list *t) napi_schedule(&vp->napi); } +static void vector_setup_etheraddr(struct net_device *dev, char *str) +{ + u8 addr[ETH_ALEN]; + char *end; + int i; + if (str == NULL) + goto random; + + for (i = 0; i < 6; i++) { + addr[i] = simple_strtoul(str, &end, 16); + if ((end == str) || + ((*end != ':') && (*end != ',') && (*end != '\0'))) { + printk(KERN_ERR + "setup_etheraddr: failed to parse '%s' " + "as an ethernet address\n", str); + goto random; + } + str = end + 1; + } + if (is_multicast_ether_addr(addr)) { + printk(KERN_ERR + "Attempt to assign a multicast ethernet address to a " + "device disallowed\n"); + goto random; + } + if (!is_valid_ether_addr(addr)) { + printk(KERN_ERR + "Attempt to assign an invalid ethernet address to a " + "device disallowed\n"); + goto random; + } + if (!is_local_ether_addr(addr)) { + printk(KERN_WARNING + "Warning: Assigning a globally valid ethernet " + "address to a device\n"); + printk(KERN_WARNING "You should set the 2nd rightmost bit in " + "the first byte of the MAC,\n"); + printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", + addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], + addr[5]); + } + eth_hw_addr_set(dev, addr); + return; + +random: + printk(KERN_INFO + "Choosing a random ethernet address for device %s\n", dev->name); + eth_hw_addr_random(dev); +} static void vector_eth_configure( int n, @@ -1574,7 +1622,7 @@ static void vector_eth_configure( * and fail. */ snprintf(dev->name, sizeof(dev->name), "vec%d", n); - uml_net_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac")); + vector_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac")); vp = netdev_priv(dev); /* sysfs register */ -- cgit v1.2.3 From e619e18ed462bded8e8f12672a37053d39451404 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Sat, 3 May 2025 13:17:10 +0800 Subject: um: Remove legacy network transport infrastructure All legacy network transports have been removed. Vector transports provide the same capabilities with significantly higher network throughput. There is no reason to keep the legacy network transport infrastructure anymore. Remove it to reduce the maintenance burden. Signed-off-by: Tiwei Bie Acked-By: Anton Ivanov Link: https://patch.msgid.link/20250503051710.3286595-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/configs/i386_defconfig | 1 - arch/um/configs/x86_64_defconfig | 1 - arch/um/drivers/Kconfig | 27 +- arch/um/drivers/Makefile | 2 - arch/um/drivers/net_kern.c | 889 -------------------------------------- arch/um/drivers/net_user.c | 271 ------------ arch/um/include/shared/net_kern.h | 69 --- arch/um/include/shared/net_user.h | 52 --- 8 files changed, 5 insertions(+), 1307 deletions(-) delete mode 100644 arch/um/drivers/net_kern.c delete mode 100644 arch/um/drivers/net_user.c delete mode 100644 arch/um/include/shared/net_kern.h delete mode 100644 arch/um/include/shared/net_user.h (limited to 'arch') diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig index 7c8999f18f2d..29d9666eceae 100644 --- a/arch/um/configs/i386_defconfig +++ b/arch/um/configs/i386_defconfig @@ -52,7 +52,6 @@ CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set -CONFIG_UML_NET=y CONFIG_EXT4_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig index 9858d989777e..cf309c5406a2 100644 --- a/arch/um/configs/x86_64_defconfig +++ b/arch/um/configs/x86_64_defconfig @@ -51,7 +51,6 @@ CONFIG_PACKET=y CONFIG_UNIX=y CONFIG_INET=y # CONFIG_IPV6 is not set -CONFIG_UML_NET=y CONFIG_EXT4_FS=y CONFIG_QUOTA=y CONFIG_AUTOFS_FS=m diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index 1bd4e053be04..34085bfc6d41 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -124,25 +124,6 @@ endmenu menu "UML Network Devices" depends on NET -# UML virtual driver -config UML_NET - bool "Virtual network device" - help - While the User-Mode port cannot directly talk to any physical - hardware devices, this choice and the following transport options - provide one or more virtual network devices through which the UML - kernels can talk to each other, the host, and with the host's help, - machines on the outside world. - - For more information, including explanations of the networking and - sample configurations, see - . - - If you'd like to be able to enable networking in the User-Mode - linux environment, say Y; otherwise say N. Note that you must - enable at least one of the following transport options to actually - make use of UML networking. - config UML_NET_VECTOR bool "Vector I/O high performance network devices" select MAY_HAVE_RUNTIME_DEPS @@ -150,9 +131,11 @@ config UML_NET_VECTOR This User-Mode Linux network driver uses multi-message send and receive functions. The host running the UML guest must have a linux kernel version above 3.0 and a libc version > 2.13. - This driver provides tap, raw, gre and l2tpv3 network transports - with up to 4 times higher network throughput than the UML network - drivers. + This driver provides tap, raw, gre and l2tpv3 network transports. + + For more information, including explanations of the networking + and sample configurations, see + . endmenu diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 6dfc2e9c0ce8..6bf8cbf71d3c 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -7,7 +7,6 @@ # in to pcap.o vector-objs := vector_kern.o vector_user.o vector_transports.o -net-objs := net_kern.o net_user.o mconsole-objs := mconsole_kern.o mconsole_user.o hostaudio-objs := hostaudio_kern.o ubd-objs := ubd_kern.o ubd_user.o @@ -29,7 +28,6 @@ obj-$(CONFIG_SSL) += ssl.o obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o obj-$(CONFIG_UML_NET_VECTOR) += vector.o -obj-$(CONFIG_UML_NET) += net.o obj-$(CONFIG_MCONSOLE) += mconsole.o obj-$(CONFIG_MMAPPER) += mmapper_kern.o obj-$(CONFIG_BLK_DEV_UBD) += ubd.o diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c deleted file mode 100644 index d5a9c5aabaec..000000000000 --- a/arch/um/drivers/net_kern.c +++ /dev/null @@ -1,889 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and - * James Leu (jleu@mindspring.net). - * Copyright (C) 2001 by various other people who didn't put their name here. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "mconsole_kern.h" -#include -#include - -#define DRIVER_NAME "uml-netdev" - -static DEFINE_SPINLOCK(opened_lock); -static LIST_HEAD(opened); - -/* - * The drop_skb is used when we can't allocate an skb. The - * packet is read into drop_skb in order to get the data off the - * connection to the host. - * It is reallocated whenever a maximum packet size is seen which is - * larger than any seen before. update_drop_skb is called from - * eth_configure when a new interface is added. - */ -static DEFINE_SPINLOCK(drop_lock); -static struct sk_buff *drop_skb; -static int drop_max; - -static int update_drop_skb(int max) -{ - struct sk_buff *new; - unsigned long flags; - int err = 0; - - spin_lock_irqsave(&drop_lock, flags); - - if (max <= drop_max) - goto out; - - err = -ENOMEM; - new = dev_alloc_skb(max); - if (new == NULL) - goto out; - - skb_put(new, max); - - kfree_skb(drop_skb); - drop_skb = new; - drop_max = max; - err = 0; -out: - spin_unlock_irqrestore(&drop_lock, flags); - - return err; -} - -static int uml_net_rx(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - int pkt_len; - struct sk_buff *skb; - - /* If we can't allocate memory, try again next round. */ - skb = dev_alloc_skb(lp->max_packet); - if (skb == NULL) { - drop_skb->dev = dev; - /* Read a packet into drop_skb and don't do anything with it. */ - (*lp->read)(lp->fd, drop_skb, lp); - dev->stats.rx_dropped++; - return 0; - } - - skb->dev = dev; - skb_put(skb, lp->max_packet); - skb_reset_mac_header(skb); - pkt_len = (*lp->read)(lp->fd, skb, lp); - - if (pkt_len > 0) { - skb_trim(skb, pkt_len); - skb->protocol = (*lp->protocol)(skb); - - dev->stats.rx_bytes += skb->len; - dev->stats.rx_packets++; - netif_rx(skb); - return pkt_len; - } - - kfree_skb(skb); - return pkt_len; -} - -static void uml_dev_close(struct work_struct *work) -{ - struct uml_net_private *lp = - container_of(work, struct uml_net_private, work); - dev_close(lp->dev); -} - -static irqreturn_t uml_net_interrupt(int irq, void *dev_id) -{ - struct net_device *dev = dev_id; - struct uml_net_private *lp = netdev_priv(dev); - int err; - - if (!netif_running(dev)) - return IRQ_NONE; - - spin_lock(&lp->lock); - while ((err = uml_net_rx(dev)) > 0) ; - if (err < 0) { - printk(KERN_ERR - "Device '%s' read returned %d, shutting it down\n", - dev->name, err); - /* dev_close can't be called in interrupt context, and takes - * again lp->lock. - * And dev_close() can be safely called multiple times on the - * same device, since it tests for (dev->flags & IFF_UP). So - * there's no harm in delaying the device shutdown. - * Furthermore, the workqueue will not re-enqueue an already - * enqueued work item. */ - schedule_work(&lp->work); - goto out; - } -out: - spin_unlock(&lp->lock); - return IRQ_HANDLED; -} - -static int uml_net_open(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - int err; - - if (lp->fd >= 0) { - err = -ENXIO; - goto out; - } - - lp->fd = (*lp->open)(&lp->user); - if (lp->fd < 0) { - err = lp->fd; - goto out; - } - - err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt, - IRQF_SHARED, dev->name, dev); - if (err < 0) { - printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err); - err = -ENETUNREACH; - goto out_close; - } - - netif_start_queue(dev); - - /* clear buffer - it can happen that the host side of the interface - * is full when we get here. In this case, new data is never queued, - * SIGIOs never arrive, and the net never works. - */ - while ((err = uml_net_rx(dev)) > 0) ; - - spin_lock(&opened_lock); - list_add(&lp->list, &opened); - spin_unlock(&opened_lock); - - return 0; -out_close: - if (lp->close != NULL) (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; -out: - return err; -} - -static int uml_net_close(struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - - netif_stop_queue(dev); - - um_free_irq(dev->irq, dev); - if (lp->close != NULL) - (*lp->close)(lp->fd, &lp->user); - lp->fd = -1; - - spin_lock(&opened_lock); - list_del(&lp->list); - spin_unlock(&opened_lock); - - return 0; -} - -static netdev_tx_t uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct uml_net_private *lp = netdev_priv(dev); - unsigned long flags; - int len; - - netif_stop_queue(dev); - - spin_lock_irqsave(&lp->lock, flags); - - len = (*lp->write)(lp->fd, skb, lp); - skb_tx_timestamp(skb); - - if (len == skb->len) { - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - netif_trans_update(dev); - netif_start_queue(dev); - - /* this is normally done in the interrupt when tx finishes */ - netif_wake_queue(dev); - } - else if (len == 0) { - netif_start_queue(dev); - dev->stats.tx_dropped++; - } - else { - netif_start_queue(dev); - printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len); - } - - spin_unlock_irqrestore(&lp->lock, flags); - - dev_consume_skb_any(skb); - - return NETDEV_TX_OK; -} - -static void uml_net_set_multicast_list(struct net_device *dev) -{ - return; -} - -static void uml_net_tx_timeout(struct net_device *dev, unsigned int txqueue) -{ - netif_trans_update(dev); - netif_wake_queue(dev); -} - -#ifdef CONFIG_NET_POLL_CONTROLLER -static void uml_net_poll_controller(struct net_device *dev) -{ - disable_irq(dev->irq); - uml_net_interrupt(dev->irq, dev); - enable_irq(dev->irq); -} -#endif - -static void uml_net_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) -{ - strscpy(info->driver, DRIVER_NAME); -} - -static const struct ethtool_ops uml_net_ethtool_ops = { - .get_drvinfo = uml_net_get_drvinfo, - .get_link = ethtool_op_get_link, - .get_ts_info = ethtool_op_get_ts_info, -}; - -void uml_net_setup_etheraddr(struct net_device *dev, char *str) -{ - u8 addr[ETH_ALEN]; - char *end; - int i; - - if (str == NULL) - goto random; - - for (i = 0; i < 6; i++) { - addr[i] = simple_strtoul(str, &end, 16); - if ((end == str) || - ((*end != ':') && (*end != ',') && (*end != '\0'))) { - printk(KERN_ERR - "setup_etheraddr: failed to parse '%s' " - "as an ethernet address\n", str); - goto random; - } - str = end + 1; - } - if (is_multicast_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign a multicast ethernet address to a " - "device disallowed\n"); - goto random; - } - if (!is_valid_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign an invalid ethernet address to a " - "device disallowed\n"); - goto random; - } - if (!is_local_ether_addr(addr)) { - printk(KERN_WARNING - "Warning: Assigning a globally valid ethernet " - "address to a device\n"); - printk(KERN_WARNING "You should set the 2nd rightmost bit in " - "the first byte of the MAC,\n"); - printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", - addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], - addr[5]); - } - eth_hw_addr_set(dev, addr); - return; - -random: - printk(KERN_INFO - "Choosing a random ethernet address for device %s\n", dev->name); - eth_hw_addr_random(dev); -} - -static DEFINE_SPINLOCK(devices_lock); -static LIST_HEAD(devices); - -static struct platform_driver uml_net_driver = { - .driver = { - .name = DRIVER_NAME, - }, -}; - -static void net_device_release(struct device *dev) -{ - struct uml_net *device = container_of(dev, struct uml_net, pdev.dev); - struct net_device *netdev = device->dev; - struct uml_net_private *lp = netdev_priv(netdev); - - if (lp->remove != NULL) - (*lp->remove)(&lp->user); - list_del(&device->list); - kfree(device); - free_netdev(netdev); -} - -static const struct net_device_ops uml_netdev_ops = { - .ndo_open = uml_net_open, - .ndo_stop = uml_net_close, - .ndo_start_xmit = uml_net_start_xmit, - .ndo_set_rx_mode = uml_net_set_multicast_list, - .ndo_tx_timeout = uml_net_tx_timeout, - .ndo_set_mac_address = eth_mac_addr, - .ndo_validate_addr = eth_validate_addr, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = uml_net_poll_controller, -#endif -}; - -/* - * Ensures that platform_driver_register is called only once by - * eth_configure. Will be set in an initcall. - */ -static int driver_registered; - -static void eth_configure(int n, void *init, char *mac, - struct transport *transport, gfp_t gfp_mask) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - int err, size; - - size = transport->private_size + sizeof(struct uml_net_private); - - device = kzalloc(sizeof(*device), gfp_mask); - if (device == NULL) { - printk(KERN_ERR "eth_configure failed to allocate struct " - "uml_net\n"); - return; - } - - dev = alloc_etherdev(size); - if (dev == NULL) { - printk(KERN_ERR "eth_configure: failed to allocate struct " - "net_device for eth%d\n", n); - goto out_free_device; - } - - INIT_LIST_HEAD(&device->list); - device->index = n; - - /* If this name ends up conflicting with an existing registered - * netdevice, that is OK, register_netdev{,ice}() will notice this - * and fail. - */ - snprintf(dev->name, sizeof(dev->name), "eth%d", n); - - uml_net_setup_etheraddr(dev, mac); - - printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr); - - lp = netdev_priv(dev); - /* This points to the transport private data. It's still clear, but we - * must memset it to 0 *now*. Let's help the drivers. */ - memset(lp, 0, size); - INIT_WORK(&lp->work, uml_dev_close); - - /* sysfs register */ - if (!driver_registered) { - platform_driver_register(¨_net_driver); - driver_registered = 1; - } - device->pdev.id = n; - device->pdev.name = DRIVER_NAME; - device->pdev.dev.release = net_device_release; - dev_set_drvdata(&device->pdev.dev, device); - if (platform_device_register(&device->pdev)) - goto out_free_netdev; - SET_NETDEV_DEV(dev,&device->pdev.dev); - - device->dev = dev; - - /* - * These just fill in a data structure, so there's no failure - * to be worried about. - */ - (*transport->kern->init)(dev, init); - - *lp = ((struct uml_net_private) - { .list = LIST_HEAD_INIT(lp->list), - .dev = dev, - .fd = -1, - .mac = { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0}, - .max_packet = transport->user->max_packet, - .protocol = transport->kern->protocol, - .open = transport->user->open, - .close = transport->user->close, - .remove = transport->user->remove, - .read = transport->kern->read, - .write = transport->kern->write, - .add_address = transport->user->add_address, - .delete_address = transport->user->delete_address }); - - spin_lock_init(&lp->lock); - memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac)); - - if ((transport->user->init != NULL) && - ((*transport->user->init)(&lp->user, dev) != 0)) - goto out_unregister; - - dev->mtu = transport->user->mtu; - dev->netdev_ops = ¨_netdev_ops; - dev->ethtool_ops = ¨_net_ethtool_ops; - dev->watchdog_timeo = (HZ >> 1); - dev->irq = UM_ETH_IRQ; - - err = update_drop_skb(lp->max_packet); - if (err) - goto out_undo_user_init; - - rtnl_lock(); - err = register_netdevice(dev); - rtnl_unlock(); - if (err) - goto out_undo_user_init; - - spin_lock(&devices_lock); - list_add(&device->list, &devices); - spin_unlock(&devices_lock); - - return; - -out_undo_user_init: - if (transport->user->remove != NULL) - (*transport->user->remove)(&lp->user); -out_unregister: - platform_device_unregister(&device->pdev); - return; /* platform_device_unregister frees dev and device */ -out_free_netdev: - free_netdev(dev); -out_free_device: - kfree(device); -} - -static struct uml_net *find_device(int n) -{ - struct uml_net *device; - struct list_head *ele; - - spin_lock(&devices_lock); - list_for_each(ele, &devices) { - device = list_entry(ele, struct uml_net, list); - if (device->index == n) - goto out; - } - device = NULL; - out: - spin_unlock(&devices_lock); - return device; -} - -static int eth_parse(char *str, int *index_out, char **str_out, - char **error_out) -{ - char *end; - int n, err = -EINVAL; - - n = simple_strtoul(str, &end, 0); - if (end == str) { - *error_out = "Bad device number"; - return err; - } - - str = end; - if (*str != '=') { - *error_out = "Expected '=' after device number"; - return err; - } - - str++; - if (find_device(n)) { - *error_out = "Device already configured"; - return err; - } - - *index_out = n; - *str_out = str; - return 0; -} - -struct eth_init { - struct list_head list; - char *init; - int index; -}; - -static DEFINE_SPINLOCK(transports_lock); -static LIST_HEAD(transports); - -/* Filled in during early boot */ -static LIST_HEAD(eth_cmd_line); - -static int check_transport(struct transport *transport, char *eth, int n, - void **init_out, char **mac_out, gfp_t gfp_mask) -{ - int len; - - len = strlen(transport->name); - if (strncmp(eth, transport->name, len)) - return 0; - - eth += len; - if (*eth == ',') - eth++; - else if (*eth != '\0') - return 0; - - *init_out = kmalloc(transport->setup_size, gfp_mask); - if (*init_out == NULL) - return 1; - - if (!transport->setup(eth, mac_out, *init_out)) { - kfree(*init_out); - *init_out = NULL; - } - return 1; -} - -void register_transport(struct transport *new) -{ - struct list_head *ele, *next; - struct eth_init *eth; - void *init; - char *mac = NULL; - int match; - - spin_lock(&transports_lock); - BUG_ON(!list_empty(&new->list)); - list_add(&new->list, &transports); - spin_unlock(&transports_lock); - - list_for_each_safe(ele, next, ð_cmd_line) { - eth = list_entry(ele, struct eth_init, list); - match = check_transport(new, eth->init, eth->index, &init, - &mac, GFP_KERNEL); - if (!match) - continue; - else if (init != NULL) { - eth_configure(eth->index, init, mac, new, GFP_KERNEL); - kfree(init); - } - list_del(ð->list); - } -} - -static int eth_setup_common(char *str, int index) -{ - struct list_head *ele; - struct transport *transport; - void *init; - char *mac = NULL; - int found = 0; - - spin_lock(&transports_lock); - list_for_each(ele, &transports) { - transport = list_entry(ele, struct transport, list); - if (!check_transport(transport, str, index, &init, - &mac, GFP_ATOMIC)) - continue; - if (init != NULL) { - eth_configure(index, init, mac, transport, GFP_ATOMIC); - kfree(init); - } - found = 1; - break; - } - - spin_unlock(&transports_lock); - return found; -} - -static int __init eth_setup(char *str) -{ - struct eth_init *new; - char *error; - int n, err; - - err = eth_parse(str, &n, &str, &error); - if (err) { - printk(KERN_ERR "eth_setup - Couldn't parse '%s' : %s\n", - str, error); - return 1; - } - - new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); - - INIT_LIST_HEAD(&new->list); - new->index = n; - new->init = str; - - list_add_tail(&new->list, ð_cmd_line); - return 1; -} - -__setup("eth", eth_setup); -__uml_help(eth_setup, -"eth[0-9]+=,\n" -" Configure a network device.\n\n" -); - -static int net_config(char *str, char **error_out) -{ - int n, err; - - err = eth_parse(str, &n, &str, error_out); - if (err) - return err; - - /* This string is broken up and the pieces used by the underlying - * driver. So, it is freed only if eth_setup_common fails. - */ - str = kstrdup(str, GFP_KERNEL); - if (str == NULL) { - *error_out = "net_config failed to strdup string"; - return -ENOMEM; - } - err = !eth_setup_common(str, n); - if (err) - kfree(str); - return err; -} - -static int net_id(char **str, int *start_out, int *end_out) -{ - char *end; - int n; - - n = simple_strtoul(*str, &end, 0); - if ((*end != '\0') || (end == *str)) - return -1; - - *start_out = n; - *end_out = n; - *str = end; - return n; -} - -static int net_remove(int n, char **error_out) -{ - struct uml_net *device; - struct net_device *dev; - struct uml_net_private *lp; - - device = find_device(n); - if (device == NULL) - return -ENODEV; - - dev = device->dev; - lp = netdev_priv(dev); - if (lp->fd > 0) - return -EBUSY; - unregister_netdev(dev); - platform_device_unregister(&device->pdev); - - return 0; -} - -static struct mc_device net_mc = { - .list = LIST_HEAD_INIT(net_mc.list), - .name = "eth", - .config = net_config, - .get_config = NULL, - .id = net_id, - .remove = net_remove, -}; - -#ifdef CONFIG_INET -static int uml_inetaddr_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct in_ifaddr *ifa = ptr; - struct net_device *dev = ifa->ifa_dev->dev; - struct uml_net_private *lp; - void (*proc)(unsigned char *, unsigned char *, void *); - unsigned char addr_buf[4], netmask_buf[4]; - - if (dev->netdev_ops->ndo_open != uml_net_open) - return NOTIFY_DONE; - - lp = netdev_priv(dev); - - proc = NULL; - switch (event) { - case NETDEV_UP: - proc = lp->add_address; - break; - case NETDEV_DOWN: - proc = lp->delete_address; - break; - } - if (proc != NULL) { - memcpy(addr_buf, &ifa->ifa_address, sizeof(addr_buf)); - memcpy(netmask_buf, &ifa->ifa_mask, sizeof(netmask_buf)); - (*proc)(addr_buf, netmask_buf, &lp->user); - } - return NOTIFY_DONE; -} - -/* uml_net_init shouldn't be called twice on two CPUs at the same time */ -static struct notifier_block uml_inetaddr_notifier = { - .notifier_call = uml_inetaddr_event, -}; - -static void inet_register(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - struct in_device *ip; - struct in_ifaddr *in; - - register_inetaddr_notifier(¨_inetaddr_notifier); - - /* Devices may have been opened already, so the uml_inetaddr_notifier - * didn't get a chance to run for them. This fakes it so that - * addresses which have already been set up get handled properly. - */ - spin_lock(&opened_lock); - list_for_each(ele, &opened) { - lp = list_entry(ele, struct uml_net_private, list); - ip = lp->dev->ip_ptr; - if (ip == NULL) - continue; - in = ip->ifa_list; - while (in != NULL) { - uml_inetaddr_event(NULL, NETDEV_UP, in); - in = in->ifa_next; - } - } - spin_unlock(&opened_lock); -} -#else -static inline void inet_register(void) -{ -} -#endif - -static int uml_net_init(void) -{ - mconsole_register_dev(&net_mc); - inet_register(); - return 0; -} - -__initcall(uml_net_init); - -static void close_devices(void) -{ - struct list_head *ele; - struct uml_net_private *lp; - - spin_lock(&opened_lock); - list_for_each(ele, &opened) { - lp = list_entry(ele, struct uml_net_private, list); - um_free_irq(lp->dev->irq, lp->dev); - if ((lp->close != NULL) && (lp->fd >= 0)) - (*lp->close)(lp->fd, &lp->user); - if (lp->remove != NULL) - (*lp->remove)(&lp->user); - } - spin_unlock(&opened_lock); -} - -__uml_exitcall(close_devices); - -void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *, - void *), - void *arg) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - unsigned char address[4], netmask[4]; - - if (ip == NULL) return; - in = ip->ifa_list; - while (in != NULL) { - memcpy(address, &in->ifa_address, sizeof(address)); - memcpy(netmask, &in->ifa_mask, sizeof(netmask)); - (*cb)(address, netmask, arg); - in = in->ifa_next; - } -} - -int dev_netmask(void *d, void *m) -{ - struct net_device *dev = d; - struct in_device *ip = dev->ip_ptr; - struct in_ifaddr *in; - __be32 *mask_out = m; - - if (ip == NULL) - return 1; - - in = ip->ifa_list; - if (in == NULL) - return 1; - - *mask_out = in->ifa_mask; - return 0; -} - -void *get_output_buffer(int *len_out) -{ - void *ret; - - ret = (void *) __get_free_pages(GFP_KERNEL, 0); - if (ret) *len_out = PAGE_SIZE; - else *len_out = 0; - return ret; -} - -void free_output_buffer(void *buffer) -{ - free_pages((unsigned long) buffer, 0); -} - -int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out, - char **gate_addr) -{ - char *remain; - - remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL); - if (remain != NULL) { - printk(KERN_ERR "tap_setup_common - Extra garbage on " - "specification : '%s'\n", remain); - return 1; - } - - return 0; -} - -unsigned short eth_protocol(struct sk_buff *skb) -{ - return eth_type_trans(skb, skb->dev); -} diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c deleted file mode 100644 index 4c9576452ab0..000000000000 --- a/arch/um/drivers/net_user.c +++ /dev/null @@ -1,271 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int tap_open_common(void *dev, char *gate_addr) -{ - int tap_addr[4]; - - if (gate_addr == NULL) - return 0; - if (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], - &tap_addr[1], &tap_addr[2], &tap_addr[3]) != 4) { - printk(UM_KERN_ERR "Invalid tap IP address - '%s'\n", - gate_addr); - return -EINVAL; - } - return 0; -} - -void tap_check_ips(char *gate_addr, unsigned char *eth_addr) -{ - int tap_addr[4]; - - if ((gate_addr != NULL) && - (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0], - &tap_addr[1], &tap_addr[2], &tap_addr[3]) == 4) && - (eth_addr[0] == tap_addr[0]) && - (eth_addr[1] == tap_addr[1]) && - (eth_addr[2] == tap_addr[2]) && - (eth_addr[3] == tap_addr[3])) { - printk(UM_KERN_ERR "The tap IP address and the UML eth IP " - "address must be different\n"); - } -} - -/* Do reliable error handling as this fails frequently enough. */ -void read_output(int fd, char *output, int len) -{ - int remain, ret, expected; - char c; - char *str; - - if (output == NULL) { - output = &c; - len = sizeof(c); - } - - *output = '\0'; - ret = read(fd, &remain, sizeof(remain)); - - if (ret != sizeof(remain)) { - if (ret < 0) - ret = -errno; - expected = sizeof(remain); - str = "length"; - goto err; - } - - while (remain != 0) { - expected = (remain < len) ? remain : len; - ret = read(fd, output, expected); - if (ret != expected) { - if (ret < 0) - ret = -errno; - str = "data"; - goto err; - } - remain -= ret; - } - - return; - -err: - if (ret < 0) - printk(UM_KERN_ERR "read_output - read of %s failed, " - "errno = %d\n", str, -ret); - else - printk(UM_KERN_ERR "read_output - read of %s failed, read only " - "%d of %d bytes\n", str, ret, expected); -} - -int net_read(int fd, void *buf, int len) -{ - int n; - - n = read(fd, buf, len); - - if ((n < 0) && (errno == EAGAIN)) - return 0; - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_recvfrom(int fd, void *buf, int len) -{ - int n; - - CATCH_EINTR(n = recvfrom(fd, buf, len, 0, NULL, NULL)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_write(int fd, void *buf, int len) -{ - int n; - - n = write(fd, buf, len); - - if ((n < 0) && (errno == EAGAIN)) - return 0; - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_send(int fd, void *buf, int len) -{ - int n; - - CATCH_EINTR(n = send(fd, buf, len, 0)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -int net_sendto(int fd, void *buf, int len, void *to, int sock_len) -{ - int n; - - CATCH_EINTR(n = sendto(fd, buf, len, 0, (struct sockaddr *) to, - sock_len)); - if (n < 0) { - if (errno == EAGAIN) - return 0; - return -errno; - } - else if (n == 0) - return -ENOTCONN; - return n; -} - -struct change_pre_exec_data { - int close_me; - int stdout_fd; -}; - -static void change_pre_exec(void *arg) -{ - struct change_pre_exec_data *data = arg; - - close(data->close_me); - dup2(data->stdout_fd, 1); -} - -static int change_tramp(char **argv, char *output, int output_len) -{ - int pid, fds[2], err; - struct change_pre_exec_data pe_data; - - err = os_pipe(fds, 1, 0); - if (err < 0) { - printk(UM_KERN_ERR "change_tramp - pipe failed, err = %d\n", - -err); - return err; - } - pe_data.close_me = fds[0]; - pe_data.stdout_fd = fds[1]; - pid = run_helper(change_pre_exec, &pe_data, argv); - - if (pid > 0) /* Avoid hang as we won't get data in failure case. */ - read_output(fds[0], output, output_len); - - close(fds[0]); - close(fds[1]); - - if (pid > 0) - helper_wait(pid); - return pid; -} - -static void change(char *dev, char *what, unsigned char *addr, - unsigned char *netmask) -{ - char addr_buf[sizeof("255.255.255.255\0")]; - char netmask_buf[sizeof("255.255.255.255\0")]; - char version[sizeof("nnnnn\0")]; - char *argv[] = { "uml_net", version, what, dev, addr_buf, - netmask_buf, NULL }; - char *output; - int output_len, pid; - - sprintf(version, "%d", UML_NET_VERSION); - sprintf(addr_buf, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]); - sprintf(netmask_buf, "%d.%d.%d.%d", netmask[0], netmask[1], - netmask[2], netmask[3]); - - output_len = UM_KERN_PAGE_SIZE; - output = uml_kmalloc(output_len, UM_GFP_KERNEL); - if (output == NULL) - printk(UM_KERN_ERR "change : failed to allocate output " - "buffer\n"); - - pid = change_tramp(argv, output, output_len); - if (pid < 0) { - kfree(output); - return; - } - - if (output != NULL) { - printk("%s", output); - kfree(output); - } -} - -void open_addr(unsigned char *addr, unsigned char *netmask, void *arg) -{ - change(arg, "add", addr, netmask); -} - -void close_addr(unsigned char *addr, unsigned char *netmask, void *arg) -{ - change(arg, "del", addr, netmask); -} - -char *split_if_spec(char *str, ...) -{ - char **arg, *end, *ret = NULL; - va_list ap; - - va_start(ap, str); - while ((arg = va_arg(ap, char **)) != NULL) { - if (*str == '\0') - goto out; - end = strchr(str, ','); - if (end != str) - *arg = str; - if (end == NULL) - goto out; - *end++ = '\0'; - str = end; - } - ret = str; -out: - va_end(ap); - return ret; -} diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h deleted file mode 100644 index 67b2e9a1f2e5..000000000000 --- a/arch/um/include/shared/net_kern.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __UM_NET_KERN_H -#define __UM_NET_KERN_H - -#include -#include -#include -#include -#include -#include - -struct uml_net { - struct list_head list; - struct net_device *dev; - struct platform_device pdev; - int index; -}; - -struct uml_net_private { - struct list_head list; - spinlock_t lock; - struct net_device *dev; - struct timer_list tl; - - struct work_struct work; - int fd; - unsigned char mac[ETH_ALEN]; - int max_packet; - unsigned short (*protocol)(struct sk_buff *); - int (*open)(void *); - void (*close)(int, void *); - void (*remove)(void *); - int (*read)(int, struct sk_buff *skb, struct uml_net_private *); - int (*write)(int, struct sk_buff *skb, struct uml_net_private *); - - void (*add_address)(unsigned char *, unsigned char *, void *); - void (*delete_address)(unsigned char *, unsigned char *, void *); - char user[]; -}; - -struct net_kern_info { - void (*init)(struct net_device *, void *); - unsigned short (*protocol)(struct sk_buff *); - int (*read)(int, struct sk_buff *skb, struct uml_net_private *); - int (*write)(int, struct sk_buff *skb, struct uml_net_private *); -}; - -struct transport { - struct list_head list; - const char *name; - int (* const setup)(char *, char **, void *); - const struct net_user_info *user; - const struct net_kern_info *kern; - const int private_size; - const int setup_size; -}; - -extern int tap_setup_common(char *str, char *type, char **dev_name, - char **mac_out, char **gate_addr); -extern void register_transport(struct transport *new); -extern unsigned short eth_protocol(struct sk_buff *skb); -extern void uml_net_setup_etheraddr(struct net_device *dev, char *str); - - -#endif diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h deleted file mode 100644 index ba92a4d93531..000000000000 --- a/arch/um/include/shared/net_user.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - */ - -#ifndef __UM_NET_USER_H__ -#define __UM_NET_USER_H__ - -#define ETH_ADDR_LEN (6) -#define ETH_HEADER_ETHERTAP (16) -#define ETH_HEADER_OTHER (26) /* 14 for ethernet + VLAN + MPLS for crazy people */ -#define ETH_MAX_PACKET (1500) - -#define UML_NET_VERSION (4) - -struct net_user_info { - int (*init)(void *, void *); - int (*open)(void *); - void (*close)(int, void *); - void (*remove)(void *); - void (*add_address)(unsigned char *, unsigned char *, void *); - void (*delete_address)(unsigned char *, unsigned char *, void *); - int max_packet; - int mtu; -}; - -extern void iter_addresses(void *d, void (*cb)(unsigned char *, - unsigned char *, void *), - void *arg); - -extern void *get_output_buffer(int *len_out); -extern void free_output_buffer(void *buffer); - -extern int tap_open_common(void *dev, char *gate_addr); -extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr); - -extern void read_output(int fd, char *output_out, int len); - -extern int net_read(int fd, void *buf, int len); -extern int net_recvfrom(int fd, void *buf, int len); -extern int net_write(int fd, void *buf, int len); -extern int net_send(int fd, void *buf, int len); -extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len); - -extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg); -extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg); - -extern char *split_if_spec(char *str, ...); - -extern int dev_netmask(void *d, void *m); - -#endif -- cgit v1.2.3 From 788aa64c01f1262310b4c1fb827a36df170d86ea Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 10 Apr 2025 07:05:22 +0000 Subject: riscv: save the SR_SUM status over switches When threads/tasks are switched we need to ensure the old execution's SR_SUM state is saved and the new thread has the old SR_SUM state restored. The issue was seen under heavy load especially with the syz-stress tool running, with crashes as follows in schedule_tail: Unable to handle kernel access to user memory without uaccess routines at virtual address 000000002749f0d0 Oops [#1] Modules linked in: CPU: 1 PID: 4875 Comm: syz-executor.0 Not tainted 5.12.0-rc2-syzkaller-00467-g0d7588ab9ef9 #0 Hardware name: riscv-virtio,qemu (DT) epc : schedule_tail+0x72/0xb2 kernel/sched/core.c:4264 ra : task_pid_vnr include/linux/sched.h:1421 [inline] ra : schedule_tail+0x70/0xb2 kernel/sched/core.c:4264 epc : ffffffe00008c8b0 ra : ffffffe00008c8ae sp : ffffffe025d17ec0 gp : ffffffe005d25378 tp : ffffffe00f0d0000 t0 : 0000000000000000 t1 : 0000000000000001 t2 : 00000000000f4240 s0 : ffffffe025d17ee0 s1 : 000000002749f0d0 a0 : 000000000000002a a1 : 0000000000000003 a2 : 1ffffffc0cfac500 a3 : ffffffe0000c80cc a4 : 5ae9db91c19bbe00 a5 : 0000000000000000 a6 : 0000000000f00000 a7 : ffffffe000082eba s2 : 0000000000040000 s3 : ffffffe00eef96c0 s4 : ffffffe022c77fe0 s5 : 0000000000004000 s6 : ffffffe067d74e00 s7 : ffffffe067d74850 s8 : ffffffe067d73e18 s9 : ffffffe067d74e00 s10: ffffffe00eef96e8 s11: 000000ae6cdf8368 t3 : 5ae9db91c19bbe00 t4 : ffffffc4043cafb2 t5 : ffffffc4043cafba t6 : 0000000000040000 status: 0000000000000120 badaddr: 000000002749f0d0 cause: 000000000000000f Call Trace: [] schedule_tail+0x72/0xb2 kernel/sched/core.c:4264 [] ret_from_exception+0x0/0x14 Dumping ftrace buffer: (ftrace buffer empty) ---[ end trace b5f8f9231dc87dda ]--- The issue comes from the put_user() in schedule_tail (kernel/sched/core.c) doing the following: asmlinkage __visible void schedule_tail(struct task_struct *prev) { ... if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_child_tid); ... } the put_user() macro causes the code sequence to come out as follows: 1: __enable_user_access() 2: reg = task_pid_vnr(current); 3: *current->set_child_tid = reg; 4: __disable_user_access() The problem is that we may have a sleeping function as argument which could clear SR_SUM causing the panic above. This was fixed by evaluating the argument of the put_user() macro outside the user-enabled section in commit 285a76bb2cf5 ("riscv: evaluate put_user() arg before enabling user access")" In order for riscv to take advantage of unsafe_get/put_XXX() macros and to avoid the same issue we had with put_user() and sleeping functions we must ensure code flow can go through switch_to() from within a region of code with SR_SUM enabled and come back with SR_SUM still enabled. This patch addresses the problem allowing future work to enable full use of unsafe_get/put_XXX() macros without needing to take a CSR bit flip cost on every access. Make switch_to() save and restore SR_SUM. Reported-by: syzbot+e74b94fe601ab9552d69@syzkaller.appspotmail.com Signed-off-by: Ben Dooks Signed-off-by: Cyril Bur Reviewed-by: Alexandre Ghiti Reviewed-by: Deepak Gupta Link: https://lore.kernel.org/r/20250410070526.3160847-2-cyrilbur@tenstorrent.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 1 + arch/riscv/kernel/asm-offsets.c | 5 +++++ arch/riscv/kernel/entry.S | 8 ++++++++ 3 files changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 5f56eb9d114a..58fd11c89fe9 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -103,6 +103,7 @@ struct thread_struct { struct __riscv_d_ext_state fstate; unsigned long bad_cause; unsigned long envcfg; + unsigned long status; u32 riscv_v_flags; u32 vstate_ctrl; struct __riscv_v_ext_state vstate; diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 16490755304e..969c65b1fe41 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -34,6 +34,7 @@ void asm_offsets(void) OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]); OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]); OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]); + OFFSET(TASK_THREAD_STATUS, task_struct, thread.status); OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu); OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count); @@ -346,6 +347,10 @@ void asm_offsets(void) offsetof(struct task_struct, thread.s[11]) - offsetof(struct task_struct, thread.ra) ); + DEFINE(TASK_THREAD_STATUS_RA, + offsetof(struct task_struct, thread.status) + - offsetof(struct task_struct, thread.ra) + ); DEFINE(TASK_THREAD_F0_F0, offsetof(struct task_struct, thread.fstate.f[0]) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 33a5a9f2a0d4..00bd0de9faa2 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -397,9 +397,17 @@ SYM_FUNC_START(__switch_to) REG_S s9, TASK_THREAD_S9_RA(a3) REG_S s10, TASK_THREAD_S10_RA(a3) REG_S s11, TASK_THREAD_S11_RA(a3) + + /* save the user space access flag */ + li s0, SR_SUM + csrr s1, CSR_STATUS + REG_S s1, TASK_THREAD_STATUS_RA(a3) + /* Save the kernel shadow call stack pointer */ scs_save_current /* Restore context from next->thread */ + REG_L s0, TASK_THREAD_STATUS_RA(a4) + csrs CSR_STATUS, s0 REG_L ra, TASK_THREAD_RA_RA(a4) REG_L sp, TASK_THREAD_SP_RA(a4) REG_L s0, TASK_THREAD_S0_RA(a4) -- cgit v1.2.3 From 19500c6dbc5c348564a6513c801ab0889300565a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 10 Apr 2025 07:05:23 +0000 Subject: riscv: implement user_access_begin() and families Currently, when a function like strncpy_from_user() is called, the userspace access protection is disabled and enabled for every word read. By implementing user_access_begin() and families, the protection is disabled at the beginning of the copy and enabled at the end. The __inttype macro is borrowed from x86 implementation. Signed-off-by: Jisheng Zhang Signed-off-by: Cyril Bur Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250410070526.3160847-3-cyrilbur@tenstorrent.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 76 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index fee56b0c8058..c9a461467bf4 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -61,6 +61,19 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigne #define __disable_user_access() \ __asm__ __volatile__ ("csrc sstatus, %0" : : "r" (SR_SUM) : "memory") +/* + * This is the smallest unsigned integer type that can fit a value + * (up to 'long long') + */ +#define __inttype(x) __typeof__( \ + __typefits(x, char, \ + __typefits(x, short, \ + __typefits(x, int, \ + __typefits(x, long, 0ULL))))) + +#define __typefits(x, type, not) \ + __builtin_choose_expr(sizeof(x) <= sizeof(type), (unsigned type)0, not) + /* * The exception table consists of pairs of addresses: the first is the * address of an instruction that is allowed to fault, and the second is @@ -368,6 +381,69 @@ do { \ goto err_label; \ } while (0) +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) +{ + if (unlikely(!access_ok(ptr, len))) + return 0; + __enable_user_access(); + return 1; +} +#define user_access_begin user_access_begin +#define user_access_end __disable_user_access + +static inline unsigned long user_access_save(void) { return 0UL; } +static inline void user_access_restore(unsigned long enabled) { } + +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_put_user(x, ptr, label) do { \ + long __err = 0; \ + __put_user_nocheck(x, (ptr), __err); \ + if (__err) \ + goto label; \ +} while (0) + +#define unsafe_get_user(x, ptr, label) do { \ + long __err = 0; \ + __inttype(*(ptr)) __gu_val; \ + __get_user_nocheck(__gu_val, (ptr), __err); \ + (x) = (__force __typeof__(*(ptr)))__gu_val; \ + if (__err) \ + goto label; \ +} while (0) + +#define unsafe_copy_loop(dst, src, len, type, op, label) \ + while (len >= sizeof(type)) { \ + op(*(type *)(src), (type __user *)(dst), label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst, _src, _len, label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, unsafe_put_user, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, unsafe_put_user, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, unsafe_put_user, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, unsafe_put_user, label); \ +} while (0) + +#define unsafe_copy_from_user(_dst, _src, _len, label) \ +do { \ + char *__ucu_dst = (_dst); \ + const char __user *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u64, unsafe_get_user, label); \ + unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u32, unsafe_get_user, label); \ + unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u16, unsafe_get_user, label); \ + unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u8, unsafe_get_user, label); \ +} while (0) + #else /* CONFIG_MMU */ #include #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 62135bf660b2c3887e22f33d3adbefedb4dc9c7a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 10 Apr 2025 07:05:24 +0000 Subject: riscv: uaccess: use input constraints for ptr of __put_user() Putting ptr in the inputs as opposed to output may seem incorrect but this is done for a few reasons: - Not having it in the output permits the use of asm goto in a subsequent patch. There are bugs in gcc [1] which would otherwise prevent it. - Since the output memory is userspace there isn't any real benefit from telling the compiler about the memory clobber. - x86, arm and powerpc all use this technique. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 # 1 Signed-off-by: Jisheng Zhang [Cyril Bur: Rewritten commit message] Signed-off-by: Cyril Bur Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250410070526.3160847-4-cyrilbur@tenstorrent.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index c9a461467bf4..da36057847f0 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -219,11 +219,11 @@ do { \ __typeof__(*(ptr)) __x = x; \ __asm__ __volatile__ ( \ "1:\n" \ - " " insn " %z2, %1\n" \ + " " insn " %z1, %2\n" \ "2:\n" \ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %0) \ - : "+r" (err), "=m" (*(ptr)) \ - : "rJ" (__x)); \ + : "+r" (err) \ + : "rJ" (__x), "m"(*(ptr))); \ } while (0) #ifdef CONFIG_64BIT @@ -236,16 +236,16 @@ do { \ u64 __x = (__typeof__((x)-(x)))(x); \ __asm__ __volatile__ ( \ "1:\n" \ - " sw %z3, %1\n" \ + " sw %z1, %3\n" \ "2:\n" \ - " sw %z4, %2\n" \ + " sw %z2, %4\n" \ "3:\n" \ _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %0) \ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %0) \ - : "+r" (err), \ - "=m" (__ptr[__LSW]), \ - "=m" (__ptr[__MSW]) \ - : "rJ" (__x), "rJ" (__x >> 32)); \ + : "+r" (err) \ + : "rJ" (__x), "rJ" (__x >> 32), \ + "m" (__ptr[__LSW]), \ + "m" (__ptr[__MSW])); \ } while (0) #endif /* CONFIG_64BIT */ -- cgit v1.2.3 From cdf647e817143c9762c5bdf724ca2821a171f011 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 10 Apr 2025 07:05:25 +0000 Subject: riscv: uaccess: use 'asm goto' for put_user() With 'asm goto' we don't need to test the error etc, the exception just jumps to the error handling directly. Because there are no output clobbers which could trigger gcc bugs [1] the use of asm_goto_output() macro is not necessary here. Not using asm_goto_output() is desirable as the generated output asm will be cleaner. Use of the volatile keyword is redundant as per gcc 14.2.0 manual section 6.48.2.7 Goto Labels: > Also note that an asm goto statement is always implicitly considered volatile. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 # 1 Signed-off-by: Jisheng Zhang [Cyril Bur: Rewritten commit message] Signed-off-by: Cyril Bur Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250410070526.3160847-5-cyrilbur@tenstorrent.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 71 +++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index da36057847f0..719c9179a751 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -214,61 +214,66 @@ do { \ ((x) = (__force __typeof__(x))0, -EFAULT); \ }) -#define __put_user_asm(insn, x, ptr, err) \ +#define __put_user_asm(insn, x, ptr, label) \ do { \ __typeof__(*(ptr)) __x = x; \ - __asm__ __volatile__ ( \ + asm goto( \ "1:\n" \ - " " insn " %z1, %2\n" \ - "2:\n" \ - _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %0) \ - : "+r" (err) \ - : "rJ" (__x), "m"(*(ptr))); \ + " " insn " %z0, %1\n" \ + _ASM_EXTABLE(1b, %l2) \ + : : "rJ" (__x), "m"(*(ptr)) : : label); \ } while (0) #ifdef CONFIG_64BIT -#define __put_user_8(x, ptr, err) \ - __put_user_asm("sd", x, ptr, err) +#define __put_user_8(x, ptr, label) \ + __put_user_asm("sd", x, ptr, label) #else /* !CONFIG_64BIT */ -#define __put_user_8(x, ptr, err) \ +#define __put_user_8(x, ptr, label) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u64 __x = (__typeof__((x)-(x)))(x); \ - __asm__ __volatile__ ( \ + asm goto( \ "1:\n" \ - " sw %z1, %3\n" \ + " sw %z0, %2\n" \ "2:\n" \ - " sw %z2, %4\n" \ - "3:\n" \ - _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %0) \ - _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %0) \ - : "+r" (err) \ - : "rJ" (__x), "rJ" (__x >> 32), \ + " sw %z1, %3\n" \ + _ASM_EXTABLE(1b, %l4) \ + _ASM_EXTABLE(2b, %l4) \ + : : "rJ" (__x), "rJ" (__x >> 32), \ "m" (__ptr[__LSW]), \ - "m" (__ptr[__MSW])); \ + "m" (__ptr[__MSW]) : : label); \ } while (0) #endif /* CONFIG_64BIT */ -#define __put_user_nocheck(x, __gu_ptr, __pu_err) \ +#define __put_user_nocheck(x, __gu_ptr, label) \ do { \ switch (sizeof(*__gu_ptr)) { \ case 1: \ - __put_user_asm("sb", (x), __gu_ptr, __pu_err); \ + __put_user_asm("sb", (x), __gu_ptr, label); \ break; \ case 2: \ - __put_user_asm("sh", (x), __gu_ptr, __pu_err); \ + __put_user_asm("sh", (x), __gu_ptr, label); \ break; \ case 4: \ - __put_user_asm("sw", (x), __gu_ptr, __pu_err); \ + __put_user_asm("sw", (x), __gu_ptr, label); \ break; \ case 8: \ - __put_user_8((x), __gu_ptr, __pu_err); \ + __put_user_8((x), __gu_ptr, label); \ break; \ default: \ BUILD_BUG(); \ } \ } while (0) +#define __put_user_error(x, ptr, err) \ +do { \ + __label__ err_label; \ + __put_user_nocheck(x, ptr, err_label); \ + break; \ +err_label: \ + (err) = -EFAULT; \ +} while (0) + /** * __put_user: - Write a simple value into user space, with less checking. * @x: Value to copy to user space. @@ -299,7 +304,7 @@ do { \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __put_user_nocheck(__val, __gu_ptr, __pu_err); \ + __put_user_error(__val, __gu_ptr, __pu_err); \ __disable_user_access(); \ \ __pu_err; \ @@ -373,13 +378,7 @@ do { \ } while (0) #define __put_kernel_nofault(dst, src, type, err_label) \ -do { \ - long __kr_err = 0; \ - \ - __put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err); \ - if (unlikely(__kr_err)) \ - goto err_label; \ -} while (0) + __put_user_nocheck(*((type *)(src)), (type *)(dst), err_label) static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { @@ -398,12 +397,8 @@ static inline void user_access_restore(unsigned long enabled) { } * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. */ -#define unsafe_put_user(x, ptr, label) do { \ - long __err = 0; \ - __put_user_nocheck(x, (ptr), __err); \ - if (__err) \ - goto label; \ -} while (0) +#define unsafe_put_user(x, ptr, label) \ + __put_user_nocheck(x, (ptr), label) #define unsafe_get_user(x, ptr, label) do { \ long __err = 0; \ -- cgit v1.2.3 From f6bff7827a48e59cff1ef98aae72452d65174e0c Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Thu, 10 Apr 2025 07:05:26 +0000 Subject: riscv: uaccess: use 'asm_goto_output' for get_user() With 'asm goto' we don't need to test the error etc, the exception just jumps to the error handling directly. Unlike put_user(), get_user() must work around GCC bugs [1] when using output clobbers in an asm goto statement. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 # 1 Signed-off-by: Jisheng Zhang [Cyril Bur: Rewritten commit message] Signed-off-by: Cyril Bur Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250410070526.3160847-6-cyrilbur@tenstorrent.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 95 ++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 719c9179a751..87d01168f80a 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -96,27 +96,58 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigne * call. */ -#define __get_user_asm(insn, x, ptr, err) \ +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define __get_user_asm(insn, x, ptr, label) \ + asm_goto_output( \ + "1:\n" \ + " " insn " %0, %1\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, %l2, %0) \ + : "=&r" (x) \ + : "m" (*(ptr)) : : label) +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ +#define __get_user_asm(insn, x, ptr, label) \ do { \ - __typeof__(x) __x; \ + long __gua_err = 0; \ __asm__ __volatile__ ( \ "1:\n" \ " " insn " %1, %2\n" \ "2:\n" \ _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 2b, %0, %1) \ - : "+r" (err), "=&r" (__x) \ + : "+r" (__gua_err), "=&r" (x) \ : "m" (*(ptr))); \ - (x) = __x; \ + if (__gua_err) \ + goto label; \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ #ifdef CONFIG_64BIT -#define __get_user_8(x, ptr, err) \ - __get_user_asm("ld", x, ptr, err) +#define __get_user_8(x, ptr, label) \ + __get_user_asm("ld", x, ptr, label) #else /* !CONFIG_64BIT */ -#define __get_user_8(x, ptr, err) \ + +#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT +#define __get_user_8(x, ptr, label) \ + u32 __user *__ptr = (u32 __user *)(ptr); \ + u32 __lo, __hi; \ + asm_goto_output( \ + "1:\n" \ + " lw %0, %2\n" \ + "2:\n" \ + " lw %1, %3\n" \ + _ASM_EXTABLE_UACCESS_ERR(1b, %l4, %0) \ + _ASM_EXTABLE_UACCESS_ERR(2b, %l4, %0) \ + : "=&r" (__lo), "=r" (__hi) \ + : "m" (__ptr[__LSW]), "m" (__ptr[__MSW]) \ + : : label); \ + (x) = (__typeof__(x))((__typeof__((x) - (x)))( \ + (((u64)__hi << 32) | __lo))); \ + +#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ +#define __get_user_8(x, ptr, label) \ do { \ u32 __user *__ptr = (u32 __user *)(ptr); \ u32 __lo, __hi; \ + long __gu8_err = 0; \ __asm__ __volatile__ ( \ "1:\n" \ " lw %1, %3\n" \ @@ -125,35 +156,51 @@ do { \ "3:\n" \ _ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 3b, %0, %1) \ _ASM_EXTABLE_UACCESS_ERR_ZERO(2b, 3b, %0, %1) \ - : "+r" (err), "=&r" (__lo), "=r" (__hi) \ + : "+r" (__gu8_err), "=&r" (__lo), "=r" (__hi) \ : "m" (__ptr[__LSW]), "m" (__ptr[__MSW])); \ - if (err) \ + if (__gu8_err) { \ __hi = 0; \ - (x) = (__typeof__(x))((__typeof__((x)-(x)))( \ + goto label; \ + } \ + (x) = (__typeof__(x))((__typeof__((x) - (x)))( \ (((u64)__hi << 32) | __lo))); \ } while (0) +#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */ + #endif /* CONFIG_64BIT */ -#define __get_user_nocheck(x, __gu_ptr, __gu_err) \ +#define __get_user_nocheck(x, __gu_ptr, label) \ do { \ switch (sizeof(*__gu_ptr)) { \ case 1: \ - __get_user_asm("lb", (x), __gu_ptr, __gu_err); \ + __get_user_asm("lb", (x), __gu_ptr, label); \ break; \ case 2: \ - __get_user_asm("lh", (x), __gu_ptr, __gu_err); \ + __get_user_asm("lh", (x), __gu_ptr, label); \ break; \ case 4: \ - __get_user_asm("lw", (x), __gu_ptr, __gu_err); \ + __get_user_asm("lw", (x), __gu_ptr, label); \ break; \ case 8: \ - __get_user_8((x), __gu_ptr, __gu_err); \ + __get_user_8((x), __gu_ptr, label); \ break; \ default: \ BUILD_BUG(); \ } \ } while (0) +#define __get_user_error(x, ptr, err) \ +do { \ + __label__ __gu_failed; \ + \ + __get_user_nocheck(x, ptr, __gu_failed); \ + err = 0; \ + break; \ +__gu_failed: \ + x = 0; \ + err = -EFAULT; \ +} while (0) + /** * __get_user: - Get a simple variable from user space, with less checking. * @x: Variable to store result. @@ -178,13 +225,16 @@ do { \ ({ \ const __typeof__(*(ptr)) __user *__gu_ptr = untagged_addr(ptr); \ long __gu_err = 0; \ + __typeof__(x) __gu_val; \ \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __get_user_nocheck(x, __gu_ptr, __gu_err); \ + __get_user_error(__gu_val, __gu_ptr, __gu_err); \ __disable_user_access(); \ \ + (x) = __gu_val; \ + \ __gu_err; \ }) @@ -369,13 +419,7 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n) } #define __get_kernel_nofault(dst, src, type, err_label) \ -do { \ - long __kr_err = 0; \ - \ - __get_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err); \ - if (unlikely(__kr_err)) \ - goto err_label; \ -} while (0) + __get_user_nocheck(*((type *)(dst)), (type *)(src), err_label) #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_nocheck(*((type *)(src)), (type *)(dst), err_label) @@ -401,12 +445,9 @@ static inline void user_access_restore(unsigned long enabled) { } __put_user_nocheck(x, (ptr), label) #define unsafe_get_user(x, ptr, label) do { \ - long __err = 0; \ __inttype(*(ptr)) __gu_val; \ - __get_user_nocheck(__gu_val, (ptr), __err); \ + __get_user_nocheck(__gu_val, (ptr), label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - if (__err) \ - goto label; \ } while (0) #define unsafe_copy_loop(dst, src, len, type, op, label) \ -- cgit v1.2.3 From 2940954c1ac527386e5203d4be8263d704491fbe Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Mon, 24 Feb 2025 19:20:40 +0800 Subject: riscv: vDSO: Remove --hash-style=both When RISC-V borned, DT_GNU_HASH had already became the de-facto standard so DT_HASH is just wasting storage space. Remove the explicit --hash-style=both setting and rely on the distro toolchain default, which is most likely "gnu" (i.e. generating only DT_GNU_HASH, no DT_HASH). Following the logic of commit 48f6430505c0 ("arm64/vdso: Remove --hash-style=sysv"). Signed-off-by: Xi Ruoyao Link: https://lore.kernel.org/r/20250224112042.60282-2-xry111@xry111.site Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/vdso/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 7575ef088adc..8d12f5646eb5 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -59,7 +59,7 @@ $(obj)/vdso.o: $(obj)/vdso.so $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_check) LDFLAGS_vdso.so.dbg = -shared -soname=linux-vdso.so.1 \ - --build-id=sha1 --hash-style=both --eh-frame-hdr + --build-id=sha1 --eh-frame-hdr # strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S -- cgit v1.2.3 From 2d147d77ae6e96c1c349a6ada0eac14111c3384a Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:29 +0800 Subject: riscv: Add SiFive xsfvqmaccdod and xsfvqmaccqoq vendor extensions Add SiFive vendor extension support to the kernel with the target of "xsfvqmaccdod" and "xsfvqmaccqoq". Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-3-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig.vendor | 13 +++++++++++++ arch/riscv/include/asm/vendor_extensions/sifive.h | 14 ++++++++++++++ arch/riscv/kernel/vendor_extensions.c | 10 ++++++++++ arch/riscv/kernel/vendor_extensions/Makefile | 1 + arch/riscv/kernel/vendor_extensions/sifive.c | 19 +++++++++++++++++++ 5 files changed, 57 insertions(+) create mode 100644 arch/riscv/include/asm/vendor_extensions/sifive.h create mode 100644 arch/riscv/kernel/vendor_extensions/sifive.c (limited to 'arch') diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor index b096548fe0ff..e14f26368963 100644 --- a/arch/riscv/Kconfig.vendor +++ b/arch/riscv/Kconfig.vendor @@ -16,6 +16,19 @@ config RISCV_ISA_VENDOR_EXT_ANDES If you don't know what to do here, say Y. endmenu +menu "SiFive" +config RISCV_ISA_VENDOR_EXT_SIFIVE + bool "SiFive vendor extension support" + select RISCV_ISA_VENDOR_EXT + default y + help + Say N here if you want to disable all SiFive vendor extension + support. This will cause any SiFive vendor extensions that are + requested by hardware probing to be ignored. + + If you don't know what to do here, say Y. +endmenu + menu "T-Head" config RISCV_ISA_VENDOR_EXT_THEAD bool "T-Head vendor extension support" diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h new file mode 100644 index 000000000000..608004250e2e --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/sifive.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_H + +#include + +#include + +#define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD 0 +#define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ 1 + +extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive; + +#endif diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c index 9feb7f67a0a3..92d8ff81f42c 100644 --- a/arch/riscv/kernel/vendor_extensions.c +++ b/arch/riscv/kernel/vendor_extensions.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,9 @@ struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = { #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES &riscv_isa_vendor_ext_list_andes, #endif +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE + &riscv_isa_vendor_ext_list_sifive, +#endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD &riscv_isa_vendor_ext_list_thead, #endif @@ -45,6 +49,12 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap; break; #endif + #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE + case SIFIVE_VENDOR_ID: + bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap; + cpu_bmap = riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap; + break; + #endif #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD case THEAD_VENDOR_ID: bmap = &riscv_isa_vendor_ext_list_thead.all_harts_isa_bitmap; diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile index 866414c81a9f..d5fdde0e863b 100644 --- a/arch/riscv/kernel/vendor_extensions/Makefile +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead_hwprobe.o diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c new file mode 100644 index 000000000000..990ac83b1f81 --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/sifive.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include +#include + +/* All SiFive vendor extensions supported in Linux */ +const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { + __RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD), + __RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ), +}; + +struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive = { + .ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_sifive), + .ext_data = riscv_isa_vendor_ext_sifive, +}; -- cgit v1.2.3 From e8fd215ed0eb814486d50b4835007cbc50b2c2b7 Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:30 +0800 Subject: riscv: hwprobe: Document SiFive xsfvqmaccdod and xsfvqmaccqoq vendor extensions Document the support for sifive vendor extensions using the key RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 and two vendor extensions for SiFive Int8 Matrix Multiplication Instructions using RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD and RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ. Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-4-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwprobe.h | 2 +- arch/riscv/include/uapi/asm/hwprobe.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 1f690fea0e03..1c6977305776 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -8,7 +8,7 @@ #include -#define RISCV_HWPROBE_MAX_KEY 12 +#define RISCV_HWPROBE_MAX_KEY 13 static inline bool riscv_hwprobe_key_is_valid(__s64 key) { diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 3c2fce939673..9c70101f021b 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -104,6 +104,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED 4 #define RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0 11 #define RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE 12 +#define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 13 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */ /* Flags */ -- cgit v1.2.3 From 1a6274f035346e76835d46096136dd3e6cca9575 Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:31 +0800 Subject: riscv: hwprobe: Add SiFive vendor extension support and probe for xsfqmaccdod and xsfqmaccqoq Add a new hwprobe key "RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0" which allows userspace to probe for the new vendor extensions from SiFive. Also, add new hwprobe for SiFive "xsfvqmaccdod" and "xsfvqmaccqoq" vendor extensions. Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-5-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwprobe.h | 1 + .../include/asm/vendor_extensions/sifive_hwprobe.h | 19 +++++++++++++++++++ arch/riscv/include/uapi/asm/vendor/sifive.h | 4 ++++ arch/riscv/kernel/sys_hwprobe.c | 5 +++++ arch/riscv/kernel/vendor_extensions/Makefile | 1 + arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c | 20 ++++++++++++++++++++ 6 files changed, 50 insertions(+) create mode 100644 arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h create mode 100644 arch/riscv/include/uapi/asm/vendor/sifive.h create mode 100644 arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c (limited to 'arch') diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h index 1c6977305776..7fe0a379474a 100644 --- a/arch/riscv/include/asm/hwprobe.h +++ b/arch/riscv/include/asm/hwprobe.h @@ -22,6 +22,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key) case RISCV_HWPROBE_KEY_IMA_EXT_0: case RISCV_HWPROBE_KEY_CPUPERF_0: case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: + case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: return true; } diff --git a/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h b/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h new file mode 100644 index 000000000000..90a61abd033c --- /dev/null +++ b/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_HWPROBE_H +#define _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_HWPROBE_H + +#include + +#include + +#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE +void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus); +#else +static inline void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, + const struct cpumask *cpus) +{ + pair->value = 0; +} +#endif + +#endif diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h new file mode 100644 index 000000000000..f25d8cf110d1 --- /dev/null +++ b/arch/riscv/include/uapi/asm/vendor/sifive.h @@ -0,0 +1,4 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD (1 << 0) +#define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ (1 << 1) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 249aec8594a9..138e74f05de7 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -300,6 +301,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair, pair->value = riscv_timebase; break; + case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0: + hwprobe_isa_vendor_ext_sifive_0(pair, cpus); + break; + case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0: hwprobe_isa_vendor_ext_thead_0(pair, cpus); break; diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile index d5fdde0e863b..a4eca96d1c8a 100644 --- a/arch/riscv/kernel/vendor_extensions/Makefile +++ b/arch/riscv/kernel/vendor_extensions/Makefile @@ -2,5 +2,6 @@ obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES) += andes.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive.o +obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE) += sifive_hwprobe.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead.o obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD) += thead_hwprobe.o diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c new file mode 100644 index 000000000000..461ce0f305ce --- /dev/null +++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +#include +#include + +#include +#include + +void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus) +{ + VENDOR_EXTENSION_SUPPORTED(pair, cpus, + riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap, { + VENDOR_EXT_KEY(XSFVQMACCDOD); + VENDOR_EXT_KEY(XSFVQMACCQOQ); + }); +} -- cgit v1.2.3 From e84fffe21b7498ff50aed3a96773993d04cfaed0 Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:33 +0800 Subject: riscv: Add SiFive xsfvfnrclipxfqf vendor extension Add SiFive vendor extension "xsfvfnrclipxfqf" support to the kernel. Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-7-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vendor_extensions/sifive.h | 1 + arch/riscv/kernel/vendor_extensions/sifive.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h index 608004250e2e..2d05e3e73170 100644 --- a/arch/riscv/include/asm/vendor_extensions/sifive.h +++ b/arch/riscv/include/asm/vendor_extensions/sifive.h @@ -8,6 +8,7 @@ #define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD 0 #define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ 1 +#define RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF 2 extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive; diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c index 990ac83b1f81..077315e5b2d7 100644 --- a/arch/riscv/kernel/vendor_extensions/sifive.c +++ b/arch/riscv/kernel/vendor_extensions/sifive.c @@ -9,6 +9,7 @@ /* All SiFive vendor extensions supported in Linux */ const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { + __RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF), __RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD), __RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ), }; -- cgit v1.2.3 From 1d91224394c92245942c402245370c4abb0fcbfb Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:35 +0800 Subject: riscv: hwprobe: Add SiFive xsfvfnrclipxfqf vendor extension Add hwprobe for SiFive "xsfvfnrclipxfqf" vendor extension. Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-9-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/vendor/sifive.h | 1 + arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h index f25d8cf110d1..b772d4631284 100644 --- a/arch/riscv/include/uapi/asm/vendor/sifive.h +++ b/arch/riscv/include/uapi/asm/vendor/sifive.h @@ -2,3 +2,4 @@ #define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD (1 << 0) #define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ (1 << 1) +#define RISCV_HWPROBE_VENDOR_EXT_XSFVFNRCLIPXFQF (1 << 2) diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c index 461ce0f305ce..2b9505079a9f 100644 --- a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c +++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c @@ -16,5 +16,6 @@ void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cp riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap, { VENDOR_EXT_KEY(XSFVQMACCDOD); VENDOR_EXT_KEY(XSFVQMACCQOQ); + VENDOR_EXT_KEY(XSFVFNRCLIPXFQF); }); } -- cgit v1.2.3 From 34e9b16b4b888988730ffab9a9039cfcf305942e Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:37 +0800 Subject: riscv: Add SiFive xsfvfwmaccqqq vendor extension Add SiFive vendor extension "xsfvfwmaccqqq" support to the kernel. Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-11-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vendor_extensions/sifive.h | 1 + arch/riscv/kernel/vendor_extensions/sifive.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h index 2d05e3e73170..ac00e500361c 100644 --- a/arch/riscv/include/asm/vendor_extensions/sifive.h +++ b/arch/riscv/include/asm/vendor_extensions/sifive.h @@ -9,6 +9,7 @@ #define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD 0 #define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ 1 #define RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF 2 +#define RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ 3 extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive; diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c index 077315e5b2d7..1411337dc1e6 100644 --- a/arch/riscv/kernel/vendor_extensions/sifive.c +++ b/arch/riscv/kernel/vendor_extensions/sifive.c @@ -10,6 +10,7 @@ /* All SiFive vendor extensions supported in Linux */ const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = { __RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF), + __RISCV_ISA_EXT_DATA(xsfvfwmaccqqq, RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ), __RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD), __RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ), }; -- cgit v1.2.3 From d9669e33c8fadb5f81287f4961f01e40c0a11c23 Mon Sep 17 00:00:00 2001 From: Cyan Yang Date: Fri, 18 Apr 2025 13:32:39 +0800 Subject: riscv: hwprobe: Add SiFive xsfvfwmaccqqq vendor extension Add hwprobe for SiFive "xsfvfwmaccqqq" vendor extension. Signed-off-by: Cyan Yang Link: https://lore.kernel.org/r/20250418053239.4351-13-cyan.yang@sifive.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/vendor/sifive.h | 1 + arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h index b772d4631284..9f3278a4b298 100644 --- a/arch/riscv/include/uapi/asm/vendor/sifive.h +++ b/arch/riscv/include/uapi/asm/vendor/sifive.h @@ -3,3 +3,4 @@ #define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD (1 << 0) #define RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ (1 << 1) #define RISCV_HWPROBE_VENDOR_EXT_XSFVFNRCLIPXFQF (1 << 2) +#define RISCV_HWPROBE_VENDOR_EXT_XSFVFWMACCQQQ (1 << 3) diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c index 2b9505079a9f..1f77f6309763 100644 --- a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c +++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c @@ -17,5 +17,6 @@ void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cp VENDOR_EXT_KEY(XSFVQMACCDOD); VENDOR_EXT_KEY(XSFVQMACCQOQ); VENDOR_EXT_KEY(XSFVFNRCLIPXFQF); + VENDOR_EXT_KEY(XSFVFWMACCQQQ); }); } -- cgit v1.2.3 From 7d33e0ee5f98b3fc74566fa00d0926bc5deb8174 Mon Sep 17 00:00:00 2001 From: Fabrizio Castro Date: Wed, 23 Apr 2025 15:34:22 +0100 Subject: arm64: dts: renesas: r9a09g057: Add DMAC nodes Add nodes for the DMAC IPs found on the Renesas RZ/V2H(P) SoC. Signed-off-by: Fabrizio Castro Reviewed-by: Geert Uytterhoeven Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20250423143422.3747702-7-fabrizio.castro.jz@renesas.com Signed-off-by: Vinod Koul --- arch/arm64/boot/dts/renesas/r9a09g057.dtsi | 165 +++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/renesas/r9a09g057.dtsi b/arch/arm64/boot/dts/renesas/r9a09g057.dtsi index 0cd00bb05191..2689a3a6af85 100644 --- a/arch/arm64/boot/dts/renesas/r9a09g057.dtsi +++ b/arch/arm64/boot/dts/renesas/r9a09g057.dtsi @@ -280,6 +280,171 @@ resets = <&cpg 0x30>; }; + dmac0: dma-controller@11400000 { + compatible = "renesas,r9a09g057-dmac"; + reg = <0 0x11400000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15"; + clocks = <&cpg CPG_MOD 0x0>; + power-domains = <&cpg>; + resets = <&cpg 0x31>; + #dma-cells = <1>; + dma-channels = <16>; + renesas,icu = <&icu 4>; + }; + + dmac1: dma-controller@14830000 { + compatible = "renesas,r9a09g057-dmac"; + reg = <0 0x14830000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15"; + clocks = <&cpg CPG_MOD 0x1>; + power-domains = <&cpg>; + resets = <&cpg 0x32>; + #dma-cells = <1>; + dma-channels = <16>; + renesas,icu = <&icu 0>; + }; + + dmac2: dma-controller@14840000 { + compatible = "renesas,r9a09g057-dmac"; + reg = <0 0x14840000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15"; + clocks = <&cpg CPG_MOD 0x2>; + power-domains = <&cpg>; + resets = <&cpg 0x33>; + #dma-cells = <1>; + dma-channels = <16>; + renesas,icu = <&icu 1>; + }; + + dmac3: dma-controller@12000000 { + compatible = "renesas,r9a09g057-dmac"; + reg = <0 0x12000000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15"; + clocks = <&cpg CPG_MOD 0x3>; + power-domains = <&cpg>; + resets = <&cpg 0x34>; + #dma-cells = <1>; + dma-channels = <16>; + renesas,icu = <&icu 2>; + }; + + dmac4: dma-controller@12010000 { + compatible = "renesas,r9a09g057-dmac"; + reg = <0 0x12010000 0 0x10000>; + interrupts = , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + , + ; + interrupt-names = "error", + "ch0", "ch1", "ch2", "ch3", + "ch4", "ch5", "ch6", "ch7", + "ch8", "ch9", "ch10", "ch11", + "ch12", "ch13", "ch14", "ch15"; + clocks = <&cpg CPG_MOD 0x4>; + power-domains = <&cpg>; + resets = <&cpg 0x35>; + #dma-cells = <1>; + dma-channels = <16>; + renesas,icu = <&icu 3>; + }; + ostm0: timer@11800000 { compatible = "renesas,r9a09g057-ostm", "renesas,ostm"; reg = <0x0 0x11800000 0x0 0x1000>; -- cgit v1.2.3 From 4cc7543eb494daae7d6282e17459e1b06eff82aa Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Mon, 5 May 2025 14:57:58 +0200 Subject: MIPS: SMP: Move the AP sync point before the non-parallel aware functions When CONFIG_HOTPLUG_PARALLEL is enabled, the code executing before cpuhp_ap_sync_alive() is executed in parallel, while after it is serialized. The functions set_cpu_sibling_map() and set_cpu_core_map() were not designed to be executed in parallel, so by moving the cpuhp_ap_sync_alive() before cpuhp_ap_sync_alive(), we then ensure they will be called serialized. The measurement done on EyeQ5 did not show any relevant boot time increase after applying this patch. Fixes: 76c43eb507bc ("MIPS: SMP: Implement parallel CPU bring up for EyeQ") Reported-by: Huacai Chen Signed-off-by: Gregory CLEMENT Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 1726744f2b2e..7901b59d8f60 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -374,13 +374,13 @@ asmlinkage void start_secondary(void) calibrate_delay(); cpu_data[cpu].udelay_val = loops_per_jiffy; +#ifdef CONFIG_HOTPLUG_PARALLEL + cpuhp_ap_sync_alive(); +#endif set_cpu_sibling_map(cpu); set_cpu_core_map(cpu); cpumask_set_cpu(cpu, &cpu_coherent_mask); -#ifdef CONFIG_HOTPLUG_PARALLEL - cpuhp_ap_sync_alive(); -#endif notify_cpu_starting(cpu); #ifndef CONFIG_HOTPLUG_PARALLEL -- cgit v1.2.3 From 3590692a136d75e39cd67b0f23e032669fcdbcd2 Mon Sep 17 00:00:00 2001 From: Charan Pedumuru Date: Wed, 7 May 2025 06:29:35 +0000 Subject: mips: dts: pic32: pic32mzda: Rename the sdhci nodename to match with common mmc-controller binding Rename the sdhci nodename from "sdhci@" to "mmc@" to align with linux common mmc-controller binding. Signed-off-by: Charan Pedumuru Reviewed-by: Krzysztof Kozlowski Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/pic32/pic32mzda.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/boot/dts/pic32/pic32mzda.dtsi b/arch/mips/boot/dts/pic32/pic32mzda.dtsi index fdc721b414a8..feca35ba56a4 100644 --- a/arch/mips/boot/dts/pic32/pic32mzda.dtsi +++ b/arch/mips/boot/dts/pic32/pic32mzda.dtsi @@ -225,7 +225,7 @@ gpio-ranges = <&pic32_pinctrl 0 144 16>; }; - sdhci: sdhci@1f8ec000 { + sdhci: mmc@1f8ec000 { compatible = "microchip,pic32mzda-sdhci"; reg = <0x1f8ec000 0x100>; interrupts = <191 IRQ_TYPE_LEVEL_HIGH>; -- cgit v1.2.3 From 35fb26f94dfa1b291086b84b2421f957214824d1 Mon Sep 17 00:00:00 2001 From: Caleb James DeLisle Date: Wed, 7 May 2025 13:44:57 +0000 Subject: mips: Add EcoNet MIPS platform support Add platform support for EcoNet MIPS SoCs. Signed-off-by: Caleb James DeLisle Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kbuild.platforms | 1 + arch/mips/Kconfig | 25 +++++++++++ arch/mips/boot/compressed/uart-16550.c | 5 +++ arch/mips/econet/Kconfig | 37 ++++++++++++++++ arch/mips/econet/Makefile | 2 + arch/mips/econet/Platform | 5 +++ arch/mips/econet/init.c | 78 ++++++++++++++++++++++++++++++++++ 7 files changed, 153 insertions(+) create mode 100644 arch/mips/econet/Kconfig create mode 100644 arch/mips/econet/Makefile create mode 100644 arch/mips/econet/Platform create mode 100644 arch/mips/econet/init.c (limited to 'arch') diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index bca37ddf974b..41a00fa860c1 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -11,6 +11,7 @@ platform-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon/ platform-$(CONFIG_EYEQ) += mobileye/ platform-$(CONFIG_MIPS_COBALT) += cobalt/ platform-$(CONFIG_MACH_DECSTATION) += dec/ +platform-$(CONFIG_ECONET) += econet/ platform-$(CONFIG_MIPS_GENERIC) += generic/ platform-$(CONFIG_MACH_JAZZ) += jazz/ platform-$(CONFIG_LANTIQ) += lantiq/ diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index e0e6ce2592b4..c3dbdc808664 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -391,6 +391,30 @@ config MACH_DECSTATION otherwise choose R3000. +config ECONET + bool "EcoNet MIPS family" + select BOOT_RAW + select CPU_BIG_ENDIAN + select DEBUG_ZBOOT + select EARLY_PRINTK_8250 + select ECONET_EN751221_TIMER + select SERIAL_OF_PLATFORM + select SYS_SUPPORTS_BIG_ENDIAN + select SYS_HAS_CPU_MIPS32_R1 + select SYS_HAS_CPU_MIPS32_R2 + select SYS_HAS_EARLY_PRINTK + select SYS_SUPPORTS_32BIT_KERNEL + select SYS_SUPPORTS_MIPS16 + select SYS_SUPPORTS_ZBOOT_UART16550 + select USE_GENERIC_EARLY_PRINTK_8250 + select USE_OF + help + EcoNet EN75xx MIPS devices are big endian MIPS machines used + in XPON (fiber) and DSL applications. They have SPI, PCI, USB, + GPIO, and Ethernet, with optional XPON, DSL, and VoIP DSP cores. + Don't confuse these with the Airoha ARM devices sometimes referred + to as "EcoNet", this family is for MIPS based devices only. + config MACH_JAZZ bool "Jazz family of machines" select ARC_MEMORY @@ -1021,6 +1045,7 @@ source "arch/mips/ath79/Kconfig" source "arch/mips/bcm47xx/Kconfig" source "arch/mips/bcm63xx/Kconfig" source "arch/mips/bmips/Kconfig" +source "arch/mips/econet/Kconfig" source "arch/mips/generic/Kconfig" source "arch/mips/ingenic/Kconfig" source "arch/mips/jazz/Kconfig" diff --git a/arch/mips/boot/compressed/uart-16550.c b/arch/mips/boot/compressed/uart-16550.c index db618e72a0c4..529e77a6487c 100644 --- a/arch/mips/boot/compressed/uart-16550.c +++ b/arch/mips/boot/compressed/uart-16550.c @@ -20,6 +20,11 @@ #define PORT(offset) (CKSEG1ADDR(INGENIC_UART_BASE_ADDR) + (4 * offset)) #endif +#ifdef CONFIG_ECONET +#define EN75_UART_BASE 0x1fbf0003 +#define PORT(offset) (CKSEG1ADDR(EN75_UART_BASE) + (4 * (offset))) +#endif + #ifndef IOTYPE #define IOTYPE char #endif diff --git a/arch/mips/econet/Kconfig b/arch/mips/econet/Kconfig new file mode 100644 index 000000000000..d03f90f3daa4 --- /dev/null +++ b/arch/mips/econet/Kconfig @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: GPL-2.0 +if ECONET + +choice + prompt "EcoNet SoC selection" + default SOC_ECONET_EN751221 + help + Select EcoNet MIPS SoC type. Individual SoCs within a family are + very similar, so is it enough to select the right family, and + then customize to the specific SoC using the device tree only. + + config SOC_ECONET_EN751221 + bool "EN751221 family" + select COMMON_CLK + select ECONET_EN751221_INTC + select IRQ_MIPS_CPU + select SMP + select SMP_UP + select SYS_SUPPORTS_SMP + help + The EN751221 family includes EN7512, RN7513, EN7521, EN7526. + They are based on single core MIPS 34Kc processors. To boot + this kernel, you will need a device tree such as + MIPS_RAW_APPENDED_DTB=y, and a root filesystem. +endchoice + +choice + prompt "Devicetree selection" + default DTB_ECONET_NONE + help + Select the devicetree. + + config DTB_ECONET_NONE + bool "None" +endchoice + +endif diff --git a/arch/mips/econet/Makefile b/arch/mips/econet/Makefile new file mode 100644 index 000000000000..7e4529e7d3d7 --- /dev/null +++ b/arch/mips/econet/Makefile @@ -0,0 +1,2 @@ + +obj-y := init.o diff --git a/arch/mips/econet/Platform b/arch/mips/econet/Platform new file mode 100644 index 000000000000..ea5616447bcd --- /dev/null +++ b/arch/mips/econet/Platform @@ -0,0 +1,5 @@ +# To address a 7.2MB kernel size limit in the EcoNet SDK bootloader, +# we put the load address well above where the bootloader loads and then use +# zboot. So please set CONFIG_ZBOOT_LOAD_ADDRESS to the address where your +# bootloader actually places the kernel. +load-$(CONFIG_ECONET) += 0xffffffff81000000 diff --git a/arch/mips/econet/init.c b/arch/mips/econet/init.c new file mode 100644 index 000000000000..6f43ffb209cb --- /dev/null +++ b/arch/mips/econet/init.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * EcoNet setup code + * + * Copyright (C) 2025 Caleb James DeLisle + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define CR_AHB_RSTCR ((void __iomem *)CKSEG1ADDR(0x1fb00040)) +#define RESET BIT(31) + +#define UART_BASE CKSEG1ADDR(0x1fbf0003) +#define UART_REG_SHIFT 2 + +static void hw_reset(char *command) +{ + iowrite32(RESET, CR_AHB_RSTCR); +} + +/* 1. Bring up early printk. */ +void __init prom_init(void) +{ + setup_8250_early_printk_port(UART_BASE, UART_REG_SHIFT, 0); + _machine_restart = hw_reset; +} + +/* 2. Parse the DT and find memory */ +void __init plat_mem_setup(void) +{ + void *dtb; + + set_io_port_base(KSEG1); + + dtb = get_fdt(); + if (!dtb) + panic("no dtb found"); + + __dt_setup_arch(dtb); + + early_init_dt_scan_memory(); +} + +/* 3. Overload __weak device_tree_init(), add SMP_UP ops */ +void __init device_tree_init(void) +{ + unflatten_and_copy_device_tree(); + + register_up_smp_ops(); +} + +const char *get_system_type(void) +{ + return "EcoNet-EN75xx"; +} + +/* 4. Initialize the IRQ subsystem */ +void __init arch_init_irq(void) +{ + irqchip_init(); +} + +/* 5. Timers */ +void __init plat_time_init(void) +{ + of_clk_init(NULL); + timer_probe(); +} -- cgit v1.2.3 From 0ec4887009729297f7c10368084e41a8a9fbbd0e Mon Sep 17 00:00:00 2001 From: Caleb James DeLisle Date: Wed, 7 May 2025 13:44:59 +0000 Subject: mips: dts: Add EcoNet DTS with EN751221 and SmartFiber XP8421-B board Add DTS files in support of EcoNet platform, including SmartFiber XP8421-B, a low cost commercially available board based on EN751221. Signed-off-by: Caleb James DeLisle Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/dts/Makefile | 1 + arch/mips/boot/dts/econet/Makefile | 2 + arch/mips/boot/dts/econet/en751221.dtsi | 67 ++++++++++++++++++++++ .../dts/econet/en751221_smartfiber_xp8421-b.dts | 19 ++++++ arch/mips/econet/Kconfig | 11 ++++ 5 files changed, 100 insertions(+) create mode 100644 arch/mips/boot/dts/econet/Makefile create mode 100644 arch/mips/boot/dts/econet/en751221.dtsi create mode 100644 arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts (limited to 'arch') diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile index ff468439a8c4..7375c6ced82b 100644 --- a/arch/mips/boot/dts/Makefile +++ b/arch/mips/boot/dts/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 subdir-$(CONFIG_BMIPS_GENERIC) += brcm subdir-$(CONFIG_CAVIUM_OCTEON_SOC) += cavium-octeon +subdir-$(CONFIG_ECONET) += econet subdir-$(CONFIG_EYEQ) += mobileye subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK) += img subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON) += img diff --git a/arch/mips/boot/dts/econet/Makefile b/arch/mips/boot/dts/econet/Makefile new file mode 100644 index 000000000000..b467d5624e39 --- /dev/null +++ b/arch/mips/boot/dts/econet/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +dtb-$(CONFIG_DTB_ECONET_SMARTFIBER_XP8421_B) += en751221_smartfiber_xp8421-b.dtb diff --git a/arch/mips/boot/dts/econet/en751221.dtsi b/arch/mips/boot/dts/econet/en751221.dtsi new file mode 100644 index 000000000000..66197e73d4f0 --- /dev/null +++ b/arch/mips/boot/dts/econet/en751221.dtsi @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/dts-v1/; + +/ { + compatible = "econet,en751221"; + #address-cells = <1>; + #size-cells = <1>; + + hpt_clock: clock { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <200000000>; /* 200 MHz */ + }; + + cpus: cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "mips,mips24KEc"; + reg = <0>; + }; + }; + + cpuintc: interrupt-controller { + compatible = "mti,cpu-interrupt-controller"; + interrupt-controller; + #address-cells = <0>; + #interrupt-cells = <1>; + }; + + intc: interrupt-controller@1fb40000 { + compatible = "econet,en751221-intc"; + reg = <0x1fb40000 0x100>; + interrupt-parent = <&cpuintc>; + interrupts = <2>; + + interrupt-controller; + #interrupt-cells = <1>; + econet,shadow-interrupts = <7 2>, <8 3>, <13 12>, <30 29>; + }; + + uart: serial@1fbf0000 { + compatible = "ns16550"; + reg = <0x1fbf0000 0x30>; + reg-io-width = <4>; + reg-shift = <2>; + interrupt-parent = <&intc>; + interrupts = <0>; + /* + * Conversion of baud rate to clock frequency requires a + * computation that is not in the ns16550 driver, so this + * uart is fixed at 115200 baud. + */ + clock-frequency = <1843200>; + }; + + timer_hpt: timer@1fbf0400 { + compatible = "econet,en751221-timer"; + reg = <0x1fbf0400 0x100>; + + interrupt-parent = <&intc>; + interrupts = <30>; + clocks = <&hpt_clock>; + }; +}; diff --git a/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts b/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts new file mode 100644 index 000000000000..8223c5bce67f --- /dev/null +++ b/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/dts-v1/; + +#include "en751221.dtsi" + +/ { + model = "SmartFiber XP8421-B"; + compatible = "smartfiber,xp8421-b", "econet,en751221"; + + memory@0 { + device_type = "memory"; + reg = <0x00000000 0x1c000000>; + }; + + chosen { + stdout-path = "/serial@1fbf0000:115200"; + linux,usable-memory-range = <0x00020000 0x1bfe0000>; + }; +}; diff --git a/arch/mips/econet/Kconfig b/arch/mips/econet/Kconfig index d03f90f3daa4..fd69884cc9a8 100644 --- a/arch/mips/econet/Kconfig +++ b/arch/mips/econet/Kconfig @@ -32,6 +32,17 @@ choice config DTB_ECONET_NONE bool "None" + + config DTB_ECONET_SMARTFIBER_XP8421_B + bool "EN751221 SmartFiber XP8421-B" + depends on SOC_ECONET_EN751221 + select BUILTIN_DTB + help + The SmartFiber XP8421-B is a device based on the EN751221 SoC. + It has 512MB of memory and 256MB of NAND flash. This kernel + needs only an appended initramfs to boot. It can be loaded + through XMODEM and booted from memory in the bootloader, or + it can be packed in tclinux.trx format and written to flash. endchoice endif -- cgit v1.2.3 From 680ef57915db0f715e99a683c15f0f9f7c700e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Wed, 9 Apr 2025 21:37:29 +0100 Subject: mfd: sec: Split into core and transport (i2c) drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a preparation for adding support for Samsung's S2MPG10, which is connected via SPEEDY / ACPM rather than I2C, split out (move) all I2C-specific driver code into its own kernel module, sec-i2c, and make the existing sec-core module be just the transport-agnostic core driver kernel module. At the same time, update all defconfigs that reference the old kconfig symbol name. While at it, also update file header comments and module description(s) to drop references to 'mfd', and update comments to be C-style, not C++. Reviewed-by: Krzysztof Kozlowski Signed-off-by: André Draszik Link: https://lore.kernel.org/r/20250409-s2mpg10-v4-8-d66d5f39b6bf@linaro.org Signed-off-by: Lee Jones --- arch/arm/configs/exynos_defconfig | 2 +- arch/arm/configs/multi_v7_defconfig | 2 +- arch/arm/configs/pxa_defconfig | 2 +- arch/arm64/configs/defconfig | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig index 7ad48fdda1da..251f45be6c14 100644 --- a/arch/arm/configs/exynos_defconfig +++ b/arch/arm/configs/exynos_defconfig @@ -167,7 +167,7 @@ CONFIG_MFD_MAX77686=y CONFIG_MFD_MAX77693=y CONFIG_MFD_MAX8997=y CONFIG_MFD_MAX8998=y -CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SEC_I2C=y CONFIG_MFD_STMPE=y CONFIG_STMPE_I2C=y CONFIG_MFD_TPS65090=y diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index ad037c175fdb..7d06ac5369b1 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -612,7 +612,7 @@ CONFIG_MFD_QCOM_RPM=y CONFIG_MFD_SPMI_PMIC=y CONFIG_MFD_RK8XX_I2C=y CONFIG_MFD_RN5T618=y -CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SEC_I2C=y CONFIG_MFD_STMPE=y CONFIG_MFD_PALMAS=y CONFIG_MFD_TPS65090=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index de0ac8f521d7..064e79baf20d 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -335,7 +335,7 @@ CONFIG_MFD_MAX77693=y CONFIG_MFD_MAX8907=m CONFIG_EZX_PCAP=y CONFIG_UCB1400_CORE=m -CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SEC_I2C=y CONFIG_MFD_PALMAS=y CONFIG_MFD_TPS65090=y CONFIG_MFD_TPS6586X=y diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 5bb8f09422a2..f021fb29683b 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -769,7 +769,7 @@ CONFIG_MFD_MT6397=y CONFIG_MFD_SPMI_PMIC=y CONFIG_MFD_RK8XX_I2C=y CONFIG_MFD_RK8XX_SPI=y -CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SEC_I2C=y CONFIG_MFD_SL28CPLD=y CONFIG_RZ_MTU3=y CONFIG_MFD_TI_AM335X_TSCADC=m -- cgit v1.2.3 From 46bc169f6f07eca9a7d2346c703ea59bff385e22 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 2 Apr 2025 15:26:34 +0200 Subject: arm64: Kconfig: switch to HAVE_PWRCTRL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HAVE_PWRCTRL symbol has been renamed to reflect the pwrctrl framework name. Switch to the non-deprecated symbol. Signed-off-by: Johan Hovold Signed-off-by: Krzysztof Wilczyński Signed-off-by: Bjorn Helgaas Acked-by: Bartosz Golaszewski Link: https://patch.msgid.link/20250402132634.18065-5-johan+linaro@kernel.org --- arch/arm64/Kconfig.platforms | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 8b76821f190f..a541bb029aa4 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -269,7 +269,7 @@ config ARCH_QCOM bool "Qualcomm Platforms" select GPIOLIB select PINCTRL - select HAVE_PWRCTL if PCI + select HAVE_PWRCTRL if PCI help This enables support for the ARMv8 based Qualcomm chipsets. -- cgit v1.2.3 From 79ee1d20e37cd553cc961962fca8107e69a0c293 Mon Sep 17 00:00:00 2001 From: Caleb James DeLisle Date: Wed, 21 May 2025 21:33:33 +0000 Subject: mips: econet: Fix incorrect Kconfig dependencies config ECONET selects SERIAL_OF_PLATFORM and that depends on SERIAL_8250 so we need to select SERIAL_8250 directly. Also do not enable DEBUG_ZBOOT unless DEBUG_KERNEL is set. Signed-off-by: Caleb James DeLisle Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505211654.CBdIsoTq-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202505211451.WRjyf3a9-lkp@intel.com/ Signed-off-by: Thomas Bogendoerfer --- arch/mips/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index c3dbdc808664..1e48184ecf1e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -395,9 +395,10 @@ config ECONET bool "EcoNet MIPS family" select BOOT_RAW select CPU_BIG_ENDIAN - select DEBUG_ZBOOT + select DEBUG_ZBOOT if DEBUG_KERNEL select EARLY_PRINTK_8250 select ECONET_EN751221_TIMER + select SERIAL_8250 select SERIAL_OF_PLATFORM select SYS_SUPPORTS_BIG_ENDIAN select SYS_HAS_CPU_MIPS32_R1 -- cgit v1.2.3 From ab535361efdf8129dc593f8f2d80b76767c07813 Mon Sep 17 00:00:00 2001 From: Gregory CLEMENT Date: Fri, 23 May 2025 09:58:15 +0200 Subject: MIPS: SMP: Move the AP sync point before the calibration delay In the calibration delay process, some resources are shared, so it's better to move it after the parallel execution part. Thanks to the patch optimizing CPU delay calibration, this change has no impact on the boot time improvements gained from CPU parallel boot. Signed-off-by: Gregory CLEMENT Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 7901b59d8f60..4868e79f3b30 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -371,12 +371,12 @@ asmlinkage void start_secondary(void) * to an option instead of something based on .cputype */ - calibrate_delay(); - cpu_data[cpu].udelay_val = loops_per_jiffy; - #ifdef CONFIG_HOTPLUG_PARALLEL cpuhp_ap_sync_alive(); #endif + calibrate_delay(); + cpu_data[cpu].udelay_val = loops_per_jiffy; + set_cpu_sibling_map(cpu); set_cpu_core_map(cpu); -- cgit v1.2.3 From 293bb049380e14a176e98d3701b85b6b5d3140fd Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 2 Apr 2025 08:05:31 +0100 Subject: ARM: 9446/1: Disallow kernel mode NEON when IRQs are disabled Commit c79f81631142 ("ARM: 9283/1: permit non-nested kernel mode NEON in softirq context") relaxed the rules around the use of SIMD instructions in kernel mode on ARM, to allow such use when serving a softirq. To avoid having to preserve/restore kernel mode NEON state when such a softirq is taken, softirqs are now disabled when using the NEON from task context. However, the fact that the softirq API does not allow unmasking of softirqs with interrupts disabled was overlooked, resulting in a WARN() in some cases, as reported by Guenter: WARNING: CPU: 0 PID: 1145 at kernel/softirq.c:369 __local_bh_enable_ip+0x118/0x194 Call trace: unwind_backtrace from show_stack+0x10/0x14 show_stack from dump_stack_lvl+0x7c/0xac dump_stack_lvl from __warn+0x7c/0x1b8 __warn from warn_slowpath_fmt+0x19c/0x1a4 warn_slowpath_fmt from __local_bh_enable_ip+0x118/0x194 __local_bh_enable_ip from crc_t10dif_arch+0xd4/0xe8 crc_t10dif_arch from crc_t10dif_wrapper+0x14/0x1c crc_t10dif_wrapper from crc_main_test+0x178/0x360 crc_main_test from kunit_try_run_case+0x78/0x1e0 kunit_try_run_case from kunit_generic_run_threadfn_adapter+0x1c/0x34 kunit_generic_run_threadfn_adapter from kthread+0x118/0x254 kthread from ret_from_fork+0x14/0x28 While disabling softirqs is not really needed when running with IRQs disabled (given that the only way a softirq can be delivered asynchrously is over the back of an IRQ), let's not complicate this logic more than needed, and simply disallow use of the NEON in kernel mode when IRQs are disabled. Another approach might be to only disable and re-enable softirqs if IRQs are enabled, but other than the test case above, there are no clear use cases for doing non-trivial arithmetic processing (hence using an accelerated SIMD implementation) with IRQs disabled. Reported-by: Guenter Roeck Tested-by: Guenter Roeck Link: https://lore.kernel.org/all/389b899f-893c-4855-9e30-d8920a5d6f91@roeck-us.net Reviewed-by: Eric Biggers Signed-off-by: Ard Biesheuvel Signed-off-by: Russell King (Oracle) --- arch/arm/include/asm/simd.h | 3 ++- arch/arm/vfp/vfpmodule.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h index 82191dbd7e78..e604a9382960 100644 --- a/arch/arm/include/asm/simd.h +++ b/arch/arm/include/asm/simd.h @@ -4,5 +4,6 @@ static __must_check inline bool may_use_simd(void) { - return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq(); + return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq() + && !irqs_disabled(); } diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 7803d50b90f8..e559ad3cd148 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -877,6 +877,7 @@ void kernel_neon_begin(void) * the kernel mode NEON register contents never need to be preserved. */ BUG_ON(in_hardirq()); + BUG_ON(irqs_disabled()); cpu = __smp_processor_id(); fpexc = fmrx(FPEXC) | FPEXC_EN; -- cgit v1.2.3 From 314da9d6b3119328c1a38234c68e720380d14957 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 May 2025 20:19:56 -0700 Subject: MIPS: loongson2ef: cs5536: add missing function prototypes Add missing function prototypes for cs5536, mostly for PCI functions, and for init_mfgpt_clocksource(). arch/mips/loongson2ef/common/cs5536/cs5536_ide.c:15:6: warning: no previous prototype for 'pci_ide_write_reg' [-Wmissing-prototypes] 15 | void pci_ide_write_reg(int reg, u32 value) arch/mips/loongson2ef/common/cs5536/cs5536_ide.c:96:5: warning: no previous prototype for 'pci_ide_read_reg' [-Wmissing-prototypes] 96 | u32 pci_ide_read_reg(int reg) arch/mips/loongson2ef/common/cs5536/cs5536_ehci.c:15:6: warning: no previous prototype for 'pci_ehci_write_reg' [-Wmissing-prototypes] 15 | void pci_ehci_write_reg(int reg, u32 value) arch/mips/loongson2ef/common/cs5536/cs5536_ehci.c:75:5: warning: no previous prototype for 'pci_ehci_read_reg' [-Wmissing-prototypes] 75 | u32 pci_ehci_read_reg(int reg) arch/mips/loongson2ef/common/cs5536/cs5536_acc.c:15:6: warning: no previous prototype for 'pci_acc_write_reg' [-Wmissing-prototypes] 15 | void pci_acc_write_reg(int reg, u32 value) arch/mips/loongson2ef/common/cs5536/cs5536_acc.c:62:5: warning: no previous prototype for 'pci_acc_read_reg' [-Wmissing-prototypes] 62 | u32 pci_acc_read_reg(int reg) arch/mips/loongson2ef/common/cs5536/cs5536_ohci.c:15:6: warning: no previous prototype for 'pci_ohci_write_reg' [-Wmissing-prototypes] 15 | void pci_ohci_write_reg(int reg, u32 value) arch/mips/loongson2ef/common/cs5536/cs5536_ohci.c:70:5: warning: no previous prototype for 'pci_ohci_read_reg' [-Wmissing-prototypes] 70 | u32 pci_ohci_read_reg(int reg) arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:84:6: warning: no previous prototype for 'pci_isa_write_bar' [-Wmissing-prototypes] 84 | void pci_isa_write_bar(int n, u32 value) arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:110:5: warning: no previous prototype for 'pci_isa_read_bar' [-Wmissing-prototypes] 110 | u32 pci_isa_read_bar(int n) arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:134:6: warning: no previous prototype for 'pci_isa_write_reg' [-Wmissing-prototypes] 134 | void pci_isa_write_reg(int reg, u32 value) arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:228:5: warning: no previous prototype for 'pci_isa_read_reg' [-Wmissing-prototypes] 228 | u32 pci_isa_read_reg(int reg) arch/mips/loongson2ef/common/cs5536/cs5536_mfgpt.c:195:12: warning: no previous prototype for 'init_mfgpt_clocksource' [-Wmissing-prototypes] 195 | int __init init_mfgpt_clocksource(void) Signed-off-by: Randy Dunlap Cc: Jiaxun Yang Cc: linux-mips@vger.kernel.org Cc: Thomas Bogendoerfer Signed-off-by: Thomas Bogendoerfer --- .../include/asm/mach-loongson2ef/cs5536/cs5536_pci.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch') diff --git a/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h b/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h index a0d4b752899e..5dbc9b13d15b 100644 --- a/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h +++ b/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h @@ -12,12 +12,32 @@ #ifndef _CS5536_PCI_H #define _CS5536_PCI_H +#include #include #include extern void cs5536_pci_conf_write4(int function, int reg, u32 value); extern u32 cs5536_pci_conf_read4(int function, int reg); +extern void pci_ehci_write_reg(int reg, u32 value); +extern u32 pci_ehci_read_reg(int reg); + +extern void pci_ide_write_reg(int reg, u32 value); +extern u32 pci_ide_read_reg(int reg); + +extern void pci_acc_write_reg(int reg, u32 value); +extern u32 pci_acc_read_reg(int reg); + +extern void pci_ohci_write_reg(int reg, u32 value); +extern u32 pci_ohci_read_reg(int reg); + +extern void pci_isa_write_bar(int n, u32 value); +extern u32 pci_isa_read_bar(int n); +extern void pci_isa_write_reg(int reg, u32 value); +extern u32 pci_isa_read_reg(int reg); + +extern int __init init_mfgpt_clocksource(void); + #define CS5536_ACC_INTR 9 #define CS5536_IDE_INTR 14 #define CS5536_USB_INTR 11 -- cgit v1.2.3 From 5a0c749125c001cba673e9951b0002fba7ea2886 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 28 May 2025 20:20:05 -0700 Subject: MIPS: loongson2ef: lemote-2f: add missing function prototypes Add several missing function prototypes for lemote-2f to eliminate build warnings: arch/mips/loongson2ef/lemote-2f/machtype.c:10:13: warning: no previous prototype for 'mach_prom_init_machtype' [-Wmissing-prototypes] 10 | void __init mach_prom_init_machtype(void) arch/mips/loongson2ef/common/machtype.c:34:20: warning: no previous prototype for 'mach_prom_init_machtype' [-Wmissing-prototypes] 34 | void __weak __init mach_prom_init_machtype(void) arch/mips/loongson2ef/lemote-2f/pm.c:52:6: warning: no previous prototype for 'setup_wakeup_events' [-Wmissing-prototypes] 52 | void setup_wakeup_events(void) arch/mips/loongson2ef/lemote-2f/pm.c:90:5: warning: no previous prototype for 'wakeup_loongson' [-Wmissing-prototypes] 90 | int wakeup_loongson(void) arch/mips/loongson2ef/lemote-2f/pm.c:137:13: warning: no previous prototype for 'mach_suspend' [-Wmissing-prototypes] 137 | void __weak mach_suspend(void) arch/mips/loongson2ef/lemote-2f/pm.c:142:13: warning: no previous prototype for 'mach_resume' [-Wmissing-prototypes] 142 | void __weak mach_resume(void) Signed-off-by: Randy Dunlap Cc: Jiaxun Yang Cc: linux-mips@vger.kernel.org Cc: Thomas Bogendoerfer Signed-off-by: Thomas Bogendoerfer --- arch/mips/include/asm/mach-loongson2ef/loongson.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/mips/include/asm/mach-loongson2ef/loongson.h b/arch/mips/include/asm/mach-loongson2ef/loongson.h index ca039b8dcde3..4a098fb10232 100644 --- a/arch/mips/include/asm/mach-loongson2ef/loongson.h +++ b/arch/mips/include/asm/mach-loongson2ef/loongson.h @@ -18,6 +18,9 @@ extern void bonito_irq_init(void); extern void mach_prepare_reboot(void); extern void mach_prepare_shutdown(void); +/* machine-specific PROM functions */ +extern void __init mach_prom_init_machtype(void); + /* environment arguments from bootloader */ extern u32 cpu_clock_freq; extern u32 memsize, highmemsize; @@ -45,6 +48,12 @@ extern void __init mach_init_irq(void); extern void mach_irq_dispatch(unsigned int pending); extern int mach_i8259_irq(void); +/* power management functions */ +extern void setup_wakeup_events(void); +extern int wakeup_loongson(void); +extern void __weak mach_suspend(void); +extern void __weak mach_resume(void); + /* We need this in some places... */ #define delay() ({ \ int x; \ -- cgit v1.2.3 From a96c7330da0b4cc64c6f79044d03f52532bdfc5b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 30 May 2025 21:45:33 +0800 Subject: LoongArch: Add a default install.sh As specified in scripts/install.sh, the priority order is as follows (from highest to lowest): ~/bin/installkernel /sbin/installkernel arch/loongarch/boot/install.sh Fallback to default install.sh if installkernel is not found. Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/boot/install.sh | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100755 arch/loongarch/boot/install.sh (limited to 'arch') diff --git a/arch/loongarch/boot/install.sh b/arch/loongarch/boot/install.sh new file mode 100755 index 000000000000..daac197d3315 --- /dev/null +++ b/arch/loongarch/boot/install.sh @@ -0,0 +1,56 @@ +#!/bin/sh +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# Copyright (C) 1995 by Linus Torvalds +# +# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin +# Adapted from code in arch/i386/boot/install.sh by Russell King +# +# "make install" script for the LoongArch Linux port +# +# Arguments: +# $1 - kernel version +# $2 - kernel image file +# $3 - kernel map file +# $4 - default install path (blank if root directory) + +set -e + +case "${2##*/}" in +vmlinux.elf) + echo "Installing uncompressed vmlinux.elf kernel" + base=vmlinux + ;; +vmlinux.efi) + echo "Installing uncompressed vmlinux.efi kernel" + base=vmlinux + ;; +vmlinuz.efi) + echo "Installing gzip/zstd compressed vmlinuz.efi kernel" + base=vmlinuz + ;; +*) + echo "Warning: Unexpected kernel type" + exit 1 + ;; +esac + +if [ -f $4/$base-$1 ]; then + mv $4/$base-$1 $4/$base-$1.old +fi +cat $2 > $4/$base-$1 + +# Install system map file +if [ -f $4/System.map-$1 ]; then + mv $4/System.map-$1 $4/System.map-$1.old +fi +cp $3 $4/System.map-$1 + +# Install kernel config file +if [ -f $4/config-$1 ]; then + mv $4/config-$1 $4/config-$1.old +fi +cp .config $4/config-$1 -- cgit v1.2.3 From 75cffd392bfabc30ee88836887ea5439ec3b8089 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 30 May 2025 21:45:42 +0800 Subject: LoongArch: Using generic scripts/install.sh in `make install` Use the generic script/install.sh to perform the make install operation. This will automatically generate the initrd file and modify the grub.cfg without manual intervention (The previous kernel image, config file and System.map will also be generated), similar to other architectures. Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 0304eabbe606..64bdb52ddf7c 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -181,9 +181,7 @@ vmlinux.elf vmlinux.efi vmlinuz.efi: vmlinux $(Q)$(MAKE) $(build)=$(boot) $(bootvars-y) $(boot)/$@ install: - $(Q)install -D -m 755 $(KBUILD_IMAGE) $(INSTALL_PATH)/$(image-name-y)-$(KERNELRELEASE) - $(Q)install -D -m 644 .config $(INSTALL_PATH)/config-$(KERNELRELEASE) - $(Q)install -D -m 644 System.map $(INSTALL_PATH)/System.map-$(KERNELRELEASE) + $(call cmd,install) define archhelp echo ' install - install kernel into $(INSTALL_PATH)' -- cgit v1.2.3 From 980d4a42d595a00be2cec6a39ae3bfa6011ffcb3 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 30 May 2025 21:45:42 +0800 Subject: LoongArch: Add some annotations in archhelp - Add annotations to the kernel image. - Modify the annotations of make insatll. Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 64bdb52ddf7c..b0703a4e02a2 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -184,6 +184,11 @@ install: $(call cmd,install) define archhelp - echo ' install - install kernel into $(INSTALL_PATH)' + echo ' vmlinux.elf - Uncompressed ELF kernel image (arch/loongarch/boot/vmlinux.elf)' + echo ' vmlinux.efi - Uncompressed EFI kernel image (arch/loongarch/boot/vmlinux.efi)' + echo ' vmlinuz.efi - GZIP/ZSTD-compressed EFI kernel image (arch/loongarch/boot/vmlinuz.efi)' + echo ' Default when CONFIG_EFI_ZBOOT=y' + echo ' install - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or' + echo ' (distribution) /sbin/$(INSTALLKERNEL) or install.sh to $$(INSTALL_PATH)' echo endef -- cgit v1.2.3 From 93f437315660219245f99724d7597e5b2ea40df3 Mon Sep 17 00:00:00 2001 From: Tianyang Zhang Date: Fri, 30 May 2025 21:45:42 +0800 Subject: LoongArch: Add SCHED_MC (Multi-core scheduler) support In order to achieve more reasonable load balancing behavior, add SCHED_MC (Multi-core scheduler) support. The LLC distribution of LoongArch now is consistent with NUMA node, the balancing domain of SCHED_MC can effectively reduce the situation where processes are awakened to smt_sibling. Co-developed-by: Hongliang Wang Signed-off-by: Hongliang Wang Signed-off-by: Tianyang Zhang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 9 +++++++++ arch/loongarch/include/asm/smp.h | 1 + arch/loongarch/include/asm/topology.h | 8 ++++++++ arch/loongarch/kernel/smp.c | 38 +++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+) (limited to 'arch') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 1a2cf012b8f2..609b15a26621 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -456,6 +456,15 @@ config SCHED_SMT Improves scheduler's performance when there are multiple threads in one physical core. +config SCHED_MC + bool "Multi-core scheduler support" + depends on SMP + default y + help + Multi-core scheduler support improves the CPU scheduler's decision + making when dealing with multi-core CPU chips at a cost of slightly + increased overhead in some places. + config SMP bool "Multi-Processing support" help diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h index b87d1d5e5890..ad0bd234a0f1 100644 --- a/arch/loongarch/include/asm/smp.h +++ b/arch/loongarch/include/asm/smp.h @@ -25,6 +25,7 @@ extern int smp_num_siblings; extern int num_processors; extern int disabled_cpus; extern cpumask_t cpu_sibling_map[]; +extern cpumask_t cpu_llc_shared_map[]; extern cpumask_t cpu_core_map[]; extern cpumask_t cpu_foreign_map[]; diff --git a/arch/loongarch/include/asm/topology.h b/arch/loongarch/include/asm/topology.h index 50273c9187d0..ab206678e47f 100644 --- a/arch/loongarch/include/asm/topology.h +++ b/arch/loongarch/include/asm/topology.h @@ -30,6 +30,14 @@ void numa_set_distance(int from, int to, int distance); #endif #ifdef CONFIG_SMP +/* + * Return cpus that shares the last level cache. + */ +static inline const struct cpumask *cpu_coregroup_mask(int cpu) +{ + return &cpu_llc_shared_map[cpu]; +} + #define topology_physical_package_id(cpu) (cpu_data[cpu].package) #define topology_core_id(cpu) (cpu_data[cpu].core) #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 4b24589c0b56..46036d98da75 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -46,6 +46,10 @@ EXPORT_SYMBOL(__cpu_logical_map); cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); +/* Representing the last level cache shared map of each logical CPU */ +cpumask_t cpu_llc_shared_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_llc_shared_map); + /* Representing the core map of multi-core chips of each logical CPU */ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); @@ -63,6 +67,9 @@ EXPORT_SYMBOL(cpu_foreign_map); /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; +/* representing cpus for which llc shared maps can be computed */ +static cpumask_t cpu_llc_shared_setup_map; + /* representing cpus for which core maps can be computed */ static cpumask_t cpu_core_setup_map; @@ -102,6 +109,34 @@ static inline void set_cpu_core_map(int cpu) } } +static inline void set_cpu_llc_shared_map(int cpu) +{ + int i; + + cpumask_set_cpu(cpu, &cpu_llc_shared_setup_map); + + for_each_cpu(i, &cpu_llc_shared_setup_map) { + if (cpu_to_node(cpu) == cpu_to_node(i)) { + cpumask_set_cpu(i, &cpu_llc_shared_map[cpu]); + cpumask_set_cpu(cpu, &cpu_llc_shared_map[i]); + } + } +} + +static inline void clear_cpu_llc_shared_map(int cpu) +{ + int i; + + for_each_cpu(i, &cpu_llc_shared_setup_map) { + if (cpu_to_node(cpu) == cpu_to_node(i)) { + cpumask_clear_cpu(i, &cpu_llc_shared_map[cpu]); + cpumask_clear_cpu(cpu, &cpu_llc_shared_map[i]); + } + } + + cpumask_clear_cpu(cpu, &cpu_llc_shared_setup_map); +} + static inline void set_cpu_sibling_map(int cpu) { int i; @@ -406,6 +441,7 @@ int loongson_cpu_disable(void) #endif set_cpu_online(cpu, false); clear_cpu_sibling_map(cpu); + clear_cpu_llc_shared_map(cpu); calculate_cpu_foreign_map(); local_irq_save(flags); irq_migrate_all_off_this_cpu(); @@ -572,6 +608,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) current_thread_info()->cpu = 0; loongson_prepare_cpus(max_cpus); set_cpu_sibling_map(0); + set_cpu_llc_shared_map(0); set_cpu_core_map(0); calculate_cpu_foreign_map(); #ifndef CONFIG_HOTPLUG_CPU @@ -613,6 +650,7 @@ asmlinkage void start_secondary(void) loongson_init_secondary(); set_cpu_sibling_map(cpu); + set_cpu_llc_shared_map(cpu); set_cpu_core_map(cpu); notify_cpu_starting(cpu); -- cgit v1.2.3 From b37981ce540dffa64a4664ccf0e20dbef6c2c638 Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Fri, 30 May 2025 21:45:42 +0800 Subject: LoongArch: Enable ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS Provide support for CONFIG_MSEAL_SYSTEM_MAPPINGS on LoongArch, covering the vdso. Link: https://lore.kernel.org/all/25bad37f-273e-4626-999c-e1890be96182@lucifer.local/ Acked-by: Liam R. Howlett Acked-by: Lorenzo Stoakes Reviewed-by: Jeff Xu Tested-by: Yuli Wang Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + arch/loongarch/kernel/vdso.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 609b15a26621..bfc5604c494d 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -69,6 +69,7 @@ config LOONGARCH select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_LTO_CLANG select ARCH_SUPPORTS_LTO_CLANG_THIN + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c index 10cf1608c7b3..7b888d9085a0 100644 --- a/arch/loongarch/kernel/vdso.c +++ b/arch/loongarch/kernel/vdso.c @@ -105,7 +105,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vdso_addr = data_addr + VVAR_SIZE; vma = _install_special_mapping(mm, vdso_addr, info->size, - VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, + VM_READ | VM_EXEC | + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | + VM_SEALED_SYSMAP, &info->code_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); -- cgit v1.2.3 From a45728fd4120011c78af1d056e571b84d47dfcc1 Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 30 May 2025 21:45:42 +0800 Subject: LoongArch: Enable HAVE_ARCH_STACKLEAK Add support for the stackleak feature. It initializes the stack with the poison value before returning from system calls which improves the kernel security. At the same time, disables the plugin in EFI stub code because EFI stub is out of scope for the protection. Tested on Loongson-3A5000 (enable GCC_PLUGIN_STACKLEAK and LKDTM): # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT # dmesg lkdtm: Performing direct entry STACKLEAK_ERASING lkdtm: stackleak stack usage: high offset: 320 bytes current: 448 bytes lowest: 1264 bytes tracked: 1264 bytes untracked: 208 bytes poisoned: 14528 bytes low offset: 64 bytes lkdtm: OK: the rest of the thread stack is properly erased Signed-off-by: Youling Tang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/entry-common.h | 8 +------- arch/loongarch/include/asm/stackframe.h | 6 ++++++ arch/loongarch/include/asm/stacktrace.h | 5 +++++ arch/loongarch/kernel/entry.S | 3 +++ 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index bfc5604c494d..38706186cf13 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -124,6 +124,7 @@ config LOONGARCH select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET select HAVE_ARCH_SECCOMP select HAVE_ARCH_SECCOMP_FILTER + select HAVE_ARCH_STACKLEAK select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD diff --git a/arch/loongarch/include/asm/entry-common.h b/arch/loongarch/include/asm/entry-common.h index 0fe2a098ded9..099132980dc9 100644 --- a/arch/loongarch/include/asm/entry-common.h +++ b/arch/loongarch/include/asm/entry-common.h @@ -2,12 +2,6 @@ #ifndef ARCH_LOONGARCH_ENTRY_COMMON_H #define ARCH_LOONGARCH_ENTRY_COMMON_H -#include -#include - -static inline bool on_thread_stack(void) -{ - return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); -} +#include /* For on_thread_stack() */ #endif diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h index 66736837085b..3eda298702b1 100644 --- a/arch/loongarch/include/asm/stackframe.h +++ b/arch/loongarch/include/asm/stackframe.h @@ -57,6 +57,12 @@ jirl zero, \temp1, 0xc .endm + .macro STACKLEAK_ERASE +#ifdef CONFIG_GCC_PLUGIN_STACKLEAK + bl stackleak_erase_on_task_stack +#endif + .endm + .macro BACKUP_T0T1 csrwr t0, EXCEPTION_KS0 csrwr t1, EXCEPTION_KS1 diff --git a/arch/loongarch/include/asm/stacktrace.h b/arch/loongarch/include/asm/stacktrace.h index fc8b64773794..5c8be156567c 100644 --- a/arch/loongarch/include/asm/stacktrace.h +++ b/arch/loongarch/include/asm/stacktrace.h @@ -31,6 +31,11 @@ bool in_irq_stack(unsigned long stack, struct stack_info *info); bool in_task_stack(unsigned long stack, struct task_struct *task, struct stack_info *info); int get_stack_info(unsigned long stack, struct task_struct *task, struct stack_info *info); +static __always_inline bool on_thread_stack(void) +{ + return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1)); +} + #define STR_LONG_L __stringify(LONG_L) #define STR_LONG_S __stringify(LONG_S) #define STR_LONGSIZE __stringify(LONGSIZE) diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S index 2abc29e57381..47e1db9a1ce4 100644 --- a/arch/loongarch/kernel/entry.S +++ b/arch/loongarch/kernel/entry.S @@ -73,6 +73,7 @@ SYM_CODE_START(handle_syscall) move a0, sp bl do_syscall + STACKLEAK_ERASE RESTORE_ALL_AND_RET SYM_CODE_END(handle_syscall) _ASM_NOKPROBE(handle_syscall) @@ -81,6 +82,7 @@ SYM_CODE_START(ret_from_fork_asm) UNWIND_HINT_REGS move a1, sp bl ret_from_fork + STACKLEAK_ERASE RESTORE_STATIC RESTORE_SOME RESTORE_SP_AND_RET @@ -92,6 +94,7 @@ SYM_CODE_START(ret_from_kernel_thread_asm) move a2, s0 move a3, s1 bl ret_from_kernel_thread + STACKLEAK_ERASE RESTORE_STATIC RESTORE_SOME RESTORE_SP_AND_RET -- cgit v1.2.3 From 9559d5806319a9254d9053b22a31324e1929aac6 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 30 May 2025 21:45:43 +0800 Subject: LoongArch: Increase max supported CPUs up to 2048 Increase max supported CPUs up to 2048, including: 1. Increase CSR.CPUID register's effective width; 2. Define MAX_CORE_PIC (a.k.a. max physical ID) to 2048; 3. Allow NR_CPUS (a.k.a. max logical ID) to be as large as 2048; 4. Introduce acpi_numa_x2apic_affinity_init() to handle ACPI SRAT for CPUID >= 256. Note: The reason of increasing to 2048 rather than 4096/8192 is because the IPI hardware can only support 2048 as a maximum. Reviewed-by: Yanteng Si Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 6 ++--- arch/loongarch/include/asm/acpi.h | 2 +- arch/loongarch/include/asm/loongarch.h | 4 ++-- arch/loongarch/kernel/acpi.c | 41 +++++++++++++++++++++++++++++----- 4 files changed, 41 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 38706186cf13..0c356de91bc4 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -496,10 +496,10 @@ config HOTPLUG_CPU Say N if you want to disable CPU hotplug. config NR_CPUS - int "Maximum number of CPUs (2-256)" - range 2 256 + int "Maximum number of CPUs (2-2048)" + range 2 2048 + default "2048" depends on SMP - default "64" help This allows you to specify the maximum number of CPUs which this kernel will support. diff --git a/arch/loongarch/include/asm/acpi.h b/arch/loongarch/include/asm/acpi.h index 313f66f7913a..7376840fa9f7 100644 --- a/arch/loongarch/include/asm/acpi.h +++ b/arch/loongarch/include/asm/acpi.h @@ -33,7 +33,7 @@ static inline bool acpi_has_cpu_in_madt(void) return true; } -#define MAX_CORE_PIC 256 +#define MAX_CORE_PIC 2048 extern struct list_head acpi_wakeup_device_list; extern struct acpi_madt_core_pic acpi_core_pic[MAX_CORE_PIC]; diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h index 52651aa0e583..d84dac88a584 100644 --- a/arch/loongarch/include/asm/loongarch.h +++ b/arch/loongarch/include/asm/loongarch.h @@ -411,8 +411,8 @@ /* Config CSR registers */ #define LOONGARCH_CSR_CPUID 0x20 /* CPU core id */ -#define CSR_CPUID_COREID_WIDTH 9 -#define CSR_CPUID_COREID _ULCAST_(0x1ff) +#define CSR_CPUID_COREID_WIDTH 11 +#define CSR_CPUID_COREID _ULCAST_(0x7ff) #define LOONGARCH_CSR_PRCFG1 0x21 /* Config1 */ #define CSR_CONF1_VSMAX_SHIFT 12 diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index 1120ac2824f6..fba4bb93e1bd 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -244,11 +244,6 @@ fdt_earlycon: #ifdef CONFIG_ACPI_NUMA -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - void __init numa_set_distance(int from, int to, int distance) { if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) { @@ -280,7 +275,41 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm |= (pa->proximity_domain_hi[1] << 16); pxm |= (pa->proximity_domain_hi[2] << 24); } - node = setup_node(pxm); + node = acpi_map_pxm_to_node(pxm); + if (node < 0) { + pr_err("SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (pa->apic_id >= CONFIG_NR_CPUS) { + pr_info("SRAT: PXM %u -> CPU 0x%02x -> Node %u skipped apicid that is too big\n", + pxm, pa->apic_id, node); + return; + } + + early_numa_add_cpu(pa->apic_id, node); + + set_cpuid_to_node(pa->apic_id, node); + node_set(node, numa_nodes_parsed); + pr_info("SRAT: PXM %u -> CPU 0x%02x -> Node %u\n", pxm, pa->apic_id, node); +} + +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + node = acpi_map_pxm_to_node(pxm); if (node < 0) { pr_err("SRAT: Too many proximity domains %x\n", pxm); bad_srat(); -- cgit v1.2.3 From a24f2fb70cb62180486ad4d74f809ff35ddd1cf9 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 30 May 2025 21:45:43 +0800 Subject: LoongArch: Introduce the numa_memblks conversion Commit 87482708210ff3333a ("mm: introduce numa_memblks") has moved numa_memblks from x86 to the generic code, but LoongArch was left out of this conversion. This patch introduces the generic numa_memblks for LoongArch. In detail: 1. Enable NUMA_MEMBLKS (but disable NUMA_EMU) in Kconfig; 2. Use generic definition for numa_memblk and numa_meminfo; 3. Use generic implementation for numa_add_memblk() and its friends; 4. Use generic implementation for numa_set_distance() and its friends; 5. Use generic implementation for memory_add_physaddr_to_nid() and its friends. Note: Disable NUMA_EMU because it needs more efforts and no obvious demand now. Tested-by: Binbin Zhou Signed-off-by: Yuquan Wang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/numa.h | 14 ----- arch/loongarch/include/asm/sparsemem.h | 5 -- arch/loongarch/include/asm/topology.h | 7 +-- arch/loongarch/kernel/acpi.c | 11 ---- arch/loongarch/kernel/numa.c | 108 ++++----------------------------- arch/loongarch/mm/init.c | 8 --- 7 files changed, 15 insertions(+), 139 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 0c356de91bc4..4b19f93379a1 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -189,6 +189,7 @@ config LOONGARCH select MODULES_USE_ELF_RELA if MODULES select NEED_PER_CPU_EMBED_FIRST_CHUNK select NEED_PER_CPU_PAGE_FIRST_CHUNK + select NUMA_MEMBLKS if NUMA select OF select OF_EARLY_FLATTREE select PCI diff --git a/arch/loongarch/include/asm/numa.h b/arch/loongarch/include/asm/numa.h index b5f9de9f102e..bbf9f70bd25f 100644 --- a/arch/loongarch/include/asm/numa.h +++ b/arch/loongarch/include/asm/numa.h @@ -22,20 +22,6 @@ extern int numa_off; extern s16 __cpuid_to_node[CONFIG_NR_CPUS]; extern nodemask_t numa_nodes_parsed __initdata; -struct numa_memblk { - u64 start; - u64 end; - int nid; -}; - -#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -struct numa_meminfo { - int nr_blks; - struct numa_memblk blk[NR_NODE_MEMBLKS]; -}; - -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); - extern void __init early_numa_add_cpu(int cpuid, s16 node); extern void numa_add_cpu(unsigned int cpu); extern void numa_remove_cpu(unsigned int cpu); diff --git a/arch/loongarch/include/asm/sparsemem.h b/arch/loongarch/include/asm/sparsemem.h index 8d4af6aff8a8..4501efac1a87 100644 --- a/arch/loongarch/include/asm/sparsemem.h +++ b/arch/loongarch/include/asm/sparsemem.h @@ -21,11 +21,6 @@ #define VMEMMAP_SIZE 0 /* 1, For FLATMEM; 2, For SPARSEMEM without VMEMMAP. */ #endif -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 addr); -#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid -#endif - #define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS) #endif /* _LOONGARCH_SPARSEMEM_H */ diff --git a/arch/loongarch/include/asm/topology.h b/arch/loongarch/include/asm/topology.h index ab206678e47f..f06e7ff25bb7 100644 --- a/arch/loongarch/include/asm/topology.h +++ b/arch/loongarch/include/asm/topology.h @@ -19,11 +19,8 @@ extern int pcibus_to_node(struct pci_bus *); #define cpumask_of_pcibus(bus) (cpu_online_mask) -extern unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES]; - -void numa_set_distance(int from, int to, int distance); - -#define node_distance(from, to) (node_distances[(from)][(to)]) +int __node_distance(int from, int to); +#define node_distance(from, to) __node_distance(from, to) #else #define pcibus_to_node(bus) 0 diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index fba4bb93e1bd..a54cd6fd3796 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -244,17 +244,6 @@ fdt_earlycon: #ifdef CONFIG_ACPI_NUMA -void __init numa_set_distance(int from, int to, int distance) -{ - if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - node_distances[from][to] = distance; -} - /* Callback for Proximity Domain -> CPUID mapping */ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 30a72fd528c0..d6e73e8f9c0b 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -27,10 +28,6 @@ #include int numa_off; -unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES]; -EXPORT_SYMBOL(node_distances); - -static struct numa_meminfo numa_meminfo; cpumask_t cpus_on_node[MAX_NUMNODES]; cpumask_t phys_cpus_on_node[MAX_NUMNODES]; EXPORT_SYMBOL(cpus_on_node); @@ -43,8 +40,6 @@ s16 __cpuid_to_node[CONFIG_NR_CPUS] = { }; EXPORT_SYMBOL(__cpuid_to_node); -nodemask_t numa_nodes_parsed __initdata; - #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -145,48 +140,6 @@ void numa_remove_cpu(unsigned int cpu) cpumask_clear_cpu(cpu, &cpus_on_node[nid]); } -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) -{ - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warn("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = PFN_ALIGN(start); - mi->blk[mi->nr_blks].end = PFN_ALIGN(end - PAGE_SIZE + 1); - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - static void __init node_mem_init(unsigned int node) { unsigned long start_pfn, end_pfn; @@ -205,18 +158,6 @@ static void __init node_mem_init(unsigned int node) #ifdef CONFIG_ACPI_NUMA -static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type) -{ - static unsigned long num_physpages; - - num_physpages += (size >> PAGE_SHIFT); - pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", - node, type, start, size); - pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", - start >> PAGE_SHIFT, (start + size) >> PAGE_SHIFT, num_physpages); - memblock_set_node(start, size, &memblock.memory, node); -} - /* * add_numamem_region * @@ -228,28 +169,21 @@ static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type */ static void __init add_numamem_region(u64 start, u64 end, u32 type) { - u32 i; - u64 ofs = start; + u32 node = pa_to_nid(start); + u64 size = end - start; + static unsigned long num_physpages; if (start >= end) { pr_debug("Invalid region: %016llx-%016llx\n", start, end); return; } - for (i = 0; i < numa_meminfo.nr_blks; i++) { - struct numa_memblk *mb = &numa_meminfo.blk[i]; - - if (ofs > mb->end) - continue; - - if (end > mb->end) { - add_node_intersection(mb->nid, ofs, mb->end - ofs, type); - ofs = mb->end; - } else { - add_node_intersection(mb->nid, ofs, end - ofs, type); - break; - } - } + num_physpages += (size >> PAGE_SHIFT); + pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n", + node, type, start, size); + pr_info(" start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n", + start >> PAGE_SHIFT, end >> PAGE_SHIFT, num_physpages); + memblock_set_node(start, size, &memblock.memory, node); } static void __init init_node_memblock(void) @@ -291,24 +225,6 @@ static void __init init_node_memblock(void) } } -static void __init numa_default_distance(void) -{ - int row, col; - - for (row = 0; row < MAX_NUMNODES; row++) - for (col = 0; col < MAX_NUMNODES; col++) { - if (col == row) - node_distances[row][col] = LOCAL_DISTANCE; - else - /* We assume that one node per package here! - * - * A SLIT should be used for multiple nodes - * per package to override default setting. - */ - node_distances[row][col] = REMOTE_DISTANCE; - } -} - /* * fake_numa_init() - For Non-ACPI systems * Return: 0 on success, -errno on failure. @@ -333,11 +249,11 @@ int __init init_numa_memory(void) for (i = 0; i < NR_CPUS; i++) set_cpuid_to_node(i, NUMA_NO_NODE); - numa_default_distance(); + numa_reset_distance(); nodes_clear(numa_nodes_parsed); nodes_clear(node_possible_map); nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + WARN_ON(memblock_clear_hotplug(0, PHYS_ADDR_MAX)); /* Parse SRAT and SLIT if provided by firmware. */ ret = acpi_disabled ? fake_numa_init() : acpi_numa_init(); diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 06f11d9e4ec1..c3e4586a7975 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -106,14 +106,6 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) page += vmem_altmap_offset(altmap); __remove_pages(start_pfn, nr_pages, altmap); } - -#ifdef CONFIG_NUMA -int memory_add_physaddr_to_nid(u64 start) -{ - return pa_to_nid(start); -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -- cgit v1.2.3 From 52c22661c79a7b6af7fad9f77200738fc6c51878 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 30 May 2025 21:45:48 +0800 Subject: LoongArch: Avoid using $r0/$r1 as "mask" for csrxchg When building kernel with LLVM there are occasionally such errors: In file included from ./include/linux/spinlock.h:59: In file included from ./include/linux/irqflags.h:17: arch/loongarch/include/asm/irqflags.h:38:3: error: must not be $r0 or $r1 38 | "csrxchg %[val], %[mask], %[reg]\n\t" | ^ :1:16: note: instantiated into assembly here 1 | csrxchg $a1, $ra, 0 | ^ To prevent the compiler from allocating $r0 or $r1 for the "mask" of the csrxchg instruction, the 'q' constraint must be used but Clang < 21 does not support it. So force to use $t0 in the inline asm, in order to avoid using $r0/$r1 while keeping the backward compatibility. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/pull/141037 Reviewed-by: Yanteng Si Suggested-by: WANG Rui Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/irqflags.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/irqflags.h b/arch/loongarch/include/asm/irqflags.h index 319a8c616f1f..003172b8406b 100644 --- a/arch/loongarch/include/asm/irqflags.h +++ b/arch/loongarch/include/asm/irqflags.h @@ -14,40 +14,48 @@ static inline void arch_local_irq_enable(void) { u32 flags = CSR_CRMD_IE; + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); } static inline void arch_local_irq_disable(void) { u32 flags = 0; + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); } static inline unsigned long arch_local_irq_save(void) { u32 flags = 0; + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); return flags; } static inline void arch_local_irq_restore(unsigned long flags) { + register u32 mask asm("t0") = CSR_CRMD_IE; + __asm__ __volatile__( "csrxchg %[val], %[mask], %[reg]\n\t" : [val] "+r" (flags) - : [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) : "memory"); } -- cgit v1.2.3 From 712e6a914328e9672d534fc77edd596234ec2d7f Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 30 May 2025 21:45:48 +0800 Subject: LoongArch: Preserve firmware configuration when desired If we must preserve the firmware resource assignments, claim the existing resources rather than reassigning everything. According to PCI Firmware Specification: if ACPI DSM#5 function returns 0, the OS must retain the resource allocation for PCI in the firmware; if ACPI DSM#5 function returns 1, the OS can ignore the resource allocation for PCI and reallocate it. Signed-off-by: Qihang Gao Signed-off-by: Juxin Gao Signed-off-by: Huacai Chen --- arch/loongarch/pci/acpi.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/pci/acpi.c b/arch/loongarch/pci/acpi.c index 1da4dc46df43..50c9016641a4 100644 --- a/arch/loongarch/pci/acpi.c +++ b/arch/loongarch/pci/acpi.c @@ -194,6 +194,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) { struct pci_bus *bus; struct pci_root_info *info; + struct pci_host_bridge *host; struct acpi_pci_root_ops *root_ops; int domain = root->segment; int busnum = root->secondary.start; @@ -237,8 +238,17 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) return NULL; } - pci_bus_size_bridges(bus); - pci_bus_assign_resources(bus); + /* If we must preserve the resource configuration, claim now */ + host = pci_find_host_bridge(bus); + if (host->preserve_config) + pci_bus_claim_resources(bus); + + /* + * Assign whatever was left unassigned. If we didn't claim above, + * this will reassign everything. + */ + pci_assign_unassigned_root_bus_resources(bus); + list_for_each_entry(child, &bus->children, node) pcie_bus_configure_settings(child); } -- cgit v1.2.3 From ee084fa96123ede8b0563a1b5a9b23adc43cd50d Mon Sep 17 00:00:00 2001 From: Tianyang Zhang Date: Fri, 30 May 2025 21:45:57 +0800 Subject: LoongArch: Fix panic caused by NULL-PMD in huge_pte_offset() ERROR INFO: CPU 25 Unable to handle kernel paging request at virtual address 0x0 ... Call Trace: [<900000000023c30c>] huge_pte_offset+0x3c/0x58 [<900000000057fd4c>] hugetlb_follow_page_mask+0x74/0x438 [<900000000051fee8>] __get_user_pages+0xe0/0x4c8 [<9000000000522414>] faultin_page_range+0x84/0x380 [<9000000000564e8c>] madvise_vma_behavior+0x534/0xa48 [<900000000056689c>] do_madvise+0x1bc/0x3e8 [<9000000000566df4>] sys_madvise+0x24/0x38 [<90000000015b9e88>] do_syscall+0x78/0x98 [<9000000000221f18>] handle_syscall+0xb8/0x158 In some cases, pmd may be NULL and rely on NULL as the return value for processing, so it is necessary to determine this situation here. Cc: stable@vger.kernel.org Fixes: bd51834d1cf6 ("LoongArch: Return NULL from huge_pte_offset() for invalid PMD") Signed-off-by: Tianyang Zhang Signed-off-by: Huacai Chen --- arch/loongarch/mm/hugetlbpage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c index cea84d7f2b91..02dad4624fe3 100644 --- a/arch/loongarch/mm/hugetlbpage.c +++ b/arch/loongarch/mm/hugetlbpage.c @@ -47,7 +47,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, pmd = pmd_offset(pud, addr); } } - return pmd_none(pmdp_get(pmd)) ? NULL : (pte_t *) pmd; + + return (!pmd || pmd_none(pmdp_get(pmd))) ? NULL : (pte_t *) pmd; } uint64_t pmd_to_entrylo(unsigned long pmd_val) -- cgit v1.2.3 From 99850a1c93fe7ca40ad9efddc00acec6e85c5e48 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 29 May 2025 13:10:24 -0400 Subject: x86/fpu: Remove unused trace events The following trace events are not used and defining them just wastes memory: x86_fpu_before_restore x86_fpu_after_restore x86_fpu_init_state Simply remove them. Signed-off-by: Steven Rostedt (Google) Signed-off-by: Ingo Molnar Cc: H. Peter Anvin Cc: Dave Hansen Cc: Linux Trace Kernel Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Oleg Nesterov Link: https://lore.kernel.org/all/20250529130138.544ffec4@gandalf.local.home # background Link: https://lore.kernel.org/r/20250529131024.7c2ef96f@gandalf.local.home # x86 submission --- arch/x86/include/asm/trace/fpu.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 0454d5e60e5d..721b408d9a67 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -44,16 +44,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_after_save, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_before_restore, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - -DEFINE_EVENT(x86_fpu, x86_fpu_after_restore, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -64,11 +54,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_init_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_dropped, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) -- cgit v1.2.3 From 86aa94cd50b138be0dd872b0779fa3036e641881 Mon Sep 17 00:00:00 2001 From: Dapeng Mi Date: Thu, 29 May 2025 08:02:36 +0000 Subject: perf/x86/intel: Fix incorrect MSR index calculations in intel_pmu_config_acr() The MSR offset calculations in intel_pmu_config_acr() are buggy. To calculate fixed counter MSR addresses in intel_pmu_config_acr(), the HW counter index "idx" is subtracted by INTEL_PMC_IDX_FIXED. This leads to the ACR mask value of fixed counters to be incorrectly saved to the positions of GP counters in acr_cfg_b[], e.g. For fixed counter 0, its ACR counter mask should be saved to acr_cfg_b[32], but it's saved to acr_cfg_b[0] incorrectly. Fix this issue. [ mingo: Clarified & improved the changelog. ] Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload") Signed-off-by: Dapeng Mi Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250529080236.2552247-2-dapeng1.mi@linux.intel.com --- arch/x86/events/intel/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 466283326630..741b229f0718 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2900,6 +2900,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int msr_b, msr_c; + int msr_offset; if (!mask && !cpuc->acr_cfg_b[idx]) return; @@ -2907,19 +2908,20 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload) if (idx < INTEL_PMC_IDX_FIXED) { msr_b = MSR_IA32_PMC_V6_GP0_CFG_B; msr_c = MSR_IA32_PMC_V6_GP0_CFG_C; + msr_offset = x86_pmu.addr_offset(idx, false); } else { msr_b = MSR_IA32_PMC_V6_FX0_CFG_B; msr_c = MSR_IA32_PMC_V6_FX0_CFG_C; - idx -= INTEL_PMC_IDX_FIXED; + msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false); } if (cpuc->acr_cfg_b[idx] != mask) { - wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask); + wrmsrl(msr_b + msr_offset, mask); cpuc->acr_cfg_b[idx] = mask; } /* Only need to update the reload value when there is a valid config value. */ if (mask && cpuc->acr_cfg_c[idx] != reload) { - wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload); + wrmsrl(msr_c + msr_offset, reload); cpuc->acr_cfg_c[idx] = reload; } } -- cgit v1.2.3 From b8c9c3b822fe8e033b9802516f6466099d915488 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 9 May 2025 10:40:52 +0200 Subject: um: stop using PCI port I/O arch/um is one of the last users of CONFIG_GENERIC_IOMAP, but upon closer look it appears that the PCI host bridge does not register any port I/O, and the absense of both custom inb/outb functions and a PCI_IOBASE constant means that actually trying to use port I/O results on a NULL pointer access. Build testing with clang confirms this by warning about this exact problem: include/asm-generic/io.h:549:31: error: performing pointer arithmetic on a null pointer has undefined behavior [-Werror,-Wnull-pointer-arithmetic] 549 | val = __raw_readb(PCI_IOBASE + addr); | ~~~~~~~~~~ ^ Remove all the Kconfig selects that refer to legacy port I/O and instead just build the normal MMIO path that is emulated by the virtio PCI host. Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/20250509084125.1488601-1-arnd@kernel.org Signed-off-by: Johannes Berg --- arch/um/Kconfig | 6 ------ arch/um/kernel/Makefile | 1 - arch/um/kernel/ioport.c | 13 ------------- 3 files changed, 20 deletions(-) delete mode 100644 arch/um/kernel/ioport.c (limited to 'arch') diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 79509c7f39de..f08e8a7fac93 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -52,13 +52,7 @@ config NO_IOMEM config UML_IOMEM_EMULATION bool select INDIRECT_IOMEM - select HAS_IOPORT select GENERIC_PCI_IOMAP - select GENERIC_IOMAP - select NO_GENERIC_PCI_IOPORT_MAP - -config NO_IOPORT_MAP - def_bool !UML_IOMEM_EMULATION config ISA bool diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 4df1cd0d2017..4669db2aa9be 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -25,7 +25,6 @@ obj-$(CONFIG_GPROF) += gprof_syms.o obj-$(CONFIG_OF) += dtb.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o USER_OBJS := config.o diff --git a/arch/um/kernel/ioport.c b/arch/um/kernel/ioport.c deleted file mode 100644 index 7220615b3beb..000000000000 --- a/arch/um/kernel/ioport.c +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2021 Intel Corporation - * Author: Johannes Berg - */ -#include -#include - -void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port, - unsigned int nr) -{ - return NULL; -} -- cgit v1.2.3 From fd054188999ff19746cc09f4e0f196a113964db9 Mon Sep 17 00:00:00 2001 From: Yongting Lin Date: Tue, 27 May 2025 23:12:22 +0800 Subject: um: Fix tgkill compile error on old host OSes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tgkill is a quite old syscall since kernel 2.5.75, but unfortunately glibc doesn't support it before 2.30. Thus some systems fail to compile the latest UserMode Linux. Here is the compile error I encountered when I tried to compile UML in my system shipped with glibc-2.28. CALL scripts/checksyscalls.sh CC arch/um/os-Linux/sigio.o In file included from arch/um/os-Linux/sigio.c:17: arch/um/os-Linux/sigio.c: In function ‘write_sigio_thread’: arch/um/os-Linux/sigio.c:49:19: error: implicit declaration of function ‘tgkill’; did you mean ‘kill’? [-Werror=implicit-function-declaration] CATCH_EINTR(r = tgkill(pid, pid, SIGIO)); ^~~~~~ ./arch/um/include/shared/os.h:21:48: note: in definition of macro ‘CATCH_EINTR’ #define CATCH_EINTR(expr) while ((errno = 0, ((expr) < 0)) && (errno == EINTR)) ^~~~ cc1: some warnings being treated as errors Fix it by Replacing glibc call with raw syscall. Fixes: 33c9da5dfb18 ("um: Rewrite the sigio workaround based on epoll and tgkill") Signed-off-by: Yongting Lin Link: https://patch.msgid.link/20250527151222.40371-1-linyongting@gmail.com Signed-off-by: Johannes Berg --- arch/um/os-Linux/sigio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c index a05a6ecee756..6de145f8fe3d 100644 --- a/arch/um/os-Linux/sigio.c +++ b/arch/um/os-Linux/sigio.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,7 @@ static void *write_sigio_thread(void *unused) __func__, errno); } - CATCH_EINTR(r = tgkill(pid, pid, SIGIO)); + CATCH_EINTR(r = syscall(__NR_tgkill, pid, pid, SIGIO)); if (r < 0) printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n", __func__, errno); -- cgit v1.2.3 From 10eabeca45fdfa48e8efd2c7fc0fd97314b65266 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 5 May 2025 10:33:59 +0200 Subject: um: chan_kern: use raw spinlock for irqs_to_free_lock Since this is called deep in the ARCH=um IRQ infrastructure it must use a raw spinlock. It's not really part of the driver, but rather the core UML IRQ code. Link: https://patch.msgid.link/20250505103358.ae7dc659f8b4.I64ca7aece30e0b4b0b5b35ad89cdd63db197c0ce@changeid Signed-off-by: Johannes Berg --- arch/um/drivers/chan_kern.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index e78a99816c86..26442db7d608 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -212,7 +212,7 @@ int enable_chan(struct line *line) * be permanently disabled. This is discovered in IRQ context, but * the freeing of the IRQ must be done later. */ -static DEFINE_SPINLOCK(irqs_to_free_lock); +static DEFINE_RAW_SPINLOCK(irqs_to_free_lock); static LIST_HEAD(irqs_to_free); void free_irqs(void) @@ -222,9 +222,9 @@ void free_irqs(void) struct list_head *ele; unsigned long flags; - spin_lock_irqsave(&irqs_to_free_lock, flags); + raw_spin_lock_irqsave(&irqs_to_free_lock, flags); list_splice_init(&irqs_to_free, &list); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); + raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags); list_for_each(ele, &list) { chan = list_entry(ele, struct chan, free_list); @@ -246,9 +246,9 @@ static void close_one_chan(struct chan *chan, int delay_free_irq) return; if (delay_free_irq) { - spin_lock_irqsave(&irqs_to_free_lock, flags); + raw_spin_lock_irqsave(&irqs_to_free_lock, flags); list_add(&chan->free_list, &irqs_to_free); - spin_unlock_irqrestore(&irqs_to_free_lock, flags); + raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags); } else { if (chan->input && chan->enabled) um_free_irq(chan->line->read_irq, chan); -- cgit v1.2.3 From 477c1c21da0d6ffc4dfe924178b90377f2938ba5 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Tue, 6 May 2025 12:51:16 +0800 Subject: um: vector: Clean up and modernize log messages Use pr_*() and netdev_*() to print log messages. While at it, join split messages for easier grepping. Suggested-by: Geert Uytterhoeven Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250506045117.1896661-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 0d5c96897fa0..7822421057ea 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -8,6 +8,8 @@ * Copyright (C) 2001 by various other people who didn't put their name here. */ +#define pr_fmt(fmt) "uml-vector: " fmt + #include #include #include @@ -1551,41 +1553,33 @@ static void vector_setup_etheraddr(struct net_device *dev, char *str) addr[i] = simple_strtoul(str, &end, 16); if ((end == str) || ((*end != ':') && (*end != ',') && (*end != '\0'))) { - printk(KERN_ERR - "setup_etheraddr: failed to parse '%s' " - "as an ethernet address\n", str); + netdev_err(dev, + "Failed to parse '%s' as an ethernet address\n", str); goto random; } str = end + 1; } if (is_multicast_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign a multicast ethernet address to a " - "device disallowed\n"); + netdev_err(dev, + "Attempt to assign a multicast ethernet address to a device disallowed\n"); goto random; } if (!is_valid_ether_addr(addr)) { - printk(KERN_ERR - "Attempt to assign an invalid ethernet address to a " - "device disallowed\n"); + netdev_err(dev, + "Attempt to assign an invalid ethernet address to a device disallowed\n"); goto random; } if (!is_local_ether_addr(addr)) { - printk(KERN_WARNING - "Warning: Assigning a globally valid ethernet " - "address to a device\n"); - printk(KERN_WARNING "You should set the 2nd rightmost bit in " - "the first byte of the MAC,\n"); - printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", - addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], - addr[5]); + netdev_warn(dev, "Warning: Assigning a globally valid ethernet address to a device\n"); + netdev_warn(dev, "You should set the 2nd rightmost bit in the first byte of the MAC,\n"); + netdev_warn(dev, "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n", + addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], addr[5]); } eth_hw_addr_set(dev, addr); return; random: - printk(KERN_INFO - "Choosing a random ethernet address for device %s\n", dev->name); + netdev_info(dev, "Choosing a random ethernet address\n"); eth_hw_addr_random(dev); } @@ -1601,14 +1595,12 @@ static void vector_eth_configure( device = kzalloc(sizeof(*device), GFP_KERNEL); if (device == NULL) { - printk(KERN_ERR "eth_configure failed to allocate struct " - "vector_device\n"); + pr_err("Failed to allocate struct vector_device for vec%d\n", n); return; } dev = alloc_etherdev(sizeof(struct vector_private)); if (dev == NULL) { - printk(KERN_ERR "eth_configure: failed to allocate struct " - "net_device for vec%d\n", n); + pr_err("Failed to allocate struct net_device for vec%d\n", n); goto out_free_device; } @@ -1738,8 +1730,7 @@ static int __init vector_setup(char *str) err = vector_parse(str, &n, &str, &error); if (err) { - printk(KERN_ERR "vector_setup - Couldn't parse '%s' : %s\n", - str, error); + pr_err("Couldn't parse '%s': %s\n", str, error); return 1; } new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); -- cgit v1.2.3 From b76d18b53a3f52880452a58cc982910abcccd111 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Tue, 6 May 2025 12:51:17 +0800 Subject: um: vector: Use mac_pton() for MAC address parsing Use mac_pton() instead of custom approach. Suggested-by: Geert Uytterhoeven Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250506045117.1896661-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 7822421057ea..5226d2c52e6a 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1543,21 +1543,14 @@ static void vector_timer_expire(struct timer_list *t) static void vector_setup_etheraddr(struct net_device *dev, char *str) { u8 addr[ETH_ALEN]; - char *end; - int i; if (str == NULL) goto random; - for (i = 0; i < 6; i++) { - addr[i] = simple_strtoul(str, &end, 16); - if ((end == str) || - ((*end != ':') && (*end != ',') && (*end != '\0'))) { - netdev_err(dev, - "Failed to parse '%s' as an ethernet address\n", str); - goto random; - } - str = end + 1; + if (!mac_pton(str, addr)) { + netdev_err(dev, + "Failed to parse '%s' as an ethernet address\n", str); + goto random; } if (is_multicast_ether_addr(addr)) { netdev_err(dev, -- cgit v1.2.3 From e21560b7d33c4f692cc9cd5b75ff09024f5a69d2 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 29 May 2025 09:35:08 +0200 Subject: arm64: Disable LLD linker ASSERT()s for the time being It turns out [1] that the way LLD handles ASSERT()s in the linker script can result in spurious failures, so disable them for the newly introduced BSS symbol export checks. Since we're not aware of any issues with the existing assertions in vmlinux.lds.S, leave those alone for now so that they can continue to provide useful coverage. A linker fix [2] is due to land in version 21 of LLD. Link: https://lore.kernel.org/r/202505261019.OUlitN6m-lkp@intel.com [1] Link: https://github.com/llvm/llvm-project/commit/5859863bab7f [2] Link: https://github.com/ClangBuiltLinux/linux/issues/2094 Signed-off-by: Ard Biesheuvel Tested-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505261019.OUlitN6m-lkp@intel.com/ Link: https://lore.kernel.org/r/20250529073507.2984959-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/kernel/image-vars.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c5266430284b..86f088a16147 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -10,6 +10,10 @@ #error This file should only be included in vmlinux.lds.S #endif +#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000 +#define ASSERT(...) +#endif + #define PI_EXPORT_SYM(sym) \ __PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code) #define __PI_EXPORT_SYM(sym, pisym, msg)\ @@ -142,4 +146,6 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +#undef ASSERT + #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ -- cgit v1.2.3 From dc0a083948040ff364d065da8bb50c29f77a39ad Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 31 May 2025 14:30:05 +0200 Subject: arm64: Work around convergence issue with LLD linker LLD will occasionally error out with a '__init_end does not converge' error if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a circular dependency. Counter this by dimensioning the initial IDMAP page tables based on a new boundary marker 'kimage_limit', and define it such that its value should not change as a result of the initdata segment being pushed over a 64k segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its value doesn't change by more than 2M between linker passes. Reported-by: Arnd Bergmann Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250531123005.3866382-2-ardb+git@google.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/kernel-pgtable.h | 2 +- arch/arm64/kernel/image-vars.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 9e93733523f6..74a4f738c5f5 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -58,7 +58,7 @@ #define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(SWAPPER_PGTABLE_LEVELS, KIMAGE_VADDR, _end, EXTRA_PAGE) \ + EARLY_SEGMENT_EXTRA_PAGES)) -#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, _end, 1)) +#define INIT_IDMAP_DIR_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, kimage_limit, 1)) #define INIT_IDMAP_DIR_SIZE ((INIT_IDMAP_DIR_PAGES + EARLY_IDMAP_EXTRA_PAGES) * PAGE_SIZE) #define INIT_IDMAP_FDT_PAGES (EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, 0UL, UL(MAX_FDT_SIZE), 1) - 1) diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 86f088a16147..1cee48feb309 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -146,6 +146,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized); _kernel_codesize = ABSOLUTE(__inittext_end - _text); #endif +/* + * LLD will occasionally error out with a '__init_end does not converge' error + * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a + * circular dependency. Counter this by dimensioning the initial IDMAP page + * tables based on kimage_limit, which is defined such that its value should + * not change as a result of the initdata segment being pushed over a 64k + * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its + * value doesn't change by more than 2M between linker passes. + */ +kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M); + #undef ASSERT #endif /* __ARM64_KERNEL_IMAGE_VARS_H */ -- cgit v1.2.3 From 4b634918384c0f84c33aeb4dd9fd4c38e7be5ccb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 30 May 2025 16:23:47 +0100 Subject: arm64/mm: Close theoretical race where stale TLB entry remains valid Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries") describes a race that, prior to the commit, could occur between reclaim and operations such as mprotect() when using reclaim's tlbbatch mechanism. See that commit for details but the summary is: """ Nadav Amit identified a theoritical race between page reclaim and mprotect due to TLB flushes being batched outside of the PTL being held. He described the race as follows: CPU0 CPU1 ---- ---- user accesses memory using RW PTE [PTE now cached in TLB] try_to_unmap_one() ==> ptep_get_and_clear() ==> set_tlb_ubc_flush_pending() mprotect(addr, PROT_READ) ==> change_pte_range() ==> [ PTE non-present - no flush ] user writes using cached RW PTE ... try_to_unmap_flush() """ The solution was to insert flush_tlb_batched_pending() in mprotect() and friends to explcitly drain any pending reclaim TLB flushes. In the modern version of this solution, arch_flush_tlb_batched_pending() is called to do that synchronisation. arm64's tlbbatch implementation simply issues TLBIs at queue-time (arch_tlbbatch_add_pending()), eliding the trailing dsb(ish). The trailing dsb(ish) is finally issued in arch_tlbbatch_flush() at the end of the batch to wait for all the issued TLBIs to complete. Now, the Arm ARM states: """ The completion of the TLB maintenance instruction is guaranteed only by the execution of a DSB by the observer that performed the TLB maintenance instruction. The execution of a DSB by a different observer does not have this effect, even if the DSB is known to be executed after the TLB maintenance instruction is observed by that different observer. """ arch_tlbbatch_add_pending() and arch_tlbbatch_flush() conform to this requirement because they are called from the same task (either kswapd or caller of madvise(MADV_PAGEOUT)), so either they are on the same CPU or if the task was migrated, __switch_to() contains an extra dsb(ish). HOWEVER, arm64's arch_flush_tlb_batched_pending() is also implemented as a dsb(ish). But this may be running on a CPU remote from the one that issued the outstanding TLBIs. So there is no architectural gurantee of synchonization. Therefore we are still vulnerable to the theoretical race described in Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries"). Fix this by flushing the entire mm in arch_flush_tlb_batched_pending(). This aligns with what the other arches that implement the tlbbatch feature do. Cc: Fixes: 43b3dfdd0455 ("arm64: support batched/deferred tlb shootdown during page reclamation/migration") Signed-off-by: Ryan Roberts Link: https://lore.kernel.org/r/20250530152445.2430295-1-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/tlbflush.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index eba1a98657f1..aa9efee17277 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -323,13 +323,14 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) } /* - * If mprotect/munmap/etc occurs during TLB batched flushing, we need to - * synchronise all the TLBI issued with a DSB to avoid the race mentioned in - * flush_tlb_batched_pending(). + * If mprotect/munmap/etc occurs during TLB batched flushing, we need to ensure + * all the previously issued TLBIs targeting mm have completed. But since we + * can be executing on a remote CPU, a DSB cannot guarantee this like it can + * for arch_tlbbatch_flush(). Our only option is to flush the entire mm. */ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { - dsb(ish); + flush_tlb_mm(mm); } /* -- cgit v1.2.3 From 10f885d63a0efd50b0d22bf27eb3cf727838e99e Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Mon, 2 Jun 2025 12:33:21 +0800 Subject: arm64: Add override for MPAM As the message of the commit 09e6b306f3ba ("arm64: cpufeature: discover CPU support for MPAM") already states, if a buggy firmware fails to either enable MPAM or emulate the trap as if it were disabled, the kernel will just fail to boot. While upgrading the firmware should be the best solution, we have some hardware of which the vendor have made no response 2 months after we requested a firmware update. Allow overriding it so our devices don't become some e-waste. Cc: James Morse Cc: Marc Zyngier Cc: Will Deacon Cc: Shameer Kolothum Cc: Mingcong Bai Cc: Shaopeng Tan Cc: Ben Horgan Signed-off-by: Xi Ruoyao Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20250602043723.216338-1-xry111@xry111.site Signed-off-by: Will Deacon --- arch/arm64/include/asm/el2_setup.h | 24 ++++++++++-------------- arch/arm64/kernel/cpufeature.c | 7 +++++-- arch/arm64/kernel/cpuinfo.c | 7 +++++-- arch/arm64/kernel/pi/idreg-override.c | 3 +++ 4 files changed, 23 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 30f57b0334a3..af7807f11df4 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -298,19 +298,6 @@ .Lskip_gcs_\@: .endm -.macro __init_el2_mpam - /* Memory Partitioning And Monitoring: disable EL2 traps */ - mrs x1, id_aa64pfr0_el1 - ubfx x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4 - cbz x0, .Lskip_mpam_\@ // skip if no MPAM - msr_s SYS_MPAM2_EL2, xzr // use the default partition - // and disable lower traps - mrs_s x0, SYS_MPAMIDR_EL1 - tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg - msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 -.Lskip_mpam_\@: -.endm - /** * Initialize EL2 registers to sane values. This should be called early on all * cores that were booted in EL2. Note that everything gets initialised as @@ -328,7 +315,6 @@ __init_el2_stage2 __init_el2_gicv3 __init_el2_hstr - __init_el2_mpam __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt @@ -375,6 +361,16 @@ #endif .macro finalise_el2_state + check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2 + +.Linit_mpam_\@: + msr_s SYS_MPAM2_EL2, xzr // use the default partition + // and disable lower traps + mrs_s x0, SYS_MPAMIDR_EL1 + tbz x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@ // skip if no MPAMHCR reg + msr_s SYS_MPAMHCR_EL2, xzr // clear TRAP_MPAMIDR_EL1 -> EL2 + +.Lskip_mpam_\@: check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2 .Linit_sve_\@: /* SVE register access */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 379c82d22c75..fcc20d13e938 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1198,8 +1198,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info) cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr); + } if (id_aa64pfr1_mte(info->reg_id_aa64pfr1)) init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid); @@ -1452,7 +1454,8 @@ void update_cpu_features(int cpu, cpacr_restore(cpacr); } - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) { + if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) { + info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu, info->reg_mpamidr, boot->reg_mpamidr); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 94525abd1c22..c1f2b6b04b41 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -496,8 +496,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) __cpuinfo_store_cpu_32bit(&info->aarch32); - if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) - info->reg_mpamidr = read_cpuid(MPAMIDR_EL1); + /* + * info->reg_mpamidr deferred to {init,update}_cpu_features because we + * don't want to read it (and trigger a trap on buggy firmware) if + * using an aa64pfr0_el1 override to unconditionally disable MPAM. + */ if (IS_ENABLED(CONFIG_ARM64_SME) && id_aa64pfr1_sme(info->reg_id_aa64pfr1)) { diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c index c6b185b885f7..bc57b290e5e7 100644 --- a/arch/arm64/kernel/pi/idreg-override.c +++ b/arch/arm64/kernel/pi/idreg-override.c @@ -127,6 +127,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = { .fields = { FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter), FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL), + FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL), {} }, }; @@ -154,6 +155,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = { FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL), FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL), FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter), + FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL), {} }, }; @@ -246,6 +248,7 @@ static const struct { { "rodata=off", "arm64_sw.rodataoff=1" }, { "arm64.nolva", "id_aa64mmfr2.varange=0" }, { "arm64.no32bit_el0", "id_aa64pfr0.el0=1" }, + { "arm64.nompam", "id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" }, }; static int __init parse_hexdigit(const char *p, u64 *v) -- cgit v1.2.3 From 247ed9e4a6869f3bf07bffd277a341a6833abdbc Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:46 +0200 Subject: um: Move faultinfo extraction into userspace routine The segv handler is called slightly differently depending on whether PTRACE_FULL_FAULTINFO is set or not (32bit vs. 64bit). The only difference is that we don't try to pass the registers and instruction pointer to the segv handler. It would be good to either document or remove the difference, but I do not know why this difference exists. And, passing NULL can even result in a crash. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/os-Linux/skas/process.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index ae2aea062f06..97c2f964f5fc 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -163,12 +163,6 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi) memcpy(fi, (void *)current_stub_stack(), sizeof(*fi)); } -static void handle_segv(int pid, struct uml_pt_regs *regs) -{ - get_skas_faultinfo(pid, ®s->faultinfo); - segv(regs->faultinfo, 0, 1, NULL, NULL); -} - static void handle_trap(int pid, struct uml_pt_regs *regs) { if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END)) @@ -521,13 +515,14 @@ void userspace(struct uml_pt_regs *regs) switch (sig) { case SIGSEGV: - if (PTRACE_FULL_FAULTINFO) { - get_skas_faultinfo(pid, - ®s->faultinfo); + get_skas_faultinfo(pid, ®s->faultinfo); + + if (PTRACE_FULL_FAULTINFO) (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, regs, NULL); - } - else handle_segv(pid, regs); + else + segv(regs->faultinfo, 0, 1, NULL, NULL); + break; case SIGTRAP + 0x80: handle_trap(pid, regs); -- cgit v1.2.3 From dac494bf54f764a114f16621ef04f534dd754ac1 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:47 +0200 Subject: um: Add stub side of SECCOMP/futex based process handling This adds the stub side for the new seccomp process management code. In this case we do register save/restore through the signal handler mcontext. Add special code for handling TLS, which for x86_64 means setting the FS_BASE/GS_BASE registers while for i386 it means calling the set_thread_area syscall. Co-authored-by: Johannes Berg Signed-off-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/common-offsets.h | 2 ++ arch/um/include/shared/skas/stub-data.h | 14 ++++++++++ arch/um/kernel/skas/stub.c | 49 +++++++++++++++++++++++++++++++++ arch/x86/um/shared/sysdep/stub-data.h | 23 ++++++++++++++++ arch/x86/um/shared/sysdep/stub.h | 2 ++ arch/x86/um/shared/sysdep/stub_32.h | 13 +++++++++ arch/x86/um/shared/sysdep/stub_64.h | 17 ++++++++++++ 7 files changed, 120 insertions(+) create mode 100644 arch/x86/um/shared/sysdep/stub-data.h (limited to 'arch') diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 73f3a4792ed8..93e7097a2922 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -14,3 +14,5 @@ DEFINE(UM_THREAD_SIZE, THREAD_SIZE); DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); + +DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 81a4cace032c..81ac2cd12112 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -11,6 +11,10 @@ #include #include #include +#include + +#define FUTEX_IN_CHILD 0 +#define FUTEX_IN_KERN 1 struct stub_init_data { unsigned long stub_start; @@ -52,6 +56,16 @@ struct stub_data { /* 128 leaves enough room for additional fields in the struct */ struct stub_syscall syscall_data[(UM_KERN_PAGE_SIZE - 128) / sizeof(struct stub_syscall)] __aligned(16); + /* data shared with signal handler (only used in seccomp mode) */ + short restart_wait; + unsigned int futex; + int signal; + unsigned short si_offset; + unsigned short mctx_offset; + + /* seccomp architecture specific state restore */ + struct stub_data_arch arch_data; + /* Stack for our signal handlers and for calling into . */ unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE); }; diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c index 796fc266d3bb..9041f6b6e28b 100644 --- a/arch/um/kernel/skas/stub.c +++ b/arch/um/kernel/skas/stub.c @@ -5,6 +5,9 @@ #include +#include +#include + static __always_inline int syscall_handler(struct stub_data *d) { int i; @@ -57,3 +60,49 @@ stub_syscall_handler(void) trap_myself(); } + +void __section(".__syscall_stub") +stub_signal_interrupt(int sig, siginfo_t *info, void *p) +{ + struct stub_data *d = get_stub_data(); + ucontext_t *uc = p; + long res; + + d->signal = sig; + d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0]; + d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0]; + +restart_wait: + d->futex = FUTEX_IN_KERN; + do { + res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAKE, 1); + } while (res == -EINTR); + do { + res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, + FUTEX_WAIT, FUTEX_IN_KERN, 0); + } while (res == -EINTR || d->futex == FUTEX_IN_KERN); + + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + /* Try running queued syscalls. */ + if (syscall_handler(d) < 0 || d->restart_wait) { + /* Report SIGSYS if we restart. */ + d->signal = SIGSYS; + d->restart_wait = 0; + goto restart_wait; + } + + /* Restore arch dependent state that is not part of the mcontext */ + stub_seccomp_restore_state(&d->arch_data); + + /* Return so that the host modified mcontext is restored. */ +} + +void __section(".__syscall_stub") +stub_signal_restorer(void) +{ + /* We must not have anything on the stack when doing rt_sigreturn */ + stub_syscall0(__NR_rt_sigreturn); +} diff --git a/arch/x86/um/shared/sysdep/stub-data.h b/arch/x86/um/shared/sysdep/stub-data.h new file mode 100644 index 000000000000..82b1b7f8ac3d --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub-data.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARCH_STUB_DATA_H +#define __ARCH_STUB_DATA_H + +#ifdef __i386__ +#include +#include + +struct stub_data_arch { + int sync; + struct user_desc tls[UM_KERN_GDT_ENTRY_TLS_ENTRIES]; +}; +#else +#define STUB_SYNC_FS_BASE (1 << 0) +#define STUB_SYNC_GS_BASE (1 << 1) +struct stub_data_arch { + int sync; + unsigned long fs_base; + unsigned long gs_base; +}; +#endif + +#endif /* __ARCH_STUB_DATA_H */ diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h index dc89f4423454..4fa58f5b4fca 100644 --- a/arch/x86/um/shared/sysdep/stub.h +++ b/arch/x86/um/shared/sysdep/stub.h @@ -13,3 +13,5 @@ extern void stub_segv_handler(int, siginfo_t *, void *); extern void stub_syscall_handler(void); +extern void stub_signal_interrupt(int, siginfo_t *, void *); +extern void stub_signal_restorer(void); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h index 390988132c0a..df568fc3ceb4 100644 --- a/arch/x86/um/shared/sysdep/stub_32.h +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -131,4 +131,17 @@ static __always_inline void *get_stub_data(void) "call *%%eax ;" \ :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ "i" (&fn)) + +static __always_inline void +stub_seccomp_restore_state(struct stub_data_arch *arch) +{ + for (int i = 0; i < sizeof(arch->tls) / sizeof(arch->tls[0]); i++) { + if (arch->sync & (1 << i)) + stub_syscall1(__NR_set_thread_area, + (unsigned long) &arch->tls[i]); + } + + arch->sync = 0; +} + #endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h index 294affbec742..9cfd31afa769 100644 --- a/arch/x86/um/shared/sysdep/stub_64.h +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -10,6 +10,7 @@ #include #include #include +#include #define STUB_MMAP_NR __NR_mmap #define MMAP_OFFSET(o) (o) @@ -134,4 +135,20 @@ static __always_inline void *get_stub_data(void) "call *%%rax ;" \ :: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE), \ "i" (&fn)) + +static __always_inline void +stub_seccomp_restore_state(struct stub_data_arch *arch) +{ + /* + * We could use _writefsbase_u64/_writegsbase_u64 if the host reports + * support in the hwcaps (HWCAP2_FSGSBASE). + */ + if (arch->sync & STUB_SYNC_FS_BASE) + stub_syscall2(__NR_arch_prctl, ARCH_SET_FS, arch->fs_base); + if (arch->sync & STUB_SYNC_GS_BASE) + stub_syscall2(__NR_arch_prctl, ARCH_SET_GS, arch->gs_base); + + arch->sync = 0; +} + #endif -- cgit v1.2.3 From b1e1bd2e69430445021394536740352be1b41cd0 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:48 +0200 Subject: um: Add helper functions to get/set state for SECCOMP When not using ptrace, we need to both save and restore registers through the mcontext as provided by the host kernel to our signal handlers. Add corresponding functions to store the state to an mcontext and helpers to access the mcontext of the subprocess through the stub data. Signed-off-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-4-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/os-Linux/mcontext.c | 218 ++++++++++++++++++++++++++++++++++- arch/x86/um/ptrace.c | 76 +++++++++--- arch/x86/um/shared/sysdep/mcontext.h | 9 ++ 3 files changed, 283 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index 37decaa74761..e661fdc44db9 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -#include #define __FRAME_OFFSETS +#include +#include +#include #include +#include #include #include #include @@ -18,6 +21,10 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY2(UESP, ESP); /* sic */ COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#undef COPY2 +#undef COPY +#undef COPY_SEG +#undef COPY_SEG_CPL3 #else #define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y] #define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X] @@ -29,6 +36,8 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY2(EFLAGS, EFL); COPY2(CS, CSGSFS); regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48; +#undef COPY2 +#undef COPY #endif } @@ -42,3 +51,210 @@ void mc_set_rip(void *_mc, void *target) mc->gregs[REG_RIP] = (unsigned long)target; #endif } + +/* Same thing, but the copy macros are turned around. */ +void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, int single_stepping) +{ +#ifdef __i386__ +#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X] +#define COPY(X) mc->gregs[REG_##X] = regs->gp[X] +#define COPY_SEG(X) mc->gregs[REG_##X] = regs->gp[X] & 0xffff; +#define COPY_SEG_CPL3(X) mc->gregs[REG_##X] = (regs->gp[X] & 0xffff) | 3; + COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS); + COPY(EDI); COPY(ESI); COPY(EBP); + COPY2(UESP, ESP); /* sic */ + COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); + COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#else +#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X/sizeof(unsigned long)] +#define COPY(X) mc->gregs[REG_##X] = regs->gp[X/sizeof(unsigned long)] + COPY(R8); COPY(R9); COPY(R10); COPY(R11); + COPY(R12); COPY(R13); COPY(R14); COPY(R15); + COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX); + COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP); + COPY(RIP); + COPY2(EFLAGS, EFL); + mc->gregs[REG_CSGSFS] = mc->gregs[REG_CSGSFS] & 0xffffffffffffl; + mc->gregs[REG_CSGSFS] |= (regs->gp[SS / sizeof(unsigned long)] & 0xffff) << 48; +#endif + + if (single_stepping) + mc->gregs[REG_EFL] |= X86_EFLAGS_TF; + else + mc->gregs[REG_EFL] &= ~X86_EFLAGS_TF; +} + +#ifdef CONFIG_X86_32 +struct _xstate_64 { + struct _fpstate_64 fpstate; + struct _header xstate_hdr; + struct _ymmh_state ymmh; + /* New processor state extensions go here: */ +}; + +/* Not quite the right structures as these contain more information */ +int um_i387_from_fxsr(struct _fpstate_32 *i387, + const struct _fpstate_64 *fxsave); +int um_fxsr_from_i387(struct _fpstate_64 *fxsave, + const struct _fpstate_32 *from); +#else +#define _xstate_64 _xstate +#endif + +static struct _fpstate *get_fpstate(struct stub_data *data, + mcontext_t *mcontext, + int *fp_size) +{ + struct _fpstate *res; + + /* Assume floating point registers are on the same page */ + res = (void *)(((unsigned long)mcontext->fpregs & + (UM_KERN_PAGE_SIZE - 1)) + + (unsigned long)&data->sigstack[0]); + + if ((void *)res + sizeof(struct _fpstate) > + (void *)data->sigstack + sizeof(data->sigstack)) + return NULL; + + if (res->sw_reserved.magic1 != FP_XSTATE_MAGIC1) { + *fp_size = sizeof(struct _fpstate); + } else { + char *magic2_addr; + + magic2_addr = (void *)res; + magic2_addr += res->sw_reserved.extended_size; + magic2_addr -= FP_XSTATE_MAGIC2_SIZE; + + /* We still need to be within our stack */ + if ((void *)magic2_addr > + (void *)data->sigstack + sizeof(data->sigstack)) + return NULL; + + /* If we do not read MAGIC2, then we did something wrong */ + if (*(__u32 *)magic2_addr != FP_XSTATE_MAGIC2) + return NULL; + + /* Remove MAGIC2 from the size, we do not save/restore it */ + *fp_size = res->sw_reserved.extended_size - + FP_XSTATE_MAGIC2_SIZE; + } + + return res; +} + +int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + unsigned long *fp_size_out) +{ + mcontext_t *mcontext; + struct _fpstate *fpstate_stub; + struct _xstate_64 *xstate_stub; + int fp_size, xstate_size; + + /* mctx_offset is verified by wait_stub_done_seccomp */ + mcontext = (void *)&data->sigstack[data->mctx_offset]; + + get_regs_from_mc(regs, mcontext); + + fpstate_stub = get_fpstate(data, mcontext, &fp_size); + if (!fpstate_stub) + return -EINVAL; + +#ifdef CONFIG_X86_32 + xstate_stub = (void *)&fpstate_stub->_fxsr_env; + xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env); +#else + xstate_stub = (void *)fpstate_stub; + xstate_size = fp_size; +#endif + + if (fp_size_out) + *fp_size_out = xstate_size; + + if (xstate_size > host_fp_size) + return -ENOSPC; + + memcpy(®s->fp, xstate_stub, xstate_size); + + /* We do not need to read the x86_64 FS_BASE/GS_BASE registers as + * we do not permit userspace to set them directly. + */ + +#ifdef CONFIG_X86_32 + /* Read the i387 legacy FP registers */ + if (um_fxsr_from_i387((void *)®s->fp, fpstate_stub)) + return -EINVAL; +#endif + + return 0; +} + +/* Copied because we cannot include regset.h here. */ +struct task_struct; +struct user_regset; +struct membuf { + void *p; + size_t left; +}; + +int fpregs_legacy_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to); + +int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + int single_stepping) +{ + mcontext_t *mcontext; + struct _fpstate *fpstate_stub; + struct _xstate_64 *xstate_stub; + int fp_size, xstate_size; + + /* mctx_offset is verified by wait_stub_done_seccomp */ + mcontext = (void *)&data->sigstack[data->mctx_offset]; + + if ((unsigned long)mcontext < (unsigned long)data->sigstack || + (unsigned long)mcontext > + (unsigned long) data->sigstack + + sizeof(data->sigstack) - sizeof(*mcontext)) + return -EINVAL; + + get_mc_from_regs(regs, mcontext, single_stepping); + + fpstate_stub = get_fpstate(data, mcontext, &fp_size); + if (!fpstate_stub) + return -EINVAL; + +#ifdef CONFIG_X86_32 + xstate_stub = (void *)&fpstate_stub->_fxsr_env; + xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env); +#else + xstate_stub = (void *)fpstate_stub; + xstate_size = fp_size; +#endif + + memcpy(fpstate_stub, ®s->fp, fp_size); + +#ifdef __i386__ + /* + * On x86, the GDT entries are updated by arch_set_tls. + */ + + /* Store the i387 legacy FP registers which the host will use */ + if (um_i387_from_fxsr(fpstate_stub, (void *)®s->fp)) + return -EINVAL; +#else + /* + * On x86_64, we need to sync the FS_BASE/GS_BASE registers using the + * arch specific data. + */ + if (data->arch_data.fs_base != regs->gp[FS_BASE / sizeof(unsigned long)]) { + data->arch_data.fs_base = regs->gp[FS_BASE / sizeof(unsigned long)]; + data->arch_data.sync |= STUB_SYNC_FS_BASE; + } + if (data->arch_data.gs_base != regs->gp[GS_BASE / sizeof(unsigned long)]) { + data->arch_data.gs_base = regs->gp[GS_BASE / sizeof(unsigned long)]; + data->arch_data.sync |= STUB_SYNC_GS_BASE; + } +#endif + + return 0; +} diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 57c504fd5626..3275870330fe 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -25,7 +25,8 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd) return tmp; } -static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +static inline unsigned long +twd_fxsr_to_i387(const struct user_fxsr_struct *fxsave) { struct _fpxreg *st = NULL; unsigned long twd = (unsigned long) fxsave->twd; @@ -69,12 +70,16 @@ static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) return ret; } -/* Get/set the old 32bit i387 registers (pre-FPX) */ -static int fpregs_legacy_get(struct task_struct *target, - const struct user_regset *regset, - struct membuf to) +/* + * Get/set the old 32bit i387 registers (pre-FPX) + * + * We provide simple wrappers for mcontext.c, they are only defined locally + * because mcontext.c is userspace facing and needs to a different definition + * of the structures. + */ +static int _um_i387_from_fxsr(struct membuf to, + const struct user_fxsr_struct *fxsave) { - struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; int i; membuf_store(&to, (unsigned long)fxsave->cwd | 0xffff0000ul); @@ -91,23 +96,36 @@ static int fpregs_legacy_get(struct task_struct *target, return 0; } -static int fpregs_legacy_set(struct task_struct *target, +int um_i387_from_fxsr(struct user_i387_struct *i387, + const struct user_fxsr_struct *fxsave); + +int um_i387_from_fxsr(struct user_i387_struct *i387, + const struct user_fxsr_struct *fxsave) +{ + struct membuf to = { + .p = i387, + .left = sizeof(*i387), + }; + + return _um_i387_from_fxsr(to, fxsave); +} + +static int fpregs_legacy_get(struct task_struct *target, const struct user_regset *regset, - unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + struct membuf to) { struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; - const struct user_i387_struct *from; - struct user_i387_struct buf; - int i; - if (ubuf) { - if (copy_from_user(&buf, ubuf, sizeof(buf))) - return -EFAULT; - from = &buf; - } else { - from = kbuf; - } + return _um_i387_from_fxsr(to, fxsave); +} + +int um_fxsr_from_i387(struct user_fxsr_struct *fxsave, + const struct user_i387_struct *from); + +int um_fxsr_from_i387(struct user_fxsr_struct *fxsave, + const struct user_i387_struct *from) +{ + int i; fxsave->cwd = (unsigned short)(from->cwd & 0xffff); fxsave->swd = (unsigned short)(from->swd & 0xffff); @@ -125,6 +143,26 @@ static int fpregs_legacy_set(struct task_struct *target, return 0; } + +static int fpregs_legacy_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp; + const struct user_i387_struct *from; + struct user_i387_struct buf; + + if (ubuf) { + if (copy_from_user(&buf, ubuf, sizeof(buf))) + return -EFAULT; + from = &buf; + } else { + from = kbuf; + } + + return um_fxsr_from_i387(fxsave, &buf); +} #endif static int genregs_get(struct task_struct *target, diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h index b724c54da316..6fe490cc5b98 100644 --- a/arch/x86/um/shared/sysdep/mcontext.h +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -6,7 +6,16 @@ #ifndef __SYS_SIGCONTEXT_X86_H #define __SYS_SIGCONTEXT_X86_H +#include + extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *); +extern void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, + int single_stepping); + +extern int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + unsigned long *fp_size_out); +extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, + int single_stepping); #ifdef __i386__ -- cgit v1.2.3 From 8420e08fe3a594b6ffa07705ac270faa2ed452c5 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:49 +0200 Subject: um: Track userspace children dying in SECCOMP mode When in seccomp mode, we would hang forever on the futex if a child has died unexpectedly. In contrast, ptrace mode will notice it and kill the corresponding thread when it fails to run it. Fix this issue using a new IRQ that is fired after a SIGCHLD and keeping an (internal) list of all MMs. In the IRQ handler, find the affected MM and set its PID to -1 as well as the futex variable to FUTEX_IN_KERN. This, together with futex returning -EINTR after the signal is sufficient to implement a race-free detection of a child dying. Note that this also enables IRQ handling while starting a userspace process. This should be safe and SECCOMP requires the IRQ in case the process does not come up properly. Signed-off-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-5-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/asm/irq.h | 5 ++- arch/um/include/asm/mmu.h | 3 ++ arch/um/include/shared/irq_user.h | 2 + arch/um/include/shared/os.h | 1 + arch/um/include/shared/skas/mm_id.h | 2 + arch/um/include/shared/skas/skas.h | 1 + arch/um/kernel/irq.c | 6 +++ arch/um/kernel/skas/mmu.c | 82 ++++++++++++++++++++++++++++++++++--- arch/um/os-Linux/process.c | 31 ++++++++++++++ arch/um/os-Linux/signal.c | 19 ++++++++- arch/um/os-Linux/skas/process.c | 1 + 11 files changed, 145 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h index 749dfe8512e8..36dbedd1af48 100644 --- a/arch/um/include/asm/irq.h +++ b/arch/um/include/asm/irq.h @@ -13,17 +13,18 @@ #define TELNETD_IRQ 8 #define XTERM_IRQ 9 #define RANDOM_IRQ 10 +#define SIGCHLD_IRQ 11 #ifdef CONFIG_UML_NET_VECTOR -#define VECTOR_BASE_IRQ (RANDOM_IRQ + 1) +#define VECTOR_BASE_IRQ (SIGCHLD_IRQ + 1) #define VECTOR_IRQ_SPACE 8 #define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ) #else -#define UM_FIRST_DYN_IRQ (RANDOM_IRQ + 1) +#define UM_FIRST_DYN_IRQ (SIGCHLD_IRQ + 1) #endif diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h index a3eaca41ff61..4d0e4239f3cc 100644 --- a/arch/um/include/asm/mmu.h +++ b/arch/um/include/asm/mmu.h @@ -6,11 +6,14 @@ #ifndef __ARCH_UM_MMU_H #define __ARCH_UM_MMU_H +#include "linux/types.h" #include typedef struct mm_context { struct mm_id id; + struct list_head list; + /* Address range in need of a TLB sync */ unsigned long sync_tlb_range_from; unsigned long sync_tlb_range_to; diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index 88835b52ae2b..746abc24a5d5 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -17,6 +17,8 @@ enum um_irq_type { struct siginfo; extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, void *mc); +extern void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); extern void deactivate_fd(int fd, int irqnum); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index e63a74a5ff19..3046728ec42e 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -197,6 +197,7 @@ extern int create_mem_file(unsigned long long len); extern void report_enomem(void); /* process.c */ +pid_t os_reap_child(void); extern void os_alarm_process(int pid); extern void os_kill_process(int pid, int reap_child); extern void os_kill_ptraced_process(int pid, int reap_child); diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 140388c282f6..0654c57bb28e 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -14,4 +14,6 @@ struct mm_id { void __switch_mm(struct mm_id *mm_idp); +void notify_mm_kill(int pid); + #endif diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h index 85c50122ab98..7d1de4cab551 100644 --- a/arch/um/include/shared/skas/skas.h +++ b/arch/um/include/shared/skas/skas.h @@ -8,6 +8,7 @@ #include +extern int using_seccomp; extern int userspace_pid[]; extern void new_thread_handler(void); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index abe8f30a521c..f1787be3983c 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -690,3 +690,9 @@ void __init init_IRQ(void) /* Initialize EPOLL Loop */ os_setup_epoll(); } + +extern void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) +{ + do_IRQ(SIGCHLD_IRQ, regs); +} diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 0eb5a1d3ba70..1e146a0f9549 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -19,6 +20,9 @@ /* Ensure the stub_data struct covers the allocated area */ static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE); +spinlock_t mm_list_lock; +struct list_head mm_list; + int init_new_context(struct task_struct *task, struct mm_struct *mm) { struct mm_id *new_id = &mm->context.id; @@ -31,10 +35,12 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) new_id->stack = stack; - block_signals_trace(); - new_id->pid = start_userspace(stack); - unblock_signals_trace(); + scoped_guard(spinlock_irqsave, &mm_list_lock) { + /* Insert into list, used for lookups when the child dies */ + list_add(&mm->context.list, &mm_list); + } + new_id->pid = start_userspace(stack); if (new_id->pid < 0) { ret = new_id->pid; goto out_free; @@ -60,13 +66,79 @@ void destroy_context(struct mm_struct *mm) * zero, resulting in a kill(0), which will result in the * whole UML suddenly dying. Also, cover negative and * 1 cases, since they shouldn't happen either. + * + * Negative cases happen if the child died unexpectedly. */ - if (mmu->id.pid < 2) { + if (mmu->id.pid >= 0 && mmu->id.pid < 2) { printk(KERN_ERR "corrupt mm_context - pid = %d\n", mmu->id.pid); return; } - os_kill_ptraced_process(mmu->id.pid, 1); + + if (mmu->id.pid > 0) { + os_kill_ptraced_process(mmu->id.pid, 1); + mmu->id.pid = -1; + } free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); + + guard(spinlock_irqsave)(&mm_list_lock); + + list_del(&mm->context.list); +} + +static irqreturn_t mm_sigchld_irq(int irq, void* dev) +{ + struct mm_context *mm_context; + pid_t pid; + + guard(spinlock)(&mm_list_lock); + + while ((pid = os_reap_child()) > 0) { + /* + * A child died, check if we have an MM with the PID. This is + * only relevant in SECCOMP mode (as ptrace will fail anyway). + * + * See wait_stub_done_seccomp for more details. + */ + list_for_each_entry(mm_context, &mm_list, list) { + if (mm_context->id.pid == pid) { + struct stub_data *stub_data; + printk("Unexpectedly lost MM child! Affected tasks will segfault."); + + /* Marks the MM as dead */ + mm_context->id.pid = -1; + + /* + * NOTE: If SMP is implemented, a futex_wake + * needs to be added here. + */ + stub_data = (void *)mm_context->id.stack; + stub_data->futex = FUTEX_IN_KERN; + + /* + * NOTE: Currently executing syscalls by + * affected tasks may finish normally. + */ + break; + } + } + } + + return IRQ_HANDLED; +} + +static int __init init_child_tracking(void) +{ + int err; + + spin_lock_init(&mm_list_lock); + INIT_LIST_HEAD(&mm_list); + + err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL); + if (err < 0) + panic("Failed to register SIGCHLD IRQ: %d", err); + + return 0; } +early_initcall(init_child_tracking) diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 184566edeee9..00b49e90d05f 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -18,17 +18,29 @@ #include #include #include +#include void os_alarm_process(int pid) { + if (pid <= 0) + return; + kill(pid, SIGALRM); } void os_kill_process(int pid, int reap_child) { + if (pid <= 0) + return; + + /* Block signals until child is reaped */ + block_signals(); + kill(pid, SIGKILL); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); + + unblock_signals(); } /* Kill off a ptraced child by all means available. kill it normally first, @@ -38,11 +50,27 @@ void os_kill_process(int pid, int reap_child) void os_kill_ptraced_process(int pid, int reap_child) { + if (pid <= 0) + return; + + /* Block signals until child is reaped */ + block_signals(); + kill(pid, SIGKILL); ptrace(PTRACE_KILL, pid); ptrace(PTRACE_CONT, pid); if (reap_child) CATCH_EINTR(waitpid(pid, NULL, __WALL)); + + unblock_signals(); +} + +pid_t os_reap_child(void) +{ + int status; + + /* Try to reap a child */ + return waitpid(-1, &status, WNOHANG); } /* Don't use the glibc version, which caches the result in TLS. It misses some @@ -151,6 +179,9 @@ void init_new_thread_signals(void) set_handler(SIGBUS); signal(SIGHUP, SIG_IGN); set_handler(SIGIO); + /* We (currently) only use the child reaper IRQ in seccomp mode */ + if (using_seccomp) + set_handler(SIGCHLD); signal(SIGWINCH, SIG_IGN); } diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index e71e5b4878d1..11f07f498270 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -29,6 +29,7 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = [SIGBUS] = relay_signal, [SIGSEGV] = segv_handler, [SIGIO] = sigio_handler, + [SIGCHLD] = sigchld_handler, }; static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) @@ -44,7 +45,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) } /* enable signals if sig isn't IRQ signal */ - if ((sig != SIGIO) && (sig != SIGWINCH)) + if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD)) unblock_signals_trace(); (*sig_info[sig])(sig, si, &r, mc); @@ -64,6 +65,9 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) #define SIGALRM_BIT 1 #define SIGALRM_MASK (1 << SIGALRM_BIT) +#define SIGCHLD_BIT 2 +#define SIGCHLD_MASK (1 << SIGCHLD_BIT) + int signals_enabled; #if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT) static int signals_blocked, signals_blocked_pending; @@ -102,6 +106,11 @@ static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc) return; } + if (!enabled && (sig == SIGCHLD)) { + signals_pending |= SIGCHLD_MASK; + return; + } + block_signals_trace(); sig_handler_common(sig, si, mc); @@ -181,6 +190,8 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = { [SIGIO] = sig_handler, [SIGWINCH] = sig_handler, + /* SIGCHLD is only actually registered in seccomp mode. */ + [SIGCHLD] = sig_handler, [SIGALRM] = timer_alarm_handler, [SIGUSR1] = sigusr1_handler, @@ -309,6 +320,12 @@ void unblock_signals(void) if (save_pending & SIGIO_MASK) sig_handler_common(SIGIO, NULL, NULL); + if (save_pending & SIGCHLD_MASK) { + struct uml_pt_regs regs = {}; + + sigchld_handler(SIGCHLD, NULL, ®s, NULL); + } + /* Do not reenter the handler */ if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK))) diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 97c2f964f5fc..150e4f6ba633 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -309,6 +309,7 @@ static int __init init_stub_exe_fd(void) } __initcall(init_stub_exe_fd); +int using_seccomp; int userspace_pid[NR_CPUS]; /** -- cgit v1.2.3 From 406d17c6c370a33cfb54067d9e205305293d4604 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:50 +0200 Subject: um: Implement kernel side of SECCOMP based process handling This adds the kernel side of the seccomp based process handling. Co-authored-by: Johannes Berg Signed-off-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-6-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/common-offsets.h | 2 + arch/um/include/shared/os.h | 2 +- arch/um/include/shared/skas/stub-data.h | 5 +- arch/um/kernel/skas/mmu.c | 6 +- arch/um/kernel/skas/stub_exe.c | 141 ++++++++++- arch/um/os-Linux/internal.h | 5 +- arch/um/os-Linux/skas/mem.c | 37 +-- arch/um/os-Linux/skas/process.c | 374 +++++++++++++++++++++-------- arch/x86/um/shared/sysdep/kernel-offsets.h | 2 + arch/x86/um/tls_32.c | 23 +- 10 files changed, 459 insertions(+), 138 deletions(-) (limited to 'arch') diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 93e7097a2922..8ca66a1918c3 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -16,3 +16,5 @@ DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC); DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES); + +DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE); diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 3046728ec42e..b35cc8ce333b 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -286,7 +286,7 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len); /* skas/process.c */ extern int is_skas_winch(int pid, int fd, void *data); -extern int start_userspace(unsigned long stub_stack); +extern int start_userspace(struct mm_id *mm_id); extern void userspace(struct uml_pt_regs *regs); extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void)); extern void switch_threads(jmp_buf *me, jmp_buf *you); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 81ac2cd12112..675f1a0a1390 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -17,6 +17,8 @@ #define FUTEX_IN_KERN 1 struct stub_init_data { + int seccomp; + unsigned long stub_start; int stub_code_fd; @@ -24,7 +26,8 @@ struct stub_init_data { int stub_data_fd; unsigned long stub_data_offset; - unsigned long segv_handler; + unsigned long signal_handler; + unsigned long signal_restorer; }; #define STUB_NEXT_SYSCALL(s) \ diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 1e146a0f9549..87a18ae4da19 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -40,11 +40,9 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm) list_add(&mm->context.list, &mm_list); } - new_id->pid = start_userspace(stack); - if (new_id->pid < 0) { - ret = new_id->pid; + ret = start_userspace(new_id); + if (ret < 0) goto out_free; - } /* Ensure the new MM is clean and nothing unwanted is mapped */ unmap(new_id, 0, STUB_START); diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c index 23c99b285e82..f40f2332b676 100644 --- a/arch/um/kernel/skas/stub_exe.c +++ b/arch/um/kernel/skas/stub_exe.c @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include void _start(void); @@ -25,8 +28,6 @@ noinline static void real_init(void) } sa = { /* Need to set SA_RESTORER (but the handler never returns) */ .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000, - /* no need to mask any signals */ - .sa_mask = 0, }; /* set a nice name */ @@ -35,6 +36,9 @@ noinline static void real_init(void) /* Make sure this process dies if the kernel dies */ stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL); + /* Needed in SECCOMP mode (and safe to do anyway) */ + stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + /* read information from STDIN and close it */ res = stub_syscall3(__NR_read, 0, (unsigned long)&init_data, sizeof(init_data)); @@ -63,18 +67,133 @@ noinline static void real_init(void) stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); - /* register SIGSEGV handler */ - sa.sa_handler_ = (void *) init_data.segv_handler; - res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, - sizeof(sa.sa_mask)); - if (res != 0) - stub_syscall1(__NR_exit, 13); + /* register signal handlers */ + sa.sa_handler_ = (void *) init_data.signal_handler; + sa.sa_restorer = (void *) init_data.signal_restorer; + if (!init_data.seccomp) { + /* In ptrace mode, the SIGSEGV handler never returns */ + sa.sa_mask = 0; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 13); + } else { + /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ + sa.sa_mask = ~0ULL; + + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 14); + + res = stub_syscall4(__NR_rt_sigaction, SIGSYS, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 15); + + res = stub_syscall4(__NR_rt_sigaction, SIGALRM, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 16); + + res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 17); + + res = stub_syscall4(__NR_rt_sigaction, SIGILL, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 18); + + res = stub_syscall4(__NR_rt_sigaction, SIGFPE, + (unsigned long)&sa, 0, sizeof(sa.sa_mask)); + if (res != 0) + stub_syscall1(__NR_exit, 19); + } + + /* + * If in seccomp mode, install the SECCOMP filter and trigger a syscall. + * Otherwise set PTRACE_TRACEME and do a SIGSTOP. + */ + if (init_data.seccomp) { + struct sock_filter filter[] = { +#if __BITS_PER_LONG > 32 + /* [0] Load upper 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer) + 4)), + + /* [1] Jump forward 3 instructions if the upper address is not identical */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3), +#endif + /* [2] Load lower 32bit of instruction pointer from seccomp_data */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + (offsetof(struct seccomp_data, instruction_pointer))), + + /* [3] Mask out lower bits */ + BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000), + + /* [4] Jump to [6] if the lower bits are not on the expected page */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0), + + /* [5] Trap call, allow */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + + /* [6,7] Check architecture */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, arch)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, + UM_SECCOMP_ARCH_NATIVE, 1, 0), + + /* [8] Kill (for architecture check) */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [9] Load syscall number */ + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + + /* [10-14] Check against permitted syscalls */ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, + 5, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, + 4, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap, + 3, 0), +#ifdef __i386__ + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area, + 2, 0), +#else + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl, + 2, 0), +#endif + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, + 1, 0), + + /* [15] Not one of the permitted syscalls */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), + + /* [16] Permitted call for the stub */ + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + }; + struct sock_fprog prog = { + .len = sizeof(filter) / sizeof(filter[0]), + .filter = filter, + }; + + if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, + (unsigned long)&prog) != 0) + stub_syscall1(__NR_exit, 20); - stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); + /* Fall through, the exit syscall will cause SIGSYS */ + } else { + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0); - stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP); + } - stub_syscall1(__NR_exit, 14); + stub_syscall1(__NR_exit, 30); __builtin_unreachable(); } diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h index 317fca190c2b..5d8d3b0817a9 100644 --- a/arch/um/os-Linux/internal.h +++ b/arch/um/os-Linux/internal.h @@ -2,6 +2,9 @@ #ifndef __UM_OS_LINUX_INTERNAL_H #define __UM_OS_LINUX_INTERNAL_H +#include +#include + /* * elf_aux.c */ @@ -16,5 +19,5 @@ void check_tmpexec(void); * skas/process.c */ void wait_stub_done(int pid); - +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys); #endif /* __UM_OS_LINUX_INTERNAL_H */ diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index d7f1814b0e5a..31bf3a52047a 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -80,27 +80,32 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) int n, i; int err, pid = mm_idp->pid; - n = ptrace_setregs(pid, syscall_regs); - if (n < 0) { - printk(UM_KERN_ERR "Registers - \n"); - for (i = 0; i < MAX_REG_NR; i++) - printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); - panic("%s : PTRACE_SETREGS failed, errno = %d\n", - __func__, -n); - } - /* Inform process how much we have filled in. */ proc_data->syscall_data_len = mm_idp->syscall_data_len; - err = ptrace(PTRACE_CONT, pid, 0, 0); - if (err) - panic("Failed to continue stub, pid = %d, errno = %d\n", pid, - errno); - - wait_stub_done(pid); + if (using_seccomp) { + proc_data->restart_wait = 1; + wait_stub_done_seccomp(mm_idp, 0, 1); + } else { + n = ptrace_setregs(pid, syscall_regs); + if (n < 0) { + printk(UM_KERN_ERR "Registers -\n"); + for (i = 0; i < MAX_REG_NR; i++) + printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]); + panic("%s : PTRACE_SETREGS failed, errno = %d\n", + __func__, -n); + } + + err = ptrace(PTRACE_CONT, pid, 0, 0); + if (err) + panic("Failed to continue stub, pid = %d, errno = %d\n", + pid, errno); + + wait_stub_done(pid); + } /* - * proc_data->err will be non-zero if there was an (unexpected) error. + * proc_data->err will be negative if there was an (unexpected) error. * In that case, syscall_data_len points to the last executed syscall, * otherwise it will be zero (but we do not need to rely on that). */ diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index 150e4f6ba633..e1aaa144e273 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de) * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -25,8 +26,11 @@ #include #include #include +#include +#include #include #include +#include #include "../internal.h" int is_skas_winch(int pid, int fd, void *data) @@ -142,6 +146,73 @@ bad_wait: fatal_sigsegv(); } +void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) +{ + struct stub_data *data = (void *)mm_idp->stack; + int ret; + + do { + if (!running) { + data->signal = 0; + data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); + } + + do { + /* + * We need to check whether the child is still alive + * before and after the FUTEX_WAIT call. Before, in + * case it just died but we still updated data->futex + * to FUTEX_IN_CHILD. And after, in case it died while + * we were waiting (and SIGCHLD woke us up, see the + * IRQ handler in mmu.c). + * + * Either way, if PID is negative, then we have no + * choice but to kill the task. + */ + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + ret = syscall(__NR_futex, &data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, + NULL, NULL, 0); + if (ret < 0 && errno != EINTR && errno != EAGAIN) { + printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (data->futex == FUTEX_IN_CHILD); + + if (__READ_ONCE(mm_idp->pid) < 0) + goto out_kill; + + running = 0; + + /* We may receive a SIGALRM before SIGSYS, iterate again. */ + } while (wait_sigsys && data->signal == SIGALRM); + + if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) { + printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__); + goto out_kill; + } + + if (wait_sigsys && data->signal != SIGSYS) { + printk(UM_KERN_ERR "%s : expected SIGSYS but got %d", + __func__, data->signal); + goto out_kill; + } + + return; + +out_kill: + printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n", + __func__, mm_idp->pid, errno); + /* This is not true inside start_userspace */ + if (current_mm_id() == mm_idp) + fatal_sigsegv(); +} + extern unsigned long current_stub_stack(void); static void get_skas_faultinfo(int pid, struct faultinfo *fi) @@ -185,14 +256,26 @@ static int userspace_tramp(void *stack) int pipe_fds[2]; unsigned long long offset; struct stub_init_data init_data = { + .seccomp = using_seccomp, .stub_start = STUB_START, - .segv_handler = STUB_CODE + - (unsigned long) stub_segv_handler - - (unsigned long) __syscall_stub_start, }; struct iomem_region *iomem; int ret; + if (using_seccomp) { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_signal_interrupt - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = STUB_CODE + + (unsigned long) stub_signal_restorer - + (unsigned long) __syscall_stub_start; + } else { + init_data.signal_handler = STUB_CODE + + (unsigned long) stub_segv_handler - + (unsigned long) __syscall_stub_start; + init_data.signal_restorer = 0; + } + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset); init_data.stub_code_offset = MMAP_OFFSET(offset); @@ -323,8 +406,9 @@ int userspace_pid[NR_CPUS]; * when negative: an error number. * FIXME: can PIDs become negative?! */ -int start_userspace(unsigned long stub_stack) +int start_userspace(struct mm_id *mm_id) { + struct stub_data *proc_data = (void *)mm_id->stack; void *stack; unsigned long sp; int pid, status, n, err; @@ -343,10 +427,13 @@ int start_userspace(unsigned long stub_stack) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; + if (using_seccomp) + proc_data->futex = FUTEX_IN_CHILD; + /* clone into new userspace process */ pid = clone(userspace_tramp, (void *) sp, CLONE_VFORK | CLONE_VM | SIGCHLD, - (void *)stub_stack); + (void *)mm_id->stack); if (pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", @@ -354,29 +441,34 @@ int start_userspace(unsigned long stub_stack) return err; } - do { - CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL)); - if (n < 0) { + if (using_seccomp) { + wait_stub_done_seccomp(mm_id, 1, 1); + } else { + do { + CATCH_EINTR(n = waitpid(pid, &status, + WUNTRACED | __WALL)); + if (n < 0) { + err = -errno; + printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + __func__, errno); + goto out_kill; + } + } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); + + if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { + err = -EINVAL; + printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", + __func__, status); + goto out_kill; + } + + if (ptrace(PTRACE_SETOPTIONS, pid, NULL, + (void *) PTRACE_O_TRACESYSGOOD) < 0) { err = -errno; - printk(UM_KERN_ERR "%s : wait failed, errno = %d\n", + printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", __func__, errno); goto out_kill; } - } while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM)); - - if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) { - err = -EINVAL; - printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n", - __func__, status); - goto out_kill; - } - - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, - (void *) PTRACE_O_TRACESYSGOOD) < 0) { - err = -errno; - printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", - __func__, errno); - goto out_kill; } if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) { @@ -386,6 +478,8 @@ int start_userspace(unsigned long stub_stack) goto out_kill; } + mm_id->pid = pid; + return pid; out_kill: @@ -399,7 +493,9 @@ extern unsigned long tt_extra_sched_jiffies; void userspace(struct uml_pt_regs *regs) { int err, status, op, pid = userspace_pid[0]; - siginfo_t si; + siginfo_t si_ptrace; + siginfo_t *si; + int sig; /* Handle any immediate reschedules or signals */ interrupt_end(); @@ -432,104 +528,182 @@ void userspace(struct uml_pt_regs *regs) current_mm_sync(); - /* Flush out any pending syscalls */ - err = syscall_stub_flush(current_mm_id()); - if (err) { - if (err == -ENOMEM) - report_enomem(); + if (using_seccomp) { + struct mm_id *mm_id = current_mm_id(); + struct stub_data *proc_data = (void *) mm_id->stack; + int ret; - printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", - __func__, -err); - fatal_sigsegv(); - } + ret = set_stub_state(regs, proc_data, singlestepping()); + if (ret) { + printk(UM_KERN_ERR "%s - failed to set regs: %d", + __func__, ret); + fatal_sigsegv(); + } - /* - * This can legitimately fail if the process loads a - * bogus value into a segment register. It will - * segfault and PTRACE_GETREGS will read that value - * out of the process. However, PTRACE_SETREGS will - * fail. In this case, there is nothing to do but - * just kill the process. - */ - if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* Must have been reset by the syscall caller */ + if (proc_data->restart_wait != 0) + panic("Programming error: Flag to only run syscalls in child was not cleared!"); + + /* Mark pending syscalls for flushing */ + proc_data->syscall_data_len = mm_id->syscall_data_len; + mm_id->syscall_data_len = 0; + + proc_data->signal = 0; + proc_data->futex = FUTEX_IN_CHILD; + CATCH_EINTR(syscall(__NR_futex, &proc_data->futex, + FUTEX_WAKE, 1, NULL, NULL, 0)); + do { + ret = syscall(__NR_futex, &proc_data->futex, + FUTEX_WAIT, FUTEX_IN_CHILD, NULL, NULL, 0); + } while ((ret == -1 && errno == EINTR) || + proc_data->futex == FUTEX_IN_CHILD); + + sig = proc_data->signal; + + if (sig == SIGTRAP && proc_data->err != 0) { + printk(UM_KERN_ERR "%s - Error flushing stub syscalls", + __func__); + syscall_stub_dump_error(mm_id); + fatal_sigsegv(); + } - if (put_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + ret = get_stub_state(regs, proc_data, NULL); + if (ret) { + printk(UM_KERN_ERR "%s - failed to get regs: %d", + __func__, ret); + fatal_sigsegv(); + } - if (singlestepping()) - op = PTRACE_SYSEMU_SINGLESTEP; - else - op = PTRACE_SYSEMU; + if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si)) + panic("%s - Invalid siginfo offset from child", + __func__); + si = (void *)&proc_data->sigstack[proc_data->si_offset]; - if (ptrace(op, pid, 0, 0)) { - printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", - __func__, op, errno); - fatal_sigsegv(); - } + regs->is_user = 1; - CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); - if (err < 0) { - printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* Fill in ORIG_RAX and extract fault information */ + PT_SYSCALL_NR(regs->gp) = si->si_syscall; + if (sig == SIGSEGV) { + mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset]; - regs->is_user = 1; - if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { - printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext); + } + } else { + /* Flush out any pending syscalls */ + err = syscall_stub_flush(current_mm_id()); + if (err) { + if (err == -ENOMEM) + report_enomem(); + + printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d", + __func__, -err); + fatal_sigsegv(); + } - if (get_fp_registers(pid, regs->fp)) { - printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", - __func__, errno); - fatal_sigsegv(); - } + /* + * This can legitimately fail if the process loads a + * bogus value into a segment register. It will + * segfault and PTRACE_GETREGS will read that value + * out of the process. However, PTRACE_SETREGS will + * fail. In this case, there is nothing to do but + * just kill the process. + */ + if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } - UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + if (put_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } - if (WIFSTOPPED(status)) { - int sig = WSTOPSIG(status); + if (singlestepping()) + op = PTRACE_SYSEMU_SINGLESTEP; + else + op = PTRACE_SYSEMU; - /* These signal handlers need the si argument. - * The SIGIO and SIGALARM handlers which constitute the - * majority of invocations, do not use it. - */ - switch (sig) { - case SIGSEGV: - case SIGTRAP: - case SIGILL: - case SIGBUS: - case SIGFPE: - case SIGWINCH: - ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si); - break; + if (ptrace(op, pid, 0, 0)) { + printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n", + __func__, op, errno); + fatal_sigsegv(); + } + + CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL)); + if (err < 0) { + printk(UM_KERN_ERR "%s - wait failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); } + regs->is_user = 1; + if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) { + printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (get_fp_registers(pid, regs->fp)) { + printk(UM_KERN_ERR "%s - get_fp_registers failed, errno = %d\n", + __func__, errno); + fatal_sigsegv(); + } + + if (WIFSTOPPED(status)) { + sig = WSTOPSIG(status); + + /* + * These signal handlers need the si argument + * and SIGSEGV needs the faultinfo. + * The SIGIO and SIGALARM handlers which constitute + * the majority of invocations, do not use it. + */ + switch (sig) { + case SIGSEGV: + get_skas_faultinfo(pid, + ®s->faultinfo); + fallthrough; + case SIGTRAP: + case SIGILL: + case SIGBUS: + case SIGFPE: + case SIGWINCH: + ptrace(PTRACE_GETSIGINFO, pid, 0, + (struct siginfo *)&si_ptrace); + si = &si_ptrace; + break; + default: + si = NULL; + break; + } + } else { + sig = 0; + } + } + + UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */ + + if (sig) { switch (sig) { case SIGSEGV: - get_skas_faultinfo(pid, ®s->faultinfo); - - if (PTRACE_FULL_FAULTINFO) - (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, + if (using_seccomp || PTRACE_FULL_FAULTINFO) + (*sig_info[SIGSEGV])(SIGSEGV, + (struct siginfo *)si, regs, NULL); else segv(regs->faultinfo, 0, 1, NULL, NULL); + break; + case SIGSYS: + handle_syscall(regs); break; case SIGTRAP + 0x80: handle_trap(pid, regs); break; case SIGTRAP: - relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL); + relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL); break; case SIGALRM: break; @@ -539,7 +713,7 @@ void userspace(struct uml_pt_regs *regs) case SIGFPE: case SIGWINCH: block_signals_trace(); - (*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL); + (*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL); unblock_signals_trace(); break; default: diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 48de3a71f845..6fd1ed400399 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include /* workaround for a warning with -Wmissing-prototypes */ void foo(void); diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index fbb129023080..21cbb70cf771 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -12,6 +12,7 @@ #include #include #include +#include /* * If needed we can detect when it's uninitialized. @@ -21,13 +22,27 @@ static int host_supports_tls = -1; int host_gdt_entry_tls_min; -static int do_set_thread_area(struct user_desc *info) +static int do_set_thread_area(struct task_struct* task, struct user_desc *info) { int ret; u32 cpu; + if (info->entry_number < host_gdt_entry_tls_min || + info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES) + return -EINVAL; + + if (using_seccomp) { + int idx = info->entry_number - host_gdt_entry_tls_min; + struct stub_data *data = (void *)task->mm->context.id.stack; + + data->arch_data.tls[idx] = *info; + data->arch_data.sync |= BIT(idx); + + return 0; + } + cpu = get_cpu(); - ret = os_set_thread_area(info, userspace_pid[cpu]); + ret = os_set_thread_area(info, task->mm->context.id.pid); put_cpu(); if (ret) @@ -97,7 +112,7 @@ static int load_TLS(int flags, struct task_struct *to) if (!(flags & O_FORCE) && curr->flushed) continue; - ret = do_set_thread_area(&curr->tls); + ret = do_set_thread_area(current, &curr->tls); if (ret) goto out; @@ -275,7 +290,7 @@ SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc) return -EFAULT; } - ret = do_set_thread_area(&info); + ret = do_set_thread_area(current, &info); if (ret) return ret; return set_tls_entry(current, &info, idx, 1); -- cgit v1.2.3 From beddc9fb1cb161e1bf779b180750b648ff9690c7 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:51 +0200 Subject: um: Add SECCOMP support detection and initialization This detects seccomp support, sets the global using_seccomp variable and initilizes the exec registers. The support is only enabled if the seccomp= kernel parameter is set to either "on" or "auto". With "auto" a fallback to ptrace mode will happen if initialization failed. Signed-off-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-7-benjamin@sipsolutions.net [extend help with Kconfig text from v2, use exit syscall instead of libc, remove unneeded mctx_offset assignment, disable on 32-bit for now] Signed-off-by: Johannes Berg --- arch/um/os-Linux/registers.c | 4 +- arch/um/os-Linux/start_up.c | 195 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 195 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c index d7ca148807b2..bfba2cbc9478 100644 --- a/arch/um/os-Linux/registers.c +++ b/arch/um/os-Linux/registers.c @@ -14,8 +14,8 @@ /* This is set once at boot time and not changed thereafter */ -static unsigned long exec_regs[MAX_REG_NR]; -static unsigned long *exec_fp_regs; +unsigned long exec_regs[MAX_REG_NR]; +unsigned long *exec_fp_regs; int init_pid_registers(int pid) { diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 93fc82c01aba..d95dc9034b26 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* + * Copyright (C) 2021 Benjamin Berg * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) */ @@ -24,6 +25,13 @@ #include #include #include +#include +#include +#include +#include +#include +#include +#include #include #include #include "internal.h" @@ -224,6 +232,140 @@ static void __init check_ptrace(void) check_sysemu(); } +extern unsigned long host_fp_size; +extern unsigned long exec_regs[MAX_REG_NR]; +extern unsigned long *exec_fp_regs; + +__initdata static struct stub_data *seccomp_test_stub_data; + +static void __init sigsys_handler(int sig, siginfo_t *info, void *p) +{ + ucontext_t *uc = p; + + /* Stow away the location of the mcontext in the stack */ + seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext - + (unsigned long)&seccomp_test_stub_data->sigstack[0]; + + /* Prevent libc from clearing memory (mctx_offset in particular) */ + syscall(__NR_exit, 0); +} + +static int __init seccomp_helper(void *data) +{ + static struct sock_filter filter[] = { + BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), + BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP), + }; + static struct sock_fprog prog = { + .len = ARRAY_SIZE(filter), + .filter = filter, + }; + struct sigaction sa; + + set_sigstack(seccomp_test_stub_data->sigstack, + sizeof(seccomp_test_stub_data->sigstack)); + + sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO; + sa.sa_sigaction = (void *) sigsys_handler; + sa.sa_restorer = NULL; + if (sigaction(SIGSYS, &sa, NULL) < 0) + exit(1); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, + SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) + exit(2); + + sleep(0); + + /* Never reached. */ + _exit(3); +} + +static bool __init init_seccomp(void) +{ + int pid; + int status; + int n; + unsigned long sp; + + /* doesn't work on 32-bit right now */ + if (!IS_ENABLED(CONFIG_64BIT)) + return false; + + /* + * We check that we can install a seccomp filter and then exit(0) + * from a trapped syscall. + * + * Note that we cannot verify that no seccomp filter already exists + * for a syscall that results in the process/thread to be killed. + */ + + os_info("Checking that seccomp filters can be installed..."); + + seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANON, 0, 0); + + /* Use the syscall data area as stack, we just need something */ + sp = (unsigned long)&seccomp_test_stub_data->syscall_data + + sizeof(seccomp_test_stub_data->syscall_data) - + sizeof(void *); + pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL); + + if (pid < 0) + fatal_perror("check_seccomp : clone failed"); + + CATCH_EINTR(n = waitpid(pid, &status, __WCLONE)); + if (n < 0) + fatal_perror("check_seccomp : waitpid failed"); + + if (WIFEXITED(status) && WEXITSTATUS(status) == 0) { + struct uml_pt_regs *regs; + unsigned long fp_size; + int r; + + /* Fill in the host_fp_size from the mcontext. */ + regs = calloc(1, sizeof(struct uml_pt_regs)); + get_stub_state(regs, seccomp_test_stub_data, &fp_size); + host_fp_size = fp_size; + free(regs); + + /* Repeat with the correct size */ + regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size); + r = get_stub_state(regs, seccomp_test_stub_data, NULL); + + /* Store as the default startup registers */ + exec_fp_regs = malloc(host_fp_size); + memcpy(exec_regs, regs->gp, sizeof(exec_regs)); + memcpy(exec_fp_regs, regs->fp, host_fp_size); + + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); + + free(regs); + + if (r) { + os_info("failed to fetch registers: %d\n", r); + return false; + } + + os_info("OK\n"); + return true; + } + + if (WIFEXITED(status) && WEXITSTATUS(status) == 2) + os_info("missing\n"); + else + os_info("error\n"); + + munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data)); + return false; +} + + static void __init check_coredump_limit(void) { struct rlimit lim; @@ -278,6 +420,44 @@ void __init get_host_cpu_features( } } +static int seccomp_config __initdata; + +static int __init uml_seccomp_config(char *line, int *add) +{ + *add = 0; + + if (strcmp(line, "off") == 0) + seccomp_config = 0; + else if (strcmp(line, "auto") == 0) + seccomp_config = 1; + else if (strcmp(line, "on") == 0) + seccomp_config = 2; + else + fatal("Invalid seccomp option '%s', expected on/auto/off\n", + line); + + return 0; +} + +__uml_setup("seccomp=", uml_seccomp_config, +"seccomp=\n" +" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n" +" processes work collaboratively with the kernel instead of being\n" +" traced using ptrace. All syscalls from the application are caught and\n" +" redirected using a signal. This signal handler in turn is permitted to\n" +" do the selected set of syscalls to communicate with the UML kernel and\n" +" do the required memory management.\n" +"\n" +" This method is overall faster than the ptrace based userspace, primarily\n" +" because it reduces the number of context switches for (minor) page faults.\n" +"\n" +" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n" +" userspace from reading and writing all physical memory. Userspace\n" +" processes could also trick the stub into disabling SIGALRM which\n" +" prevents it from being interrupted for scheduling purposes.\n" +"\n" +" This is insecure and should only be used with a trusted userspace\n\n" +); void __init os_early_checks(void) { @@ -286,13 +466,24 @@ void __init os_early_checks(void) /* Print out the core dump limits early */ check_coredump_limit(); - check_ptrace(); - /* Need to check this early because mmapping happens before the * kernel is running. */ check_tmpexec(); + if (seccomp_config) { + if (init_seccomp()) { + using_seccomp = 1; + return; + } + + if (seccomp_config == 2) + fatal("SECCOMP userspace requested but not functional!\n"); + } + + using_seccomp = 0; + check_ptrace(); + pid = start_ptraced_child(); if (init_pid_registers(pid)) fatal("Failed to initialize default registers"); -- cgit v1.2.3 From e92e2552858142b60238b9828d802f128e4acccd Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 2 Jun 2025 15:00:52 +0200 Subject: um: pass FD for memory operations when needed Instead of always sharing the FDs with the userspace process, only hand over the FDs needed for mmap when required. The idea is that userspace might be able to force the stub into executing an mmap syscall, however, it will not be able to manipulate the control flow sufficiently to have access to an FD that would allow mapping arbitrary memory. Security wise, we need to be sure that only the expected syscalls are executed after the kernel sends FDs through the socket. This is currently not the case, as userspace can trivially jump to the rt_sigreturn syscall instruction to execute any syscall that the stub is permitted to do. With this, it can trick the kernel to send the FD, which in turn allows userspace to freely map any physical memory. As such, this is currently *not* secure. However, in principle the approach should be fine with a more strict SECCOMP filter and a careful review of the stub control flow (as userspace can prepare a stack). With some care, it is likely possible to extend the security model to SMP if desired. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250602130052.545733-8-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/skas/mm_id.h | 7 ++ arch/um/include/shared/skas/stub-data.h | 1 + arch/um/kernel/skas/mmu.c | 3 + arch/um/kernel/skas/stub.c | 87 ++++++++++++++++++++-- arch/um/kernel/skas/stub_exe.c | 40 +++++++--- arch/um/os-Linux/skas/mem.c | 66 ++++++++++++++++- arch/um/os-Linux/skas/process.c | 126 +++++++++++++++++++++++--------- arch/um/os-Linux/start_up.c | 10 ++- 8 files changed, 280 insertions(+), 60 deletions(-) (limited to 'arch') diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h index 0654c57bb28e..89df9a55fbea 100644 --- a/arch/um/include/shared/skas/mm_id.h +++ b/arch/um/include/shared/skas/mm_id.h @@ -6,10 +6,17 @@ #ifndef __MM_ID_H #define __MM_ID_H +#define STUB_MAX_FDS 4 + struct mm_id { int pid; unsigned long stack; int syscall_data_len; + + /* Only used with SECCOMP mode */ + int sock; + int syscall_fd_num; + int syscall_fd_map[STUB_MAX_FDS]; }; void __switch_mm(struct mm_id *mm_idp); diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h index 675f1a0a1390..c261a77a32f6 100644 --- a/arch/um/include/shared/skas/stub-data.h +++ b/arch/um/include/shared/skas/stub-data.h @@ -12,6 +12,7 @@ #include #include #include +#include #define FUTEX_IN_CHILD 0 #define FUTEX_IN_KERN 1 diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 87a18ae4da19..849fafa4b54f 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ -78,6 +78,9 @@ void destroy_context(struct mm_struct *mm) mmu->id.pid = -1; } + if (using_seccomp && mmu->id.sock) + os_close_file(mmu->id.sock); + free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES)); guard(spinlock_irqsave)(&mm_list_lock); diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c index 9041f6b6e28b..67cab46a602c 100644 --- a/arch/um/kernel/skas/stub.c +++ b/arch/um/kernel/skas/stub.c @@ -6,23 +6,53 @@ #include #include +#include #include -static __always_inline int syscall_handler(struct stub_data *d) +/* + * Known security issues + * + * Userspace can jump to this address to execute *any* syscall that is + * permitted by the stub. As we will return afterwards, it can do + * whatever it likes, including: + * - Tricking the kernel into handing out the memory FD + * - Using this memory FD to read/write all physical memory + * - Running in parallel to the kernel processing a syscall + * (possibly creating data races?) + * - Blocking e.g. SIGALRM to avoid time based scheduling + * + * To avoid this, the permitted location for each syscall needs to be + * checked for in the SECCOMP filter (which is reasonably simple). Also, + * more care will need to go into considerations how the code might be + * tricked by using a prepared stack (or even modifying the stack from + * another thread in case SMP support is added). + * + * As for the SIGALRM, the best counter measure will be to check in the + * kernel that the process is reporting back the SIGALRM in a timely + * fashion. + */ +static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS]) { + struct stub_data *d = get_stub_data(); int i; unsigned long res; + int fd; for (i = 0; i < d->syscall_data_len; i++) { struct stub_syscall *sc = &d->syscall_data[i]; switch (sc->syscall) { case STUB_SYSCALL_MMAP: + if (fd_map) + fd = fd_map[sc->mem.fd]; + else + fd = sc->mem.fd; + res = stub_syscall6(STUB_MMAP_NR, sc->mem.addr, sc->mem.length, sc->mem.prot, MAP_SHARED | MAP_FIXED, - sc->mem.fd, sc->mem.offset); + fd, sc->mem.offset); if (res != sc->mem.addr) { d->err = res; d->syscall_data_len = i; @@ -54,9 +84,7 @@ static __always_inline int syscall_handler(struct stub_data *d) void __section(".__syscall_stub") stub_syscall_handler(void) { - struct stub_data *d = get_stub_data(); - - syscall_handler(d); + syscall_handler(NULL); trap_myself(); } @@ -65,7 +93,25 @@ void __section(".__syscall_stub") stub_signal_interrupt(int sig, siginfo_t *info, void *p) { struct stub_data *d = get_stub_data(); + char rcv_data; + union { + char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)]; + struct cmsghdr align; + } ctrl = {}; + struct iovec iov = { + .iov_base = &rcv_data, + .iov_len = 1, + }; + struct msghdr msghdr = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = &ctrl, + .msg_controllen = sizeof(ctrl), + }; ucontext_t *uc = p; + struct cmsghdr *fd_msg; + int *fd_map; + int num_fds; long res; d->signal = sig; @@ -78,6 +124,7 @@ restart_wait: res = stub_syscall3(__NR_futex, (unsigned long)&d->futex, FUTEX_WAKE, 1); } while (res == -EINTR); + do { res = stub_syscall4(__NR_futex, (unsigned long)&d->futex, FUTEX_WAIT, FUTEX_IN_KERN, 0); @@ -86,11 +133,37 @@ restart_wait: if (res < 0 && res != -EAGAIN) stub_syscall1(__NR_exit_group, 1); - /* Try running queued syscalls. */ - if (syscall_handler(d) < 0 || d->restart_wait) { + if (d->syscall_data_len) { + /* Read passed FDs (if any) */ + do { + res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0); + } while (res == -EINTR); + + /* We should never have a receive error (other than -EAGAIN) */ + if (res < 0 && res != -EAGAIN) + stub_syscall1(__NR_exit_group, 1); + + /* Receive the FDs */ + num_fds = 0; + fd_msg = msghdr.msg_control; + fd_map = (void *)&CMSG_DATA(fd_msg); + if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr)) + num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int); + + /* Try running queued syscalls. */ + res = syscall_handler(fd_map); + + while (num_fds) + stub_syscall2(__NR_close, fd_map[--num_fds], 0); + } else { + res = 0; + } + + if (res < 0 || d->restart_wait) { /* Report SIGSYS if we restart. */ d->signal = SIGSYS; d->restart_wait = 0; + goto restart_wait; } diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c index f40f2332b676..cbafaa684e66 100644 --- a/arch/um/kernel/skas/stub_exe.c +++ b/arch/um/kernel/skas/stub_exe.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -45,7 +46,11 @@ noinline static void real_init(void) if (res != sizeof(init_data)) stub_syscall1(__NR_exit, 10); - stub_syscall1(__NR_close, 0); + /* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */ + if (!init_data.seccomp) + stub_syscall1(__NR_close, 0); + else + stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK); /* map stub code + data */ res = stub_syscall6(STUB_MMAP_NR, @@ -63,6 +68,13 @@ noinline static void real_init(void) if (res != init_data.stub_start + UM_KERN_PAGE_SIZE) stub_syscall1(__NR_exit, 12); + /* In SECCOMP mode, we only need the signalling FD from now on */ + if (init_data.seccomp) { + res = stub_syscall3(__NR_close_range, 1, ~0U, 0); + if (res != 0) + stub_syscall1(__NR_exit, 13); + } + /* setup signal stack inside stub data */ stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE; stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0); @@ -77,7 +89,7 @@ noinline static void real_init(void) res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 13); + stub_syscall1(__NR_exit, 14); } else { /* SECCOMP mode uses rt_sigreturn, need to mask all signals */ sa.sa_mask = ~0ULL; @@ -85,32 +97,32 @@ noinline static void real_init(void) res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 14); + stub_syscall1(__NR_exit, 15); res = stub_syscall4(__NR_rt_sigaction, SIGSYS, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 15); + stub_syscall1(__NR_exit, 16); res = stub_syscall4(__NR_rt_sigaction, SIGALRM, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 16); + stub_syscall1(__NR_exit, 17); res = stub_syscall4(__NR_rt_sigaction, SIGTRAP, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 17); + stub_syscall1(__NR_exit, 18); res = stub_syscall4(__NR_rt_sigaction, SIGILL, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 18); + stub_syscall1(__NR_exit, 19); res = stub_syscall4(__NR_rt_sigaction, SIGFPE, (unsigned long)&sa, 0, sizeof(sa.sa_mask)); if (res != 0) - stub_syscall1(__NR_exit, 19); + stub_syscall1(__NR_exit, 20); } /* @@ -153,8 +165,12 @@ noinline static void real_init(void) BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)), - /* [10-14] Check against permitted syscalls */ + /* [10-16] Check against permitted syscalls */ BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex, + 7, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg, + 6, 0), + BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close, 5, 0), BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR, 4, 0), @@ -170,10 +186,10 @@ noinline static void real_init(void) BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn, 1, 0), - /* [15] Not one of the permitted syscalls */ + /* [17] Not one of the permitted syscalls */ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS), - /* [16] Permitted call for the stub */ + /* [18] Permitted call for the stub */ BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW), }; struct sock_fprog prog = { @@ -184,7 +200,7 @@ noinline static void real_init(void) if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (unsigned long)&prog) != 0) - stub_syscall1(__NR_exit, 20); + stub_syscall1(__NR_exit, 21); /* Fall through, the exit syscall will cause SIGSYS */ } else { diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c index 31bf3a52047a..8b9921ac3ef8 100644 --- a/arch/um/os-Linux/skas/mem.c +++ b/arch/um/os-Linux/skas/mem.c @@ -43,6 +43,16 @@ void syscall_stub_dump_error(struct mm_id *mm_idp) print_hex_dump(UM_KERN_ERR, " syscall data: ", 0, 16, 4, sc, sizeof(*sc), 0); + + if (using_seccomp) { + printk(UM_KERN_ERR "%s: FD map num: %d", __func__, + mm_idp->syscall_fd_num); + print_hex_dump(UM_KERN_ERR, + " FD map: ", 0, 16, + sizeof(mm_idp->syscall_fd_map[0]), + mm_idp->syscall_fd_map, + sizeof(mm_idp->syscall_fd_map), 0); + } } static inline unsigned long *check_init_stack(struct mm_id * mm_idp, @@ -118,6 +128,9 @@ static inline long do_syscall_stub(struct mm_id *mm_idp) mm_idp->syscall_data_len = 0; } + if (using_seccomp) + mm_idp->syscall_fd_num = 0; + return mm_idp->syscall_data_len; } @@ -180,6 +193,44 @@ static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp, return NULL; } +static int get_stub_fd(struct mm_id *mm_idp, int fd) +{ + int i; + + /* Find an FD slot (or flush and use first) */ + if (!using_seccomp) + return fd; + + /* Already crashed, value does not matter */ + if (mm_idp->syscall_data_len < 0) + return 0; + + /* Find existing FD in map if we can allocate another syscall */ + if (mm_idp->syscall_data_len < + ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) { + for (i = 0; i < mm_idp->syscall_fd_num; i++) { + if (mm_idp->syscall_fd_map[i] == fd) + return i; + } + + if (mm_idp->syscall_fd_num < STUB_MAX_FDS) { + i = mm_idp->syscall_fd_num; + mm_idp->syscall_fd_map[i] = fd; + + mm_idp->syscall_fd_num++; + + return i; + } + } + + /* FD map full or no syscall space available, continue after flush */ + do_syscall_stub(mm_idp); + mm_idp->syscall_fd_map[0] = fd; + mm_idp->syscall_fd_num = 1; + + return 0; +} + int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, int phys_fd, unsigned long long offset) { @@ -187,12 +238,21 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot, /* Compress with previous syscall if that is possible */ sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt); - if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd && + if (sc && sc->mem.prot == prot && sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) { - sc->mem.length += len; - return 0; + int prev_fd = sc->mem.fd; + + if (using_seccomp) + prev_fd = mm_idp->syscall_fd_map[sc->mem.fd]; + + if (phys_fd == prev_fd) { + sc->mem.length += len; + return 0; + } } + phys_fd = get_stub_fd(mm_idp, phys_fd); + sc = syscall_stub_alloc(mm_idp); sc->syscall = STUB_SYSCALL_MMAP; sc->mem.addr = virt; diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index e1aaa144e273..e42ffac23e3c 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,7 +153,39 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys) int ret; do { + const char byte = 0; + struct iovec iov = { + .iov_base = (void *)&byte, + .iov_len = sizeof(byte), + }; + union { + char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))]; + struct cmsghdr align; + } ctrl; + struct msghdr msgh = { + .msg_iov = &iov, + .msg_iovlen = 1, + }; + if (!running) { + if (mm_idp->syscall_fd_num) { + unsigned int fds_size = + sizeof(int) * mm_idp->syscall_fd_num; + struct cmsghdr *cmsg; + + msgh.msg_control = ctrl.data; + msgh.msg_controllen = CMSG_SPACE(fds_size); + cmsg = CMSG_FIRSTHDR(&msgh); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(fds_size); + memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map, + fds_size); + + CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock, + &msgh, 0)); + } + data->signal = 0; data->futex = FUTEX_IN_CHILD; CATCH_EINTR(syscall(__NR_futex, &data->futex, @@ -246,14 +279,20 @@ extern char __syscall_stub_start[]; static int stub_exe_fd; +struct tramp_data { + struct stub_data *stub_data; + /* 0 is inherited, 1 is the kernel side */ + int sockpair[2]; +}; + #ifndef CLOSE_RANGE_CLOEXEC #define CLOSE_RANGE_CLOEXEC (1U << 2) #endif -static int userspace_tramp(void *stack) +static int userspace_tramp(void *data) { + struct tramp_data *tramp_data = data; char *const argv[] = { "uml-userspace", NULL }; - int pipe_fds[2]; unsigned long long offset; struct stub_init_data init_data = { .seccomp = using_seccomp, @@ -280,7 +319,8 @@ static int userspace_tramp(void *stack) &offset); init_data.stub_code_offset = MMAP_OFFSET(offset); - init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset); + init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data), + &offset); init_data.stub_data_offset = MMAP_OFFSET(offset); /* @@ -291,20 +331,21 @@ static int userspace_tramp(void *stack) syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC); fcntl(init_data.stub_data_fd, F_SETFD, 0); - for (iomem = iomem_regions; iomem; iomem = iomem->next) - fcntl(iomem->fd, F_SETFD, 0); - /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */ - if (pipe(pipe_fds)) - exit(2); + /* In SECCOMP mode, these FDs are passed when needed */ + if (!using_seccomp) { + for (iomem = iomem_regions; iomem; iomem = iomem->next) + fcntl(iomem->fd, F_SETFD, 0); + } - if (dup2(pipe_fds[0], 0) < 0) + /* dup2 signaling FD/socket to STDIN */ + if (dup2(tramp_data->sockpair[0], 0) < 0) exit(3); - close(pipe_fds[0]); + close(tramp_data->sockpair[0]); /* Write init_data and close write side */ - ret = write(pipe_fds[1], &init_data, sizeof(init_data)); - close(pipe_fds[1]); + ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data)); + close(tramp_data->sockpair[1]); if (ret != sizeof(init_data)) exit(4); @@ -397,7 +438,7 @@ int userspace_pid[NR_CPUS]; /** * start_userspace() - prepare a new userspace process - * @stub_stack: pointer to the stub stack. + * @mm_id: The corresponding struct mm_id * * Setups a new temporary stack page that is used while userspace_tramp() runs * Clones the kernel process into a new userspace process, with FDs only. @@ -409,9 +450,12 @@ int userspace_pid[NR_CPUS]; int start_userspace(struct mm_id *mm_id) { struct stub_data *proc_data = (void *)mm_id->stack; + struct tramp_data tramp_data = { + .stub_data = proc_data, + }; void *stack; unsigned long sp; - int pid, status, n, err; + int status, n, err; /* setup a temporary stack page */ stack = mmap(NULL, UM_KERN_PAGE_SIZE, @@ -427,25 +471,32 @@ int start_userspace(struct mm_id *mm_id) /* set stack pointer to the end of the stack page, so it can grow downwards */ sp = (unsigned long)stack + UM_KERN_PAGE_SIZE; + /* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */ + if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) { + err = -errno; + printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n", + __func__, errno); + return err; + } + if (using_seccomp) proc_data->futex = FUTEX_IN_CHILD; - /* clone into new userspace process */ - pid = clone(userspace_tramp, (void *) sp, + mm_id->pid = clone(userspace_tramp, (void *) sp, CLONE_VFORK | CLONE_VM | SIGCHLD, - (void *)mm_id->stack); - if (pid < 0) { + (void *)&tramp_data); + if (mm_id->pid < 0) { err = -errno; printk(UM_KERN_ERR "%s : clone failed, errno = %d\n", __func__, errno); - return err; + goto out_close; } if (using_seccomp) { wait_stub_done_seccomp(mm_id, 1, 1); } else { do { - CATCH_EINTR(n = waitpid(pid, &status, + CATCH_EINTR(n = waitpid(mm_id->pid, &status, WUNTRACED | __WALL)); if (n < 0) { err = -errno; @@ -462,7 +513,7 @@ int start_userspace(struct mm_id *mm_id) goto out_kill; } - if (ptrace(PTRACE_SETOPTIONS, pid, NULL, + if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL, (void *) PTRACE_O_TRACESYSGOOD) < 0) { err = -errno; printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n", @@ -478,12 +529,22 @@ int start_userspace(struct mm_id *mm_id) goto out_kill; } - mm_id->pid = pid; + close(tramp_data.sockpair[0]); + if (using_seccomp) + mm_id->sock = tramp_data.sockpair[1]; + else + close(tramp_data.sockpair[1]); + + return 0; - return pid; +out_kill: + os_kill_ptraced_process(mm_id->pid, 1); +out_close: + close(tramp_data.sockpair[0]); + close(tramp_data.sockpair[1]); + + mm_id->pid = -1; - out_kill: - os_kill_ptraced_process(pid, 1); return err; } @@ -546,17 +607,8 @@ void userspace(struct uml_pt_regs *regs) /* Mark pending syscalls for flushing */ proc_data->syscall_data_len = mm_id->syscall_data_len; - mm_id->syscall_data_len = 0; - proc_data->signal = 0; - proc_data->futex = FUTEX_IN_CHILD; - CATCH_EINTR(syscall(__NR_futex, &proc_data->futex, - FUTEX_WAKE, 1, NULL, NULL, 0)); - do { - ret = syscall(__NR_futex, &proc_data->futex, - FUTEX_WAIT, FUTEX_IN_CHILD, NULL, NULL, 0); - } while ((ret == -1 && errno == EINTR) || - proc_data->futex == FUTEX_IN_CHILD); + wait_stub_done_seccomp(mm_id, 0, 0); sig = proc_data->signal; @@ -564,9 +616,13 @@ void userspace(struct uml_pt_regs *regs) printk(UM_KERN_ERR "%s - Error flushing stub syscalls", __func__); syscall_stub_dump_error(mm_id); + mm_id->syscall_data_len = proc_data->err; fatal_sigsegv(); } + mm_id->syscall_data_len = 0; + mm_id->syscall_fd_num = 0; + ret = get_stub_state(regs, proc_data, NULL); if (ret) { printk(UM_KERN_ERR "%s - failed to get regs: %d", diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index d95dc9034b26..49015be1aaaf 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -265,6 +265,10 @@ static int __init seccomp_helper(void *data) }; struct sigaction sa; + /* close_range is needed for the stub */ + if (stub_syscall3(__NR_close_range, 1, ~0U, 0)) + exit(1); + set_sigstack(seccomp_test_stub_data->sigstack, sizeof(seccomp_test_stub_data->sigstack)); @@ -272,17 +276,17 @@ static int __init seccomp_helper(void *data) sa.sa_sigaction = (void *) sigsys_handler; sa.sa_restorer = NULL; if (sigaction(SIGSYS, &sa, NULL) < 0) - exit(1); + exit(2); prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0) - exit(2); + exit(3); sleep(0); /* Never reached. */ - _exit(3); + _exit(4); } static bool __init init_seccomp(void) -- cgit v1.2.3 From 2f956db8b3b02256b21da4d1f26fedc63782adff Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Mon, 2 Jun 2025 10:33:15 -0700 Subject: Revert "RISC-V: vDSO: Wire up getrandom() vDSO implementation" This has been on -next for a bit, but it's broken and there's already a v2. So I'm reverting it to avoid more rebasing. This reverts commit 89079520cef65d6da1e864eab4464effe5396e23. Link: https://lore.kernel.org/r/20250602173315.20228-1-palmer@dabbelt.com Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 - arch/riscv/include/asm/vdso/getrandom.h | 30 ---- arch/riscv/kernel/vdso/Makefile | 12 -- arch/riscv/kernel/vdso/getrandom.c | 10 -- arch/riscv/kernel/vdso/vdso.lds.S | 1 - arch/riscv/kernel/vdso/vgetrandom-chacha.S | 244 ----------------------------- 6 files changed, 298 deletions(-) delete mode 100644 arch/riscv/include/asm/vdso/getrandom.h delete mode 100644 arch/riscv/kernel/vdso/getrandom.c delete mode 100644 arch/riscv/kernel/vdso/vgetrandom-chacha.S (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index fbca724302ab..bbec87b79309 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -218,7 +218,6 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU - select VDSO_GETRANDOM if HAVE_GENERIC_VDSO select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h deleted file mode 100644 index 8dc92441702a..000000000000 --- a/arch/riscv/include/asm/vdso/getrandom.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. - */ -#ifndef __ASM_VDSO_GETRANDOM_H -#define __ASM_VDSO_GETRANDOM_H - -#ifndef __ASSEMBLY__ - -#include - -static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags) -{ - register long ret asm("a0"); - register long nr asm("a7") = __NR_getrandom; - register void *buffer asm("a0") = _buffer; - register size_t len asm("a1") = _len; - register unsigned int flags asm("a2") = _flags; - - asm volatile ("ecall\n" - : "+r" (ret) - : "r" (nr), "r" (buffer), "r" (len), "r" (flags) - : "memory"); - - return ret; -} - -#endif /* !__ASSEMBLY__ */ - -#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 8d12f5646eb5..4a5d131506fc 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -13,17 +13,9 @@ vdso-syms += flush_icache vdso-syms += hwprobe vdso-syms += sys_hwprobe -ifdef CONFIG_VDSO_GETRANDOM -vdso-syms += getrandom -endif - # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o -ifdef CONFIG_VDSO_GETRANDOM -obj-vdso += vgetrandom-chacha.o -endif - ccflags-y := -fno-stack-protector ccflags-y += -DDISABLE_BRANCH_PROFILING ccflags-y += -fno-builtin @@ -32,10 +24,6 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif -ifneq ($(c-getrandom-y),) - CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y) -endif - CFLAGS_hwprobe.o += -fPIC # Build rules diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c deleted file mode 100644 index f21922e8cebd..000000000000 --- a/arch/riscv/kernel/vdso/getrandom.c +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. - */ -#include - -ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) -{ - return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); -} diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index abc69cda0445..8e86965a8aae 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -80,7 +80,6 @@ VERSION #ifndef COMPAT_VDSO __vdso_riscv_hwprobe; #endif - __vdso_getrandom; local: *; }; } diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S deleted file mode 100644 index d793cadc78a6..000000000000 --- a/arch/riscv/kernel/vdso/vgetrandom-chacha.S +++ /dev/null @@ -1,244 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. - * - * Based on arch/loongarch/vdso/vgetrandom-chacha.S. - */ - -#include -#include - -.text - -.macro ROTRI rd rs imm - slliw t0, \rs, 32 - \imm - srliw \rd, \rs, \imm - or \rd, \rd, t0 -.endm - -.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 - \op \d0, \d0, \s0 - \op \d1, \d1, \s1 - \op \d2, \d2, \s2 - \op \d3, \d3, \s3 -.endm - -/* - * a0: output bytes - * a1: 32-byte key input - * a2: 8-byte counter input/output - * a3: number of 64-byte blocks to write to output - */ -SYM_FUNC_START(__arch_chacha20_blocks_nostack) - -#define output a0 -#define key a1 -#define counter a2 -#define nblocks a3 -#define i a4 -#define state0 s0 -#define state1 s1 -#define state2 s2 -#define state3 s3 -#define state4 s4 -#define state5 s5 -#define state6 s6 -#define state7 s7 -#define state8 s8 -#define state9 s9 -#define state10 s10 -#define state11 s11 -#define state12 a5 -#define state13 a6 -#define state14 a7 -#define state15 t1 -#define cnt t2 -#define copy0 t3 -#define copy1 t4 -#define copy2 t5 -#define copy3 t6 - -/* Packs to be used with OP_4REG */ -#define line0 state0, state1, state2, state3 -#define line1 state4, state5, state6, state7 -#define line2 state8, state9, state10, state11 -#define line3 state12, state13, state14, state15 - -#define line1_perm state5, state6, state7, state4 -#define line2_perm state10, state11, state8, state9 -#define line3_perm state15, state12, state13, state14 - -#define copy copy0, copy1, copy2, copy3 - -#define _16 16, 16, 16, 16 -#define _20 20, 20, 20, 20 -#define _24 24, 24, 24, 24 -#define _25 25, 25, 25, 25 - - addi sp, sp, -12*SZREG - REG_S s0, (sp) - REG_S s1, SZREG(sp) - REG_S s2, 2*SZREG(sp) - REG_S s3, 3*SZREG(sp) - REG_S s4, 4*SZREG(sp) - REG_S s5, 5*SZREG(sp) - REG_S s6, 6*SZREG(sp) - REG_S s7, 7*SZREG(sp) - REG_S s8, 8*SZREG(sp) - REG_S s9, 9*SZREG(sp) - REG_S s10, 10*SZREG(sp) - REG_S s11, 11*SZREG(sp) - - ld cnt, (counter) - - li copy0, 0x61707865 - li copy1, 0x3320646e - li copy2, 0x79622d32 - li copy3, 0x6b206574 - -.Lblock: - /* state[0,1,2,3] = "expand 32-byte k" */ - mv state0, copy0 - mv state1, copy1 - mv state2, copy2 - mv state3, copy3 - - /* state[4,5,..,11] = key */ - lw state4, (key) - lw state5, 4(key) - lw state6, 8(key) - lw state7, 12(key) - lw state8, 16(key) - lw state9, 20(key) - lw state10, 24(key) - lw state11, 28(key) - - /* state[12,13] = counter */ - mv state12, cnt - srli state13, cnt, 32 - - /* state[14,15] = 0 */ - mv state14, zero - mv state15, zero - - li i, 10 -.Lpermute: - /* odd round */ - OP_4REG addw line0, line1 - OP_4REG xor line3, line0 - OP_4REG ROTRI line3, _16 - - OP_4REG addw line2, line3 - OP_4REG xor line1, line2 - OP_4REG ROTRI line1, _20 - - OP_4REG addw line0, line1 - OP_4REG xor line3, line0 - OP_4REG ROTRI line3, _24 - - OP_4REG addw line2, line3 - OP_4REG xor line1, line2 - OP_4REG ROTRI line1, _25 - - /* even round */ - OP_4REG addw line0, line1_perm - OP_4REG xor line3_perm, line0 - OP_4REG ROTRI line3_perm, _16 - - OP_4REG addw line2_perm, line3_perm - OP_4REG xor line1_perm, line2_perm - OP_4REG ROTRI line1_perm, _20 - - OP_4REG addw line0, line1_perm - OP_4REG xor line3_perm, line0 - OP_4REG ROTRI line3_perm, _24 - - OP_4REG addw line2_perm, line3_perm - OP_4REG xor line1_perm, line2_perm - OP_4REG ROTRI line1_perm, _25 - - addi i, i, -1 - bnez i, .Lpermute - - /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ - OP_4REG addw line0, copy - sw state0, (output) - sw state1, 4(output) - sw state2, 8(output) - sw state3, 12(output) - - /* from now on state[0,1,2,3] are scratch registers */ - - /* state[0,1,2,3] = lo(key) */ - lw state0, (key) - lw state1, 4(key) - lw state2, 8(key) - lw state3, 12(key) - - /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ - OP_4REG addw line1, line0 - sw state4, 16(output) - sw state5, 20(output) - sw state6, 24(output) - sw state7, 28(output) - - /* state[0,1,2,3] = hi(key) */ - lw state0, 16(key) - lw state1, 20(key) - lw state2, 24(key) - lw state3, 28(key) - - /* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */ - OP_4REG addw line2, line0 - sw state8, 32(output) - sw state9, 36(output) - sw state10, 40(output) - sw state11, 44(output) - - /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ - addw state12, state12, cnt - srli state0, cnt, 32 - addw state13, state13, state0 - sw state12, 48(output) - sw state13, 52(output) - sw state14, 56(output) - sw state15, 60(output) - - /* ++counter */ - addi cnt, cnt, 1 - - /* output += 64 */ - addi output, output, 64 - /* --nblocks */ - addi nblocks, nblocks, -1 - bnez nblocks, .Lblock - - /* counter = [cnt_lo, cnt_hi] */ - sd cnt, (counter) - - /* Zero out the potentially sensitive regs, in case nothing uses these - * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and - * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we - * only need to zero state[12,...,15]. - */ - mv state12, zero - mv state13, zero - mv state14, zero - mv state15, zero - - REG_L s0, (sp) - REG_L s1, SZREG(sp) - REG_L s2, 2*SZREG(sp) - REG_L s3, 3*SZREG(sp) - REG_L s4, 4*SZREG(sp) - REG_L s5, 5*SZREG(sp) - REG_L s6, 6*SZREG(sp) - REG_L s7, 7*SZREG(sp) - REG_L s8, 8*SZREG(sp) - REG_L s9, 9*SZREG(sp) - REG_L s10, 10*SZREG(sp) - REG_L s11, 11*SZREG(sp) - addi sp, sp, 12*SZREG - - ret -SYM_FUNC_END(__arch_chacha20_blocks_nostack) -- cgit v1.2.3 From 8b68e978718f14fdcb080c2a7791c52a0d09bc6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Feb 2025 16:01:57 +0100 Subject: x86/iopl: Cure TIF_IO_BITMAP inconsistencies io_bitmap_exit() is invoked from exit_thread() when a task exists or when a fork fails. In the latter case the exit_thread() cleans up resources which were allocated during fork(). io_bitmap_exit() invokes task_update_io_bitmap(), which in turn ends up in tss_update_io_bitmap(). tss_update_io_bitmap() operates on the current task. If current has TIF_IO_BITMAP set, but no bitmap installed, tss_update_io_bitmap() crashes with a NULL pointer dereference. There are two issues, which lead to that problem: 1) io_bitmap_exit() should not invoke task_update_io_bitmap() when the task, which is cleaned up, is not the current task. That's a clear indicator for a cleanup after a failed fork(). 2) A task should not have TIF_IO_BITMAP set and neither a bitmap installed nor IOPL emulation level 3 activated. This happens when a kernel thread is created in the context of a user space thread, which has TIF_IO_BITMAP set as the thread flags are copied and the IO bitmap pointer is cleared. Other than in the failed fork() case this has no impact because kernel threads including IO workers never return to user space and therefore never invoke tss_update_io_bitmap(). Cure this by adding the missing cleanups and checks: 1) Prevent io_bitmap_exit() to invoke task_update_io_bitmap() if the to be cleaned up task is not the current task. 2) Clear TIF_IO_BITMAP in copy_thread() unconditionally. For user space forks it is set later, when the IO bitmap is inherited in io_bitmap_share(). For paranoia sake, add a warning into tss_update_io_bitmap() to catch the case, when that code is invoked with inconsistent state. Fixes: ea5f1cd7ab49 ("x86/ioperm: Remove bitmap if all permissions dropped") Reported-by: syzbot+e2b1803445d236442e54@syzkaller.appspotmail.com Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/87wmdceom2.ffs@tglx --- arch/x86/kernel/ioport.c | 13 +++++++++---- arch/x86/kernel/process.c | 6 ++++++ 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 6290dd120f5e..ff40f09ad911 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(struct task_struct *tsk) +static void task_update_io_bitmap(void) { + struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { @@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk) struct io_bitmap *iobm = tsk->thread.io_bitmap; tsk->thread.io_bitmap = NULL; - task_update_io_bitmap(tsk); + /* + * Don't touch the TSS when invoked on a failed fork(). TSS + * reflects the state of @current and not the state of @tsk. + */ + if (tsk == current) + task_update_io_bitmap(); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(current); - + task_update_io_bitmap(); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c1d2dac72b9c..704883c21f3a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -176,6 +176,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; + clear_tsk_thread_flag(p, TIF_IO_BITMAP); p->thread.iopl_warn = 0; memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -464,6 +465,11 @@ void native_tss_update_io_bitmap(void) } else { struct io_bitmap *iobm = t->io_bitmap; + if (WARN_ON_ONCE(!iobm)) { + clear_thread_flag(TIF_IO_BITMAP); + native_tss_invalidate_io_bitmap(); + } + /* * Only copy bitmap data when the sequence number differs. The * update time is accounted to the incoming task. -- cgit v1.2.3 From 942349413a49670e8bed246e2185fd3a053227be Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 4 Jun 2025 10:17:05 +0200 Subject: um: fix SECCOMP 32bit xstate register restore There was a typo that caused the extended FP state to be copied into the wrong location on 32 bit. On 32 bit we only store the xstate internally as that already contains everything. However, for compatibility, the mcontext on 32 bit first contains the legacy FP state and then the xstate. The code copied the xstate on top of the legacy FP state instead of using the correct offset. This offset was already calculated in the xstate_* variables, so simply switch to those to fix the problem. With this SECCOMP mode works on 32 bit, so lift the restriction. Fixes: b1e1bd2e6943 ("um: Add helper functions to get/set state for SECCOMP") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250604081705.934112-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/os-Linux/start_up.c | 4 ---- arch/x86/um/os-Linux/mcontext.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c index 49015be1aaaf..a827c2e01aa5 100644 --- a/arch/um/os-Linux/start_up.c +++ b/arch/um/os-Linux/start_up.c @@ -296,10 +296,6 @@ static bool __init init_seccomp(void) int n; unsigned long sp; - /* doesn't work on 32-bit right now */ - if (!IS_ENABLED(CONFIG_64BIT)) - return false; - /* * We check that we can install a seccomp filter and then exit(0) * from a trapped syscall. diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index e661fdc44db9..a21403df6663 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -231,7 +231,7 @@ int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data, xstate_size = fp_size; #endif - memcpy(fpstate_stub, ®s->fp, fp_size); + memcpy(xstate_stub, ®s->fp, xstate_size); #ifdef __i386__ /* -- cgit v1.2.3 From cf8651f7319d12a6fd81e1d001afb0958ee2bf28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:18 +0200 Subject: riscv: sbi: add Firmware Feature (FWFT) SBI extensions definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Firmware Features extension (FWFT) was added as part of the SBI 3.0 specification. Add SBI definitions to use this extension. Signed-off-by: Clément Léger Reviewed-by: Samuel Holland Tested-by: Samuel Holland Reviewed-by: Deepak Gupta Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20250523101932.1594077-2-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 3d250824178b..bb077d0c912f 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -35,6 +35,7 @@ enum sbi_ext_id { SBI_EXT_DBCN = 0x4442434E, SBI_EXT_STA = 0x535441, SBI_EXT_NACL = 0x4E41434C, + SBI_EXT_FWFT = 0x46574654, /* Experimentals extensions must lie within this range */ SBI_EXT_EXPERIMENTAL_START = 0x08000000, @@ -402,6 +403,33 @@ enum sbi_ext_nacl_feature { #define SBI_NACL_SHMEM_SRET_X(__i) ((__riscv_xlen / 8) * (__i)) #define SBI_NACL_SHMEM_SRET_X_LAST 31 +/* SBI function IDs for FW feature extension */ +#define SBI_EXT_FWFT_SET 0x0 +#define SBI_EXT_FWFT_GET 0x1 + +enum sbi_fwft_feature_t { + SBI_FWFT_MISALIGNED_EXC_DELEG = 0x0, + SBI_FWFT_LANDING_PAD = 0x1, + SBI_FWFT_SHADOW_STACK = 0x2, + SBI_FWFT_DOUBLE_TRAP = 0x3, + SBI_FWFT_PTE_AD_HW_UPDATING = 0x4, + SBI_FWFT_POINTER_MASKING_PMLEN = 0x5, + SBI_FWFT_LOCAL_RESERVED_START = 0x6, + SBI_FWFT_LOCAL_RESERVED_END = 0x3fffffff, + SBI_FWFT_LOCAL_PLATFORM_START = 0x40000000, + SBI_FWFT_LOCAL_PLATFORM_END = 0x7fffffff, + + SBI_FWFT_GLOBAL_RESERVED_START = 0x80000000, + SBI_FWFT_GLOBAL_RESERVED_END = 0xbfffffff, + SBI_FWFT_GLOBAL_PLATFORM_START = 0xc0000000, + SBI_FWFT_GLOBAL_PLATFORM_END = 0xffffffff, +}; + +#define SBI_FWFT_PLATFORM_FEATURE_BIT BIT(30) +#define SBI_FWFT_GLOBAL_FEATURE_BIT BIT(31) + +#define SBI_FWFT_SET_FLAG_LOCK BIT(0) + /* SBI spec version fields */ #define SBI_SPEC_VERSION_DEFAULT 0x1 #define SBI_SPEC_VERSION_MAJOR_SHIFT 24 @@ -419,6 +447,11 @@ enum sbi_ext_nacl_feature { #define SBI_ERR_ALREADY_STARTED -7 #define SBI_ERR_ALREADY_STOPPED -8 #define SBI_ERR_NO_SHMEM -9 +#define SBI_ERR_INVALID_STATE -10 +#define SBI_ERR_BAD_RANGE -11 +#define SBI_ERR_TIMEOUT -12 +#define SBI_ERR_IO -13 +#define SBI_ERR_DENIED_LOCKED -14 extern unsigned long sbi_spec_version; struct sbiret { -- cgit v1.2.3 From a7cd450f0e06b4118b2ca8b4e1ef707d5c2c4506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:19 +0200 Subject: riscv: sbi: remove useless parenthesis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few parenthesis in check for SBI version/extension were useless, remove them. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20250523101932.1594077-3-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 1989b8cade1b..1d44c35305a9 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -609,7 +609,7 @@ void __init sbi_init(void) } else { __sbi_rfence = __sbi_rfence_v01; } - if ((sbi_spec_version >= sbi_mk_version(0, 3)) && + if (sbi_spec_version >= sbi_mk_version(0, 3) && sbi_probe_extension(SBI_EXT_SRST)) { pr_info("SBI SRST extension detected\n"); pm_power_off = sbi_srst_power_off; @@ -617,8 +617,8 @@ void __init sbi_init(void) sbi_srst_reboot_nb.priority = 192; register_restart_handler(&sbi_srst_reboot_nb); } - if ((sbi_spec_version >= sbi_mk_version(2, 0)) && - (sbi_probe_extension(SBI_EXT_DBCN) > 0)) { + if (sbi_spec_version >= sbi_mk_version(2, 0) && + sbi_probe_extension(SBI_EXT_DBCN) > 0) { pr_info("SBI DBCN extension detected\n"); sbi_debug_console_available = true; } -- cgit v1.2.3 From 99cf5b7c738733032af9a265a6a5a6bc34b91900 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:20 +0200 Subject: riscv: sbi: add new SBI error mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A few new errors have been added with SBI V3.0, maps them as close as possible to errno values. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20250523101932.1594077-4-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index bb077d0c912f..0938f2a8d01b 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -536,11 +536,21 @@ static inline int sbi_err_map_linux_errno(int err) case SBI_SUCCESS: return 0; case SBI_ERR_DENIED: + case SBI_ERR_DENIED_LOCKED: return -EPERM; case SBI_ERR_INVALID_PARAM: + case SBI_ERR_INVALID_STATE: return -EINVAL; + case SBI_ERR_BAD_RANGE: + return -ERANGE; case SBI_ERR_INVALID_ADDRESS: return -EFAULT; + case SBI_ERR_NO_SHMEM: + return -ENOMEM; + case SBI_ERR_TIMEOUT: + return -ETIMEDOUT; + case SBI_ERR_IO: + return -EIO; case SBI_ERR_NOT_SUPPORTED: case SBI_ERR_FAILURE: default: -- cgit v1.2.3 From 6d6d0641dcfa9d1e398d75791283bf6d129135de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:21 +0200 Subject: riscv: sbi: add FWFT extension interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This SBI extensions enables supervisor mode to control feature that are under M-mode control (For instance, Svadu menvcfg ADUE bit, Ssdbltrp DTE, etc). Add an interface to set local features for a specific cpu mask as well as for the online cpu mask. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20250523101932.1594077-5-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/sbi.h | 17 +++++++++++++ arch/riscv/kernel/sbi.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h index 0938f2a8d01b..341e74238aa0 100644 --- a/arch/riscv/include/asm/sbi.h +++ b/arch/riscv/include/asm/sbi.h @@ -503,6 +503,23 @@ int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask, unsigned long asid); long sbi_probe_extension(int ext); +int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags); +int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature, + unsigned long value, unsigned long flags); +/** + * sbi_fwft_set_online_cpus() - Set a feature on all online cpus + * @feature: The feature to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. + */ +static inline int sbi_fwft_set_online_cpus(u32 feature, unsigned long value, + unsigned long flags) +{ + return sbi_fwft_set_cpumask(cpu_online_mask, feature, value, flags); +} + /* Check if current SBI specification version is 0.1 or not */ static inline int sbi_spec_is_0_1(void) { diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 1d44c35305a9..818efafdc8e9 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -299,6 +299,63 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, return 0; } +struct fwft_set_req { + u32 feature; + unsigned long value; + unsigned long flags; + atomic_t error; +}; + +static void cpu_sbi_fwft_set(void *arg) +{ + struct fwft_set_req *req = arg; + int ret; + + ret = sbi_fwft_set(req->feature, req->value, req->flags); + if (ret) + atomic_set(&req->error, ret); +} + +/** + * sbi_fwft_set() - Set a feature on the local hart + * @feature: The feature ID to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. + */ +int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags) +{ + return -EOPNOTSUPP; +} + +/** + * sbi_fwft_set_cpumask() - Set a feature for the specified cpumask + * @mask: CPU mask of cpus that need the feature to be set + * @feature: The feature ID to be set + * @value: The feature value to be set + * @flags: FWFT feature set flags + * + * Return: 0 on success, appropriate linux error code otherwise. + */ +int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature, + unsigned long value, unsigned long flags) +{ + struct fwft_set_req req = { + .feature = feature, + .value = value, + .flags = flags, + .error = ATOMIC_INIT(0), + }; + + if (feature & SBI_FWFT_GLOBAL_FEATURE_BIT) + return -EINVAL; + + on_each_cpu_mask(mask, cpu_sbi_fwft_set, &req, 1); + + return atomic_read(&req.error); +} + /** * sbi_set_timer() - Program the timer for next timer event. * @stime_value: The value after which next timer event should fire. -- cgit v1.2.3 From c4a50db1e1739a5d4698dee7cd4c6f6430bff7b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:22 +0200 Subject: riscv: sbi: add SBI FWFT extension calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add FWFT extension calls. This will be ratified in SBI V3.0 hence, it is provided as a separate commit that can be left out if needed. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Link: https://lore.kernel.org/r/20250523101932.1594077-6-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/sbi.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c index 818efafdc8e9..53836a9235e3 100644 --- a/arch/riscv/kernel/sbi.c +++ b/arch/riscv/kernel/sbi.c @@ -299,6 +299,8 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask, return 0; } +static bool sbi_fwft_supported; + struct fwft_set_req { u32 feature; unsigned long value; @@ -326,7 +328,15 @@ static void cpu_sbi_fwft_set(void *arg) */ int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags) { - return -EOPNOTSUPP; + struct sbiret ret; + + if (!sbi_fwft_supported) + return -EOPNOTSUPP; + + ret = sbi_ecall(SBI_EXT_FWFT, SBI_EXT_FWFT_SET, + feature, value, flags, 0, 0, 0); + + return sbi_err_map_linux_errno(ret.error); } /** @@ -348,6 +358,9 @@ int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature, .error = ATOMIC_INIT(0), }; + if (!sbi_fwft_supported) + return -EOPNOTSUPP; + if (feature & SBI_FWFT_GLOBAL_FEATURE_BIT) return -EINVAL; @@ -679,6 +692,11 @@ void __init sbi_init(void) pr_info("SBI DBCN extension detected\n"); sbi_debug_console_available = true; } + if (sbi_spec_version >= sbi_mk_version(3, 0) && + sbi_probe_extension(SBI_EXT_FWFT)) { + pr_info("SBI FWFT extension detected\n"); + sbi_fwft_supported = true; + } } else { __sbi_set_timer = __sbi_set_timer_v01; __sbi_send_ipi = __sbi_send_ipi_v01; -- cgit v1.2.3 From cf5a8abc6560f989a880bec3647c614e638a0c9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:23 +0200 Subject: riscv: misaligned: request misaligned exception from SBI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that the kernel can handle misaligned accesses in S-mode, request misaligned access exception delegation from SBI. This uses the FWFT SBI extension defined in SBI version 3.0. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20250523101932.1594077-7-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 3 +- arch/riscv/kernel/traps_misaligned.c | 71 ++++++++++++++++++++++++++++-- arch/riscv/kernel/unaligned_access_speed.c | 8 +++- 3 files changed, 77 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index f56b409361fb..dbe5970d4fe6 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -67,8 +67,9 @@ void __init riscv_user_isa_enable(void); _RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate) bool __init check_unaligned_access_emulated_all_cpus(void); +void unaligned_access_init(void); +int cpu_online_unaligned_access_init(unsigned int cpu); #if defined(CONFIG_RISCV_SCALAR_MISALIGNED) -void check_unaligned_access_emulated(struct work_struct *work __always_unused); void unaligned_emulation_finish(void); bool unaligned_ctl_available(void); DECLARE_PER_CPU(long, misaligned_access_speed); diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 77c788660223..592b1a28e897 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #define INSN_MATCH_LB 0x3 @@ -646,7 +647,7 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void) static bool unaligned_ctl __read_mostly; -void check_unaligned_access_emulated(struct work_struct *work __always_unused) +static void check_unaligned_access_emulated(struct work_struct *work __always_unused) { int cpu = smp_processor_id(); long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); @@ -657,6 +658,13 @@ void check_unaligned_access_emulated(struct work_struct *work __always_unused) __asm__ __volatile__ ( " "REG_L" %[tmp], 1(%[ptr])\n" : [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory"); +} + +static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) +{ + long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); + + check_unaligned_access_emulated(NULL); /* * If unaligned_ctl is already set, this means that we detected that all @@ -665,9 +673,10 @@ void check_unaligned_access_emulated(struct work_struct *work __always_unused) */ if (unlikely(unaligned_ctl && (*mas_ptr != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED))) { pr_crit("CPU misaligned accesses non homogeneous (expected all emulated)\n"); - while (true) - cpu_relax(); + return -EINVAL; } + + return 0; } bool __init check_unaligned_access_emulated_all_cpus(void) @@ -699,4 +708,60 @@ bool __init check_unaligned_access_emulated_all_cpus(void) { return false; } +static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) +{ + return 0; +} +#endif + +#ifdef CONFIG_RISCV_SBI + +static bool misaligned_traps_delegated; + +static int cpu_online_sbi_unaligned_setup(unsigned int cpu) +{ + if (sbi_fwft_set(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0) && + misaligned_traps_delegated) { + pr_crit("Misaligned trap delegation non homogeneous (expected delegated)"); + return -EINVAL; + } + + return 0; +} + +void __init unaligned_access_init(void) +{ + int ret; + + ret = sbi_fwft_set_online_cpus(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0); + if (ret) + return; + + misaligned_traps_delegated = true; + pr_info("SBI misaligned access exception delegation ok\n"); + /* + * Note that we don't have to take any specific action here, if + * the delegation is successful, then + * check_unaligned_access_emulated() will verify that indeed the + * platform traps on misaligned accesses. + */ +} +#else +void __init unaligned_access_init(void) {} + +static int cpu_online_sbi_unaligned_setup(unsigned int cpu __always_unused) +{ + return 0; +} #endif + +int cpu_online_unaligned_access_init(unsigned int cpu) +{ + int ret; + + ret = cpu_online_sbi_unaligned_setup(cpu); + if (ret) + return ret; + + return cpu_online_check_unaligned_access_emulated(cpu); +} diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c index b8ba13819d05..ae2068425fbc 100644 --- a/arch/riscv/kernel/unaligned_access_speed.c +++ b/arch/riscv/kernel/unaligned_access_speed.c @@ -236,6 +236,11 @@ arch_initcall_sync(lock_and_set_unaligned_access_static_branch); static int riscv_online_cpu(unsigned int cpu) { + int ret = cpu_online_unaligned_access_init(cpu); + + if (ret) + return ret; + /* We are already set since the last check */ if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { goto exit; @@ -248,7 +253,6 @@ static int riscv_online_cpu(unsigned int cpu) { static struct page *buf; - check_unaligned_access_emulated(NULL); buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER); if (!buf) { pr_warn("Allocation failure, not measuring misaligned performance\n"); @@ -439,6 +443,8 @@ static int __init check_unaligned_access_all_cpus(void) { int cpu; + unaligned_access_init(); + if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) { pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n", speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param); -- cgit v1.2.3 From 9f9f6fdd1dc6791bcfe251160a96a446199f85ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:24 +0200 Subject: riscv: misaligned: use on_each_cpu() for scalar misaligned access probing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit schedule_on_each_cpu() was used without any good reason while documented as very slow. This call was in the boot path, so better use on_each_cpu() for scalar misaligned checking. Vector misaligned check still needs to use schedule_on_each_cpu() since it requires irqs to be enabled but that's less of a problem since this code is ran in a kthread. Add a comment to explicit that. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Link: https://lore.kernel.org/r/20250523101932.1594077-8-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps_misaligned.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 592b1a28e897..34b4a4e9dfca 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -627,6 +627,10 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void) { int cpu; + /* + * While being documented as very slow, schedule_on_each_cpu() is used since + * kernel_vector_begin() expects irqs to be enabled or it will panic() + */ schedule_on_each_cpu(check_vector_unaligned_access_emulated); for_each_online_cpu(cpu) @@ -647,7 +651,7 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void) static bool unaligned_ctl __read_mostly; -static void check_unaligned_access_emulated(struct work_struct *work __always_unused) +static void check_unaligned_access_emulated(void *arg __always_unused) { int cpu = smp_processor_id(); long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu); @@ -688,7 +692,7 @@ bool __init check_unaligned_access_emulated_all_cpus(void) * accesses emulated since tasks requesting such control can run on any * CPU. */ - schedule_on_each_cpu(check_unaligned_access_emulated); + on_each_cpu(check_unaligned_access_emulated, NULL, 1); for_each_online_cpu(cpu) if (per_cpu(misaligned_access_speed, cpu) -- cgit v1.2.3 From 1317045a7d6f397904d105f6d40dc9787876a34b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:25 +0200 Subject: riscv: misaligned: declare misaligned_access_speed under CONFIG_RISCV_MISALIGNED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While misaligned_access_speed was defined in a file compile with CONFIG_RISCV_MISALIGNED, its definition was under CONFIG_RISCV_SCALAR_MISALIGNED. This resulted in compilation problems when using it in a file compiled with CONFIG_RISCV_MISALIGNED. Move the declaration under CONFIG_RISCV_MISALIGNED so that it can be used unconditionnally when compiled with that config and remove the check for that variable in traps_misaligned.c. Signed-off-by: Clément Léger Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20250523101932.1594077-9-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 5 ++++- arch/riscv/kernel/traps_misaligned.c | 2 -- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index dbe5970d4fe6..2bfa4ef383ed 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -72,7 +72,6 @@ int cpu_online_unaligned_access_init(unsigned int cpu); #if defined(CONFIG_RISCV_SCALAR_MISALIGNED) void unaligned_emulation_finish(void); bool unaligned_ctl_available(void); -DECLARE_PER_CPU(long, misaligned_access_speed); #else static inline bool unaligned_ctl_available(void) { @@ -80,6 +79,10 @@ static inline bool unaligned_ctl_available(void) } #endif +#if defined(CONFIG_RISCV_MISALIGNED) +DECLARE_PER_CPU(long, misaligned_access_speed); +#endif + bool __init check_vector_unaligned_access_emulated_all_cpus(void); #if defined(CONFIG_RISCV_VECTOR_MISALIGNED) void check_vector_unaligned_access_emulated(struct work_struct *work __always_unused); diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 34b4a4e9dfca..f1b2af515592 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -369,9 +369,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs) perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr); -#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED; -#endif if (!unaligned_enabled) return -1; -- cgit v1.2.3 From 4eaaa65e301208d6ff612ad2244c6174c9d852b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:26 +0200 Subject: riscv: misaligned: move emulated access uniformity check in a function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the code that check for the uniformity of misaligned accesses performance on all cpus from check_unaligned_access_emulated_all_cpus() to its own function which will be used for delegation check. No functional changes intended. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Link: https://lore.kernel.org/r/20250523101932.1594077-10-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/traps_misaligned.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index f1b2af515592..7ecaa8103fe7 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -645,6 +645,18 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void) } #endif +static bool all_cpus_unaligned_scalar_access_emulated(void) +{ + int cpu; + + for_each_online_cpu(cpu) + if (per_cpu(misaligned_access_speed, cpu) != + RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED) + return false; + + return true; +} + #ifdef CONFIG_RISCV_SCALAR_MISALIGNED static bool unaligned_ctl __read_mostly; @@ -683,8 +695,6 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) bool __init check_unaligned_access_emulated_all_cpus(void) { - int cpu; - /* * We can only support PR_UNALIGN controls if all CPUs have misaligned * accesses emulated since tasks requesting such control can run on any @@ -692,10 +702,8 @@ bool __init check_unaligned_access_emulated_all_cpus(void) */ on_each_cpu(check_unaligned_access_emulated, NULL, 1); - for_each_online_cpu(cpu) - if (per_cpu(misaligned_access_speed, cpu) - != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED) - return false; + if (!all_cpus_unaligned_scalar_access_emulated()) + return false; unaligned_ctl = true; return true; -- cgit v1.2.3 From 7977448bf374f6e9592153838f072a89bd3b5c45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Fri, 23 May 2025 12:19:27 +0200 Subject: riscv: misaligned: add a function to check misalign trap delegability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Checking for the delegability of the misaligned access trap is needed for the KVM FWFT extension implementation. Add a function to get the delegability of the misaligned trap exception. Signed-off-by: Clément Léger Reviewed-by: Andrew Jones Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Link: https://lore.kernel.org/r/20250523101932.1594077-11-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cpufeature.h | 6 ++++++ arch/riscv/kernel/traps_misaligned.c | 17 +++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h index 2bfa4ef383ed..fbd0e4306c93 100644 --- a/arch/riscv/include/asm/cpufeature.h +++ b/arch/riscv/include/asm/cpufeature.h @@ -81,6 +81,12 @@ static inline bool unaligned_ctl_available(void) #if defined(CONFIG_RISCV_MISALIGNED) DECLARE_PER_CPU(long, misaligned_access_speed); +bool misaligned_traps_can_delegate(void); +#else +static inline bool misaligned_traps_can_delegate(void) +{ + return false; +} #endif bool __init check_vector_unaligned_access_emulated_all_cpus(void); diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 7ecaa8103fe7..93043924fe6c 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -724,10 +724,10 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu) } #endif -#ifdef CONFIG_RISCV_SBI - static bool misaligned_traps_delegated; +#ifdef CONFIG_RISCV_SBI + static int cpu_online_sbi_unaligned_setup(unsigned int cpu) { if (sbi_fwft_set(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0) && @@ -763,6 +763,7 @@ static int cpu_online_sbi_unaligned_setup(unsigned int cpu __always_unused) { return 0; } + #endif int cpu_online_unaligned_access_init(unsigned int cpu) @@ -775,3 +776,15 @@ int cpu_online_unaligned_access_init(unsigned int cpu) return cpu_online_check_unaligned_access_emulated(cpu); } + +bool misaligned_traps_can_delegate(void) +{ + /* + * Either we successfully requested misaligned traps delegation for all + * CPUs, or the SBI does not implement the FWFT extension but delegated + * the exception by default. + */ + return misaligned_traps_delegated || + all_cpus_unaligned_scalar_access_emulated(); +} +EXPORT_SYMBOL_GPL(misaligned_traps_can_delegate); -- cgit v1.2.3 From 27a041040f2c6cfbbc24053bce9acb2e801e4e71 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 5 Jun 2025 07:03:24 +0200 Subject: um: fix unused variable warning The code was updated to access the PID of the userspace stub process in a different way, making the local cpu variable obsolete. Remove it. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506050008.AwXLNxQX-lkp@intel.com/ Fixes: 406d17c6c370 ("um: Implement kernel side of SECCOMP based process handling") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250605050325.1077208-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/tls_32.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 21cbb70cf771..cb3f17627d16 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -25,7 +25,6 @@ int host_gdt_entry_tls_min; static int do_set_thread_area(struct task_struct* task, struct user_desc *info) { int ret; - u32 cpu; if (info->entry_number < host_gdt_entry_tls_min || info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES) @@ -41,9 +40,7 @@ static int do_set_thread_area(struct task_struct* task, struct user_desc *info) return 0; } - cpu = get_cpu(); ret = os_set_thread_area(info, task->mm->context.id.pid); - put_cpu(); if (ret) printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, " -- cgit v1.2.3 From e56a50ff7c12983aba710bd02a2c2ad401379e91 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Thu, 5 Jun 2025 07:03:25 +0200 Subject: um: remove "extern" from implementation of sigchld_handler There is no need to mark the function as extern in the implementation. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506051226.X8r7X5aa-lkp@intel.com/ Fixes: 8420e08fe3a5 ("um: Track userspace children dying in SECCOMP mode") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250605050325.1077208-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index f1787be3983c..0dfaf96bb7da 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -691,8 +691,8 @@ void __init init_IRQ(void) os_setup_epoll(); } -extern void sigchld_handler(int sig, struct siginfo *unused_si, - struct uml_pt_regs *regs, void *mc) +void sigchld_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc) { do_IRQ(SIGCHLD_IRQ, regs); } -- cgit v1.2.3 From 11709abccf93b08adde95ef313c300b0d4bc28f1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 3 Jun 2025 15:49:36 +0200 Subject: s390/mm: Fix in_atomic() handling in do_secure_storage_access() Kernel user spaces accesses to not exported pages in atomic context incorrectly try to resolve the page fault. With debug options enabled call traces like this can be seen: BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1523 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 419074, name: qemu-system-s39 preempt_count: 1, expected: 0 RCU nest depth: 0, expected: 0 INFO: lockdep is turned off. Preemption disabled at: [<00000383ea47cfa2>] copy_page_from_iter_atomic+0xa2/0x8a0 CPU: 12 UID: 0 PID: 419074 Comm: qemu-system-s39 Tainted: G W 6.16.0-20250531.rc0.git0.69b3a602feac.63.fc42.s390x+debug #1 PREEMPT Tainted: [W]=WARN Hardware name: IBM 3931 A01 703 (LPAR) Call Trace: [<00000383e990d282>] dump_stack_lvl+0xa2/0xe8 [<00000383e99bf152>] __might_resched+0x292/0x2d0 [<00000383eaa7c374>] down_read+0x34/0x2d0 [<00000383e99432f8>] do_secure_storage_access+0x108/0x360 [<00000383eaa724b0>] __do_pgm_check+0x130/0x220 [<00000383eaa842e4>] pgm_check_handler+0x114/0x160 [<00000383ea47d028>] copy_page_from_iter_atomic+0x128/0x8a0 ([<00000383ea47d016>] copy_page_from_iter_atomic+0x116/0x8a0) [<00000383e9c45eae>] generic_perform_write+0x16e/0x310 [<00000383e9eb87f4>] ext4_buffered_write_iter+0x84/0x160 [<00000383e9da0de4>] vfs_write+0x1c4/0x460 [<00000383e9da123c>] ksys_write+0x7c/0x100 [<00000383eaa7284e>] __do_syscall+0x15e/0x280 [<00000383eaa8417e>] system_call+0x6e/0x90 INFO: lockdep is turned off. It is not allowed to take the mmap_lock while in atomic context. Therefore handle such a secure storage access fault as if the accessed page is not mapped: the uaccess function will return -EFAULT, and the caller has to deal with this. Usually this means that the access is retried in process context, which allows to resolve the page fault (or in this case export the page). Reviewed-by: Claudio Imbrenda Acked-by: Alexander Gordeev Acked-by: Christian Borntraeger Link: https://lore.kernel.org/r/20250603134936.1314139-1-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/mm/fault.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index da84ff6770de..8b3f6dd00eab 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -442,6 +442,8 @@ void do_secure_storage_access(struct pt_regs *regs) if (rc) BUG(); } else { + if (faulthandler_disabled()) + return handle_fault_error_nolock(regs, 0); mm = current->mm; mmap_read_lock(mm); vma = find_vma(mm, addr); -- cgit v1.2.3 From 6678791ee3da0b78c28fe7d77814097f53cbb8df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:21 +0100 Subject: KVM: arm64: Add assignment-specific sysreg accessor Assigning a value to a system register doesn't do what it is supposed to be doing if that register is one that has RESx bits. The main problem is that we use __vcpu_sys_reg(), which can be used both as a lvalue and rvalue. When used as a lvalue, the bit masking occurs *before* the new value is assigned, meaning that we (1) do pointless work on the old cvalue, and (2) potentially assign an invalid value as we fail to apply the masks to it. Fix this by providing a new __vcpu_assign_sys_reg() that does what it says on the tin, and sanitises the *new* value instead of the old one. This comes with a significant amount of churn. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-2-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 ++++++++ arch/arm64/kvm/arch_timer.c | 16 +++++------ arch/arm64/kvm/hyp/exception.c | 4 +-- arch/arm64/kvm/hyp/include/hyp/switch.h | 4 +-- arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h | 6 ++-- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 +-- arch/arm64/kvm/hyp/vhe/switch.c | 4 +-- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 44 +++++++++++++++--------------- arch/arm64/kvm/pmu-emul.c | 14 +++++----- arch/arm64/kvm/sys_regs.c | 38 ++++++++++++++------------ arch/arm64/kvm/sys_regs.h | 4 +-- arch/arm64/kvm/vgic/vgic-v3-nested.c | 10 +++---- 12 files changed, 86 insertions(+), 73 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index d941abc6b5ee..3b84ad91116b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1107,6 +1107,17 @@ static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r) #define ctxt_sys_reg(c,r) (*__ctxt_sys_reg(c,r)) u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); + +#define __vcpu_assign_sys_reg(v, r, val) \ + do { \ + const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ + u64 __v = (val); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ + \ + ctxt_sys_reg(ctxt, (r)) = __v; \ + } while (0) + #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 5133dcbfe9f7..5a67a6d4d95b 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -108,16 +108,16 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl) switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTV_CTL_EL0, ctl); break; case TIMER_PTIMER: - __vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTP_CTL_EL0, ctl); break; case TIMER_HVTIMER: - __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTHV_CTL_EL2, ctl); break; case TIMER_HPTIMER: - __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl; + __vcpu_assign_sys_reg(vcpu, CNTHP_CTL_EL2, ctl); break; default: WARN_ON(1); @@ -130,16 +130,16 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval) switch(arch_timer_ctx_index(ctxt)) { case TIMER_VTIMER: - __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = cval; + __vcpu_assign_sys_reg(vcpu, CNTV_CVAL_EL0, cval); break; case TIMER_PTIMER: - __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval; + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, cval); break; case TIMER_HVTIMER: - __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval; + __vcpu_assign_sys_reg(vcpu, CNTHV_CVAL_EL2, cval); break; case TIMER_HPTIMER: - __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval; + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, cval); break; default: WARN_ON(1); diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c index 424a5107cddb..6a2a899a344e 100644 --- a/arch/arm64/kvm/hyp/exception.c +++ b/arch/arm64/kvm/hyp/exception.c @@ -37,7 +37,7 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) if (unlikely(vcpu_has_nv(vcpu))) vcpu_write_sys_reg(vcpu, val, reg); else if (!__vcpu_write_sys_reg_to_cpu(val, reg)) - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); } static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, @@ -51,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode, } else if (has_vhe()) { write_sysreg_el1(val, SYS_SPSR); } else { - __vcpu_sys_reg(vcpu, SPSR_EL1) = val; + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, val); } } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index eef310cdbdbd..aa5b561b9218 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -45,7 +45,7 @@ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu) if (!vcpu_el1_is_32bit(vcpu)) return; - __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2); + __vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2)); } static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) @@ -457,7 +457,7 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu) */ if (vcpu_has_sve(vcpu)) { zcr_el1 = read_sysreg_el1(SYS_ZCR); - __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1; + __vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1); /* * The guest's state is always saved using the guest's max VL. diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h index b9cff893bbe0..4d0dbea4c56f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h @@ -307,11 +307,11 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu) vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq); vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq); - __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2); - __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2); + __vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2)); + __vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2)); if (has_vhe() || kvm_debug_regs_in_use(vcpu)) - __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2); + __vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2)); } static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 8e8848de4d47..e9198e56e784 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -26,7 +26,7 @@ void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt); static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu) { - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); + __vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR)); /* * On saving/restoring guest sve state, always use the maximum VL for * the guest. The layout of the data when saving the sve state depends @@ -79,7 +79,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm)); if (has_fpmr) - __vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR); + __vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR)); if (system_supports_sve()) __hyp_sve_restore_host(); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index c9b330dc2066..09df2b42bc1b 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -223,9 +223,9 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ val = read_sysreg_el0(SYS_CNTP_CVAL); if (map.direct_ptimer == vcpu_ptimer(vcpu)) - __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val; + __vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val); if (map.direct_ptimer == vcpu_hptimer(vcpu)) - __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val; + __vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val); offset = read_sysreg_s(SYS_CNTPOFF_EL2); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 3814b0b2c937..34c7bf7fe9de 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -18,17 +18,17 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) { /* These registers are common with EL1 */ - __vcpu_sys_reg(vcpu, PAR_EL1) = read_sysreg(par_el1); - __vcpu_sys_reg(vcpu, TPIDR_EL1) = read_sysreg(tpidr_el1); - - __vcpu_sys_reg(vcpu, ESR_EL2) = read_sysreg_el1(SYS_ESR); - __vcpu_sys_reg(vcpu, AFSR0_EL2) = read_sysreg_el1(SYS_AFSR0); - __vcpu_sys_reg(vcpu, AFSR1_EL2) = read_sysreg_el1(SYS_AFSR1); - __vcpu_sys_reg(vcpu, FAR_EL2) = read_sysreg_el1(SYS_FAR); - __vcpu_sys_reg(vcpu, MAIR_EL2) = read_sysreg_el1(SYS_MAIR); - __vcpu_sys_reg(vcpu, VBAR_EL2) = read_sysreg_el1(SYS_VBAR); - __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR); - __vcpu_sys_reg(vcpu, AMAIR_EL2) = read_sysreg_el1(SYS_AMAIR); + __vcpu_assign_sys_reg(vcpu, PAR_EL1, read_sysreg(par_el1)); + __vcpu_assign_sys_reg(vcpu, TPIDR_EL1, read_sysreg(tpidr_el1)); + + __vcpu_assign_sys_reg(vcpu, ESR_EL2, read_sysreg_el1(SYS_ESR)); + __vcpu_assign_sys_reg(vcpu, AFSR0_EL2, read_sysreg_el1(SYS_AFSR0)); + __vcpu_assign_sys_reg(vcpu, AFSR1_EL2, read_sysreg_el1(SYS_AFSR1)); + __vcpu_assign_sys_reg(vcpu, FAR_EL2, read_sysreg_el1(SYS_FAR)); + __vcpu_assign_sys_reg(vcpu, MAIR_EL2, read_sysreg_el1(SYS_MAIR)); + __vcpu_assign_sys_reg(vcpu, VBAR_EL2, read_sysreg_el1(SYS_VBAR)); + __vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR)); + __vcpu_assign_sys_reg(vcpu, AMAIR_EL2, read_sysreg_el1(SYS_AMAIR)); /* * In VHE mode those registers are compatible between EL1 and EL2, @@ -46,21 +46,21 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) * are always trapped, ensuring that the in-memory * copy is always up-to-date. A small blessing... */ - __vcpu_sys_reg(vcpu, SCTLR_EL2) = read_sysreg_el1(SYS_SCTLR); - __vcpu_sys_reg(vcpu, TTBR0_EL2) = read_sysreg_el1(SYS_TTBR0); - __vcpu_sys_reg(vcpu, TTBR1_EL2) = read_sysreg_el1(SYS_TTBR1); - __vcpu_sys_reg(vcpu, TCR_EL2) = read_sysreg_el1(SYS_TCR); + __vcpu_assign_sys_reg(vcpu, SCTLR_EL2, read_sysreg_el1(SYS_SCTLR)); + __vcpu_assign_sys_reg(vcpu, TTBR0_EL2, read_sysreg_el1(SYS_TTBR0)); + __vcpu_assign_sys_reg(vcpu, TTBR1_EL2, read_sysreg_el1(SYS_TTBR1)); + __vcpu_assign_sys_reg(vcpu, TCR_EL2, read_sysreg_el1(SYS_TCR)); if (ctxt_has_tcrx(&vcpu->arch.ctxt)) { - __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2); + __vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2)); if (ctxt_has_s1pie(&vcpu->arch.ctxt)) { - __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0); - __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR); + __vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0)); + __vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR)); } if (ctxt_has_s1poe(&vcpu->arch.ctxt)) - __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR); + __vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR)); } /* @@ -74,9 +74,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; } - __vcpu_sys_reg(vcpu, SP_EL2) = read_sysreg(sp_el1); - __vcpu_sys_reg(vcpu, ELR_EL2) = read_sysreg_el1(SYS_ELR); - __vcpu_sys_reg(vcpu, SPSR_EL2) = read_sysreg_el1(SYS_SPSR); + __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); + __vcpu_assign_sys_reg(vcpu, ELR_EL2, read_sysreg_el1(SYS_ELR)); + __vcpu_assign_sys_reg(vcpu, SPSR_EL2, read_sysreg_el1(SYS_SPSR)); } static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 25c29107f13f..4f0ae8073f78 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -178,7 +178,7 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force) val |= lower_32_bits(val); } - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); /* Recreate the perf event to reflect the updated sample_period */ kvm_pmu_create_perf_event(pmc); @@ -204,7 +204,7 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) { kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx)); - __vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val; + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); } @@ -239,7 +239,7 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc) reg = counter_index_to_reg(pmc->idx); - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); kvm_pmu_release_perf_event(pmc); } @@ -503,7 +503,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1; if (!kvm_pmc_is_64bit(pmc)) reg = lower_32_bits(reg); - __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg; + __vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg); /* No overflow? move on */ if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg)) @@ -602,7 +602,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); /* The reset bits don't indicate any state, and shouldn't be saved. */ - __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P); + __vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P))); if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); @@ -779,7 +779,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, u64 reg; reg = counter_index_to_evtreg(pmc->idx); - __vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm); + __vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm))); kvm_pmu_create_perf_event(pmc); } @@ -1038,7 +1038,7 @@ static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); val &= ~MDCR_EL2_HPMN; val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); - __vcpu_sys_reg(vcpu, MDCR_EL2) = val; + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); } } } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 707c651aff03..93d0ca7ed936 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -228,7 +228,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) * to reverse-translate virtual EL2 system registers for a * non-VHE guest hypervisor. */ - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); switch (reg) { case CNTHCTL_EL2: @@ -263,7 +263,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg) return; memory_write: - __vcpu_sys_reg(vcpu, reg) = val; + __vcpu_assign_sys_reg(vcpu, reg, val); } /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ @@ -605,7 +605,7 @@ static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, if ((val ^ rd->val) & ~OSLSR_EL1_OSLK) return -EINVAL; - __vcpu_sys_reg(vcpu, rd->reg) = val; + __vcpu_assign_sys_reg(vcpu, rd->reg, val); return 0; } @@ -835,7 +835,7 @@ static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) * The value of PMCR.N field is included when the * vCPU register is read via kvm_vcpu_read_pmcr(). */ - __vcpu_sys_reg(vcpu, r->reg) = pmcr; + __vcpu_assign_sys_reg(vcpu, r->reg, pmcr); return __vcpu_sys_reg(vcpu, r->reg); } @@ -907,7 +907,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return false; if (p->is_write) - __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval; + __vcpu_assign_sys_reg(vcpu, PMSELR_EL0, p->regval); else /* return PMSELR.SEL field */ p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0) @@ -1076,7 +1076,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va { u64 mask = kvm_pmu_accessible_counter_mask(vcpu); - __vcpu_sys_reg(vcpu, r->reg) = val & mask; + __vcpu_assign_sys_reg(vcpu, r->reg, val & mask); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; @@ -1185,8 +1185,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!vcpu_mode_priv(vcpu)) return undef_access(vcpu, p, r); - __vcpu_sys_reg(vcpu, PMUSERENR_EL0) = - p->regval & ARMV8_PMU_USERENR_MASK; + __vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, + (p->regval & ARMV8_PMU_USERENR_MASK)); } else { p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0) & ARMV8_PMU_USERENR_MASK; @@ -1237,7 +1237,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, if (!kvm_supports_32bit_el0()) val |= ARMV8_PMU_PMCR_LC; - __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu); return 0; @@ -2207,7 +2207,7 @@ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (kvm_has_mte(vcpu->kvm)) clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); - __vcpu_sys_reg(vcpu, r->reg) = clidr; + __vcpu_assign_sys_reg(vcpu, r->reg, clidr); return __vcpu_sys_reg(vcpu, r->reg); } @@ -2221,7 +2221,7 @@ static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc)) return -EINVAL; - __vcpu_sys_reg(vcpu, rd->reg) = val; + __vcpu_assign_sys_reg(vcpu, rd->reg, val); return 0; } @@ -2398,7 +2398,7 @@ static bool access_sp_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (p->is_write) - __vcpu_sys_reg(vcpu, SP_EL1) = p->regval; + __vcpu_assign_sys_reg(vcpu, SP_EL1, p->regval); else p->regval = __vcpu_sys_reg(vcpu, SP_EL1); @@ -2422,7 +2422,7 @@ static bool access_spsr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (p->is_write) - __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval; + __vcpu_assign_sys_reg(vcpu, SPSR_EL1, p->regval); else p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1); @@ -2434,7 +2434,7 @@ static bool access_cntkctl_el12(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { if (p->is_write) - __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval; + __vcpu_assign_sys_reg(vcpu, CNTKCTL_EL1, p->regval); else p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1); @@ -2448,7 +2448,9 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1)) val |= HCR_E2H; - return __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); + + return __vcpu_sys_reg(vcpu, r->reg); } static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu, @@ -2619,7 +2621,7 @@ static bool access_mdcr(struct kvm_vcpu *vcpu, u64_replace_bits(val, hpmn, MDCR_EL2_HPMN); } - __vcpu_sys_reg(vcpu, MDCR_EL2) = val; + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); /* * Request a reload of the PMU to enable/disable the counters @@ -2748,7 +2750,7 @@ static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - __vcpu_sys_reg(vcpu, r->reg) = vcpu->kvm->arch.nr_pmu_counters; + __vcpu_assign_sys_reg(vcpu, r->reg, vcpu->kvm->arch.nr_pmu_counters); return vcpu->kvm->arch.nr_pmu_counters; } @@ -5006,7 +5008,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg, if (r->set_user) { ret = (r->set_user)(vcpu, r, val); } else { - __vcpu_sys_reg(vcpu, r->reg) = val; + __vcpu_assign_sys_reg(vcpu, r->reg, val); ret = 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index cc6338d38766..ef97d9fc67cc 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -137,7 +137,7 @@ static inline u64 reset_unknown(struct kvm_vcpu *vcpu, { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); - __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + __vcpu_assign_sys_reg(vcpu, r->reg, 0x1de7ec7edbadc0deULL); return __vcpu_sys_reg(vcpu, r->reg); } @@ -145,7 +145,7 @@ static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); - __vcpu_sys_reg(vcpu, r->reg) = r->val; + __vcpu_assign_sys_reg(vcpu, r->reg, r->val); return __vcpu_sys_reg(vcpu, r->reg); } diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index 4f6954c30674..d22a8ad7bcc5 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -356,12 +356,12 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2); val &= ~ICH_HCR_EL2_EOIcount_MASK; val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK); - __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val; - __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr; + __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val); + __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr); for (i = 0; i < 4; i++) { - __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i]; - __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i]; + __vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]); + __vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]); } for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { @@ -370,7 +370,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val &= ~ICH_LR_STATE; val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; - __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val; + __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); s_cpu_if->vgic_lr[i] = 0; } -- cgit v1.2.3 From 8800b7c4bbede3cd40831726d3f98e8080baf4df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:22 +0100 Subject: KVM: arm64: Add RMW specific sysreg accessor In a number of cases, we perform a Read-Modify-Write operation on a system register, meaning that we would apply the RESx masks twice. Instead, provide a new accessor that performs this RMW operation, allowing the masks to be applied exactly once per operation. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-3-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 11 +++++++++++ arch/arm64/kvm/debug.c | 4 ++-- arch/arm64/kvm/hyp/vhe/sysreg-sr.c | 4 ++-- arch/arm64/kvm/nested.c | 2 +- arch/arm64/kvm/pmu-emul.c | 10 +++++----- arch/arm64/kvm/sys_regs.c | 22 +++++++++++----------- 6 files changed, 32 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 3b84ad91116b..8b8c649f225b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1118,6 +1118,17 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); ctxt_sys_reg(ctxt, (r)) = __v; \ } while (0) +#define __vcpu_rmw_sys_reg(v, r, op, val) \ + do { \ + const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ + u64 __v = ctxt_sys_reg(ctxt, (r)); \ + __v op (val); \ + if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ + __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ + \ + ctxt_sys_reg(ctxt, (r)) = __v; \ + } while (0) + #define __vcpu_sys_reg(v,r) \ (*({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 0e4c805e7e89..1a7dab333f55 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -216,9 +216,9 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val) { if (val & OSLAR_EL1_OSLK) - __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK; + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, |=, OSLSR_EL1_OSLK); else - __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK; + __vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, &=, ~OSLSR_EL1_OSLK); preempt_disable(); kvm_arch_vcpu_put(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 34c7bf7fe9de..73e4bc7fde9e 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -70,8 +70,8 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu) */ val = read_sysreg_el1(SYS_CNTKCTL); val &= CNTKCTL_VALID_BITS; - __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS; - __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val; + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS); + __vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val); } __vcpu_assign_sys_reg(vcpu, SP_EL2, read_sysreg(sp_el1)); diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index 4a53e4147fb0..5b191f4dc566 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -1757,7 +1757,7 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu) out: for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++) - (void)__vcpu_sys_reg(vcpu, sr); + __vcpu_rmw_sys_reg(vcpu, sr, |=, 0); return 0; } diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index 4f0ae8073f78..b03dbda7f1ab 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -510,7 +510,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu, continue; /* Mark overflow */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(i + 1), @@ -556,7 +556,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, perf_event->attr.sample_period = period; perf_event->hw.sample_period = period; - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx)); if (kvm_pmu_counter_can_chain(pmc)) kvm_pmu_counter_increment(vcpu, BIT(idx + 1), @@ -914,9 +914,9 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask; - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask; - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask; + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask); + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask); kvm_pmu_reprogram_counter_mask(vcpu, mask); } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 93d0ca7ed936..e89d345e42d0 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -791,7 +791,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) mask |= GENMASK(n - 1, 0); reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= mask; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, mask); return __vcpu_sys_reg(vcpu, r->reg); } @@ -799,7 +799,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, GENMASK(31, 0)); return __vcpu_sys_reg(vcpu, r->reg); } @@ -811,7 +811,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return 0; reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_evtyper_mask(vcpu->kvm); + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, kvm_pmu_evtyper_mask(vcpu->kvm)); return __vcpu_sys_reg(vcpu, r->reg); } @@ -819,7 +819,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); - __vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK; + __vcpu_rmw_sys_reg(vcpu, r->reg, &=, PMSELR_EL0_SEL_MASK); return __vcpu_sys_reg(vcpu, r->reg); } @@ -1103,10 +1103,10 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, val = p->regval & mask; if (r->Op2 & 0x1) /* accessing PMCNTENSET_EL0 */ - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val; + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val); else /* accessing PMCNTENCLR_EL0 */ - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val; + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val); kvm_pmu_reprogram_counter_mask(vcpu, val); } else { @@ -1129,10 +1129,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (r->Op2 & 0x1) /* accessing PMINTENSET_EL1 */ - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val; + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val); else /* accessing PMINTENCLR_EL1 */ - __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val); } else { p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } @@ -1151,10 +1151,10 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask)); else /* accessing PMOVSCLR_EL0 */ - __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask)); } else { p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } @@ -4786,7 +4786,7 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) r->reset(vcpu, r); if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS) - (void)__vcpu_sys_reg(vcpu, r->reg); + __vcpu_rmw_sys_reg(vcpu, r->reg, |=, 0); } set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); -- cgit v1.2.3 From b61fae4ee120f337b8700dff43b2fd639f3bf6a5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:23 +0100 Subject: KVM: arm64: Don't use __vcpu_sys_reg() to get the address of a sysreg We are about to prevent the use of __vcpu_sys_reg() as a lvalue, and getting the address of a rvalue is not a thing. Update the couple of places where we do this to use the __ctxt_sys_reg() accessor, which return the address of a register. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-4-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arch_timer.c | 2 +- arch/arm64/kvm/fpsimd.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 5a67a6d4d95b..c6b4fb4c8e08 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1036,7 +1036,7 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu)) { struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset; - offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2); + offs->vcpu_offset = __ctxt_sys_reg(&vcpu->arch.ctxt, CNTVOFF_EL2); offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset; } diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 7f6e43d25691..8f6c8f57c6b9 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -103,8 +103,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) fp_state.sve_state = vcpu->arch.sve_state; fp_state.sve_vl = vcpu->arch.sve_max_vl; fp_state.sme_state = NULL; - fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR); - fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR); + fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR); + fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR); fp_state.fp_type = &vcpu->arch.fp_type; if (vcpu_has_sve(vcpu)) -- cgit v1.2.3 From b5fa1f91e11fdf74ad4e2ac6dae246a57cbd2d95 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 3 Jun 2025 08:08:24 +0100 Subject: KVM: arm64: Make __vcpu_sys_reg() a pure rvalue operand Now that we don't have any use of __vcpu_sys_reg() as a lvalue, remove the in-place update, and directly return the sanitised value. Reviewed-by: Miguel Luis Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250603070824.1192795-5-maz@kernel.org Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 8b8c649f225b..382b382d14da 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1130,13 +1130,13 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64); } while (0) #define __vcpu_sys_reg(v,r) \ - (*({ \ + ({ \ const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt; \ - u64 *__r = __ctxt_sys_reg(ctxt, (r)); \ + u64 __v = ctxt_sys_reg(ctxt, (r)); \ if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__) \ - *__r = kvm_vcpu_apply_reg_masks((v), (r), *__r);\ - __r; \ - })) + __v = kvm_vcpu_apply_reg_masks((v), (r), __v); \ + __v; \ + }) u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg); void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg); -- cgit v1.2.3 From f8693f6dffcd1865da7f5f4c3df1de445e519bec Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:25 +0800 Subject: riscv: ftrace: support fastcc in Clang for WITH_ARGS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some caller-saved registers which are not defined as function arguments in the ABI can still be passed as arguments when the kernel is compiled with Clang. As a result, we must save and restore those registers to prevent ftrace from clobbering them. - [1]: https://reviews.llvm.org/D68559 Reported-by: Evgenii Shatokhin Closes: https://lore.kernel.org/linux-riscv/7e7c7914-445d-426d-89a0-59a9199c45b1@yadro.com/ Fixes: 7caa9765465f ("ftrace: riscv: move from REGS to ARGS") Acked-by: Nathan Chancellor Reviewed-by: Björn Töpel Signed-off-by: Andy Chiu Tested-by: Björn Töpel Link: https://lore.kernel.org/r/20250407180838.42877-1-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/ftrace.h | 7 +++++++ arch/riscv/kernel/asm-offsets.c | 7 +++++++ arch/riscv/kernel/mcount-dyn.S | 16 ++++++++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index d627f63ee289..d8b2138bd9c6 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -146,6 +146,13 @@ struct __arch_ftrace_regs { unsigned long a5; unsigned long a6; unsigned long a7; +#ifdef CONFIG_CC_IS_CLANG + unsigned long t2; + unsigned long t3; + unsigned long t4; + unsigned long t5; + unsigned long t6; +#endif }; }; }; diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 16490755304e..7c43c8e26ae7 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -501,6 +501,13 @@ void asm_offsets(void) DEFINE(FREGS_SP, offsetof(struct __arch_ftrace_regs, sp)); DEFINE(FREGS_S0, offsetof(struct __arch_ftrace_regs, s0)); DEFINE(FREGS_T1, offsetof(struct __arch_ftrace_regs, t1)); +#ifdef CONFIG_CC_IS_CLANG + DEFINE(FREGS_T2, offsetof(struct __arch_ftrace_regs, t2)); + DEFINE(FREGS_T3, offsetof(struct __arch_ftrace_regs, t3)); + DEFINE(FREGS_T4, offsetof(struct __arch_ftrace_regs, t4)); + DEFINE(FREGS_T5, offsetof(struct __arch_ftrace_regs, t5)); + DEFINE(FREGS_T6, offsetof(struct __arch_ftrace_regs, t6)); +#endif DEFINE(FREGS_A0, offsetof(struct __arch_ftrace_regs, a0)); DEFINE(FREGS_A1, offsetof(struct __arch_ftrace_regs, a1)); DEFINE(FREGS_A2, offsetof(struct __arch_ftrace_regs, a2)); diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 745dd4c4a69c..e988bd26b28b 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -96,7 +96,13 @@ REG_S x8, FREGS_S0(sp) #endif REG_S x6, FREGS_T1(sp) - +#ifdef CONFIG_CC_IS_CLANG + REG_S x7, FREGS_T2(sp) + REG_S x28, FREGS_T3(sp) + REG_S x29, FREGS_T4(sp) + REG_S x30, FREGS_T5(sp) + REG_S x31, FREGS_T6(sp) +#endif // save the arguments REG_S x10, FREGS_A0(sp) REG_S x11, FREGS_A1(sp) @@ -115,7 +121,13 @@ REG_L x8, FREGS_S0(sp) #endif REG_L x6, FREGS_T1(sp) - +#ifdef CONFIG_CC_IS_CLANG + REG_L x7, FREGS_T2(sp) + REG_L x28, FREGS_T3(sp) + REG_L x29, FREGS_T4(sp) + REG_L x30, FREGS_T5(sp) + REG_L x31, FREGS_T6(sp) +#endif // restore the arguments REG_L x10, FREGS_A0(sp) REG_L x11, FREGS_A1(sp) -- cgit v1.2.3 From 54ecbc8d857122b32a98fe204cb61a06aabc063d Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:26 +0800 Subject: riscv: ftrace factor out code defined by !WITH_ARG DYNAMIC_FTRACE selects DYNAMIC_FTRACE_WITH_ARGS and mcount-dyn.S in riscv, so we can remove ifdef jargons of WITH_ARG when it is known that DYNAMIC_FTRACE is true. Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20250407180838.42877-2-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ftrace.c | 15 --------------- arch/riscv/kernel/mcount-dyn.S | 34 ---------------------------------- 2 files changed, 49 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 674dcdfae7a1..1fd10555c580 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -210,7 +210,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } #ifdef CONFIG_DYNAMIC_FTRACE -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs) { @@ -231,19 +230,5 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, if (!function_graph_enter_regs(old, ip, frame_pointer, parent, fregs)) *parent = return_hooker; } -#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ -extern void ftrace_graph_call(void); -int ftrace_enable_ftrace_graph_caller(void) -{ - return __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, true, true); -} - -int ftrace_disable_ftrace_graph_caller(void) -{ - return __ftrace_modify_call((unsigned long)&ftrace_graph_call, - (unsigned long)&prepare_ftrace_return, false, true); -} -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index e988bd26b28b..3f06b40bb6c8 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -56,8 +56,6 @@ addi sp, sp, ABI_SIZE_ON_STACK .endm -#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS - /** * SAVE_ABI_REGS - save regs against the ftrace_regs struct * @@ -149,36 +147,6 @@ mv a3, sp .endm -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ - -#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS -SYM_FUNC_START(ftrace_caller) - SAVE_ABI - - addi a0, t0, -FENTRY_RA_OFFSET - la a1, function_trace_op - REG_L a2, 0(a1) - mv a1, ra - mv a3, sp - -SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) - call ftrace_stub - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - addi a0, sp, ABI_RA - REG_L a1, ABI_T0(sp) - addi a1, a1, -FENTRY_RA_OFFSET -#ifdef HAVE_FUNCTION_GRAPH_FP_TEST - mv a2, s0 -#endif -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) - call ftrace_stub -#endif - RESTORE_ABI - jr t0 -SYM_FUNC_END(ftrace_caller) - -#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ SYM_FUNC_START(ftrace_caller) mv t1, zero SAVE_ABI_REGS @@ -194,8 +162,6 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) jr t1 SYM_FUNC_END(ftrace_caller) -#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ - #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS SYM_CODE_START(ftrace_stub_direct_tramp) jr t0 -- cgit v1.2.3 From c41bf4326c7b0af3f7004235ac91551833ec95c6 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:27 +0800 Subject: riscv: ftrace: align patchable functions to 4 Byte boundary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are changing ftrace code patching in order to remove dependency from stop_machine() and enable kernel preemption. This requires us to align functions entry at a 4-B align address. However, -falign-functions on older versions of GCC alone was not strong enoungh to align all functions. In fact, cold functions are not aligned after turning on optimizations. We consider this is a bug in GCC and turn off guess-branch-probility as a workaround to align all functions. GCC bug id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345 The option -fmin-function-alignment is able to align all functions properly on newer versions of gcc. So, we add a cc-option to test if the toolchain supports it. Suggested-by: Evgenii Shatokhin Signed-off-by: Andy Chiu Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20250407180838.42877-3-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bbec87b79309..7dbed10843d2 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -150,6 +150,7 @@ config RISCV select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) + select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC @@ -236,6 +237,7 @@ config CLANG_SUPPORTS_DYNAMIC_FTRACE config GCC_SUPPORTS_DYNAMIC_FTRACE def_bool CC_IS_GCC depends on $(cc-option,-fpatchable-function-entry=8) + depends on CC_HAS_MIN_FUNCTION_ALIGNMENT || !RISCV_ISA_C config HAVE_SHADOW_CALL_STACK def_bool $(cc-option,-fsanitize=shadow-call-stack) -- cgit v1.2.3 From b2137c3b6d7a45622967aac78b46a4bd2cff6c42 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:29 +0800 Subject: riscv: ftrace: prepare ftrace for atomic code patching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We use an AUIPC+JALR pair to jump into a ftrace trampoline. Since instruction fetch can break down to 4 byte at a time, it is impossible to update two instructions without a race. In order to mitigate it, we initialize the patchable entry to AUIPC + NOP4. Then, the run-time code patching can change NOP4 to JALR to eable/disable ftrcae from a function. This limits the reach of each ftrace entry to +-2KB displacing from ftrace_caller. Starting from the trampoline, we add a level of indirection for it to reach ftrace caller target. Now, it loads the target address from a memory location, then perform the jump. This enable the kernel to update the target atomically. The new don't-stop-the-world text patching on change only one RISC-V instruction: | -8: &ftrace_ops of the associated tracer function. | : | 0: auipc t0, hi(ftrace_caller) | 4: jalr t0, lo(ftrace_caller) | | -8: &ftrace_nop_ops | : | 0: auipc t0, hi(ftrace_caller) | 4: nop This means that f+0x0 is fixed, and should not be claimed by ftrace, e.g. kprobe should be able to put a probe in f+0x0. Thus, we adjust the offset and MCOUNT_INSN_SIZE accordingly. [ alex: Fix build errors with !CONFIG_DYNAMIC_FTRACE ] Co-developed-by: Björn Töpel Signed-off-by: Björn Töpel Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20250407180838.42877-5-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/ftrace.h | 49 +++++++------- arch/riscv/kernel/ftrace.c | 137 +++++++++++++++++++++------------------- arch/riscv/kernel/mcount-dyn.S | 9 ++- 3 files changed, 98 insertions(+), 97 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index d8b2138bd9c6..6a5c0a7fb826 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -20,10 +20,9 @@ extern void *return_address(unsigned int level); #define ftrace_return_address(n) return_address(n) void _mcount(void); -static inline unsigned long ftrace_call_adjust(unsigned long addr) -{ - return addr; -} +unsigned long ftrace_call_adjust(unsigned long addr); +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip); +#define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip) /* * Let's do like x86/arm64 and ignore the compat syscalls. @@ -57,12 +56,21 @@ struct dyn_arch_ftrace { * 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to * return address (original pc + 4) * + * The first 2 instructions for each tracable function is compiled to 2 nop + * instructions. Then, the kernel initializes the first instruction to auipc at + * boot time (). The second instruction is patched to jalr to + * start the trace. + * + *: + * 0: nop + * 4: nop + * *: - * 0: auipc t0/ra, 0x? - * 4: jalr t0/ra, ?(t0/ra) + * 0: auipc t0, 0x? + * 4: jalr t0, ?(t0) * *: - * 0: nop + * 0: auipc t0, 0x? * 4: nop * * Dynamic ftrace generates probes to call sites, so we must deal with @@ -75,10 +83,9 @@ struct dyn_arch_ftrace { #define AUIPC_OFFSET_MASK (0xfffff000) #define AUIPC_PAD (0x00001000) #define JALR_SHIFT 20 -#define JALR_RA (0x000080e7) -#define AUIPC_RA (0x00000097) #define JALR_T0 (0x000282e7) #define AUIPC_T0 (0x00000297) +#define JALR_RANGE (JALR_SIGN_MASK - 1) #define to_jalr_t0(offset) \ (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_T0) @@ -96,26 +103,14 @@ do { \ call[1] = to_jalr_t0(offset); \ } while (0) -#define to_jalr_ra(offset) \ - (((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_RA) - -#define to_auipc_ra(offset) \ - ((offset & JALR_SIGN_MASK) ? \ - (((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_RA) : \ - ((offset & AUIPC_OFFSET_MASK) | AUIPC_RA)) - -#define make_call_ra(caller, callee, call) \ -do { \ - unsigned int offset = \ - (unsigned long) (callee) - (unsigned long) (caller); \ - call[0] = to_auipc_ra(offset); \ - call[1] = to_jalr_ra(offset); \ -} while (0) - /* - * Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here. + * Only the jalr insn in the auipc+jalr is patched, so we make it 4 + * bytes here. */ -#define MCOUNT_INSN_SIZE 8 +#define MCOUNT_INSN_SIZE 4 +#define MCOUNT_AUIPC_SIZE 4 +#define MCOUNT_JALR_SIZE 4 +#define MCOUNT_NOP4_SIZE 4 #ifndef __ASSEMBLY__ struct dyn_ftrace; diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 1fd10555c580..ea04f09f9d4d 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -8,11 +8,22 @@ #include #include #include +#include #include #include #include #ifdef CONFIG_DYNAMIC_FTRACE +unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr + MCOUNT_AUIPC_SIZE; +} + +unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) +{ + return fentry_ip - MCOUNT_AUIPC_SIZE; +} + void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) { mutex_lock(&text_mutex); @@ -32,51 +43,32 @@ void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) mutex_unlock(&text_mutex); } -static int ftrace_check_current_call(unsigned long hook_pos, - unsigned int *expected) +static int __ftrace_modify_call(unsigned long source, unsigned long target, bool validate) { + unsigned int call[2], offset; unsigned int replaced[2]; - unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4}; - /* we expect nops at the hook position */ - if (!expected) - expected = nops; + offset = target - source; + call[1] = to_jalr_t0(offset); - /* - * Read the text we want to modify; - * return must be -EFAULT on read error - */ - if (copy_from_kernel_nofault(replaced, (void *)hook_pos, - MCOUNT_INSN_SIZE)) - return -EFAULT; - - /* - * Make sure it is what we expect it to be; - * return must be -EINVAL on failed comparison - */ - if (memcmp(expected, replaced, sizeof(replaced))) { - pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n", - (void *)hook_pos, expected[0], expected[1], replaced[0], - replaced[1]); - return -EINVAL; + if (validate) { + call[0] = to_auipc_t0(offset); + /* + * Read the text we want to modify; + * return must be -EFAULT on read error + */ + if (copy_from_kernel_nofault(replaced, (void *)source, 2 * MCOUNT_INSN_SIZE)) + return -EFAULT; + + if (replaced[0] != call[0]) { + pr_err("%p: expected (%08x) but got (%08x)\n", + (void *)source, call[0], replaced[0]); + return -EINVAL; + } } - return 0; -} - -static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, - bool enable, bool ra) -{ - unsigned int call[2]; - unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4}; - - if (ra) - make_call_ra(hook_pos, target, call); - else - make_call_t0(hook_pos, target, call); - - /* Replace the auipc-jalr pair at once. Return -EPERM on write error. */ - if (patch_insn_write((void *)hook_pos, enable ? call : nops, MCOUNT_INSN_SIZE)) + /* Replace the jalr at once. Return -EPERM on write error. */ + if (patch_insn_write((void *)(source + MCOUNT_AUIPC_SIZE), call + 1, MCOUNT_JALR_SIZE)) return -EPERM; return 0; @@ -84,22 +76,21 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target, int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { - unsigned int call[2]; - - make_call_t0(rec->ip, addr, call); + unsigned long distance, orig_addr, pc = rec->ip - MCOUNT_AUIPC_SIZE; - if (patch_insn_write((void *)rec->ip, call, MCOUNT_INSN_SIZE)) - return -EPERM; + orig_addr = (unsigned long)&ftrace_caller; + distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr; + if (distance > JALR_RANGE) + return -EINVAL; - return 0; + return __ftrace_modify_call(pc, addr, false); } -int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, - unsigned long addr) +int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { - unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4}; + u32 nop4 = RISCV_INSN_NOP4; - if (patch_insn_write((void *)rec->ip, nops, MCOUNT_INSN_SIZE)) + if (patch_insn_write((void *)rec->ip, &nop4, MCOUNT_NOP4_SIZE)) return -EPERM; return 0; @@ -114,21 +105,38 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, */ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) { - int out; + unsigned long pc = rec->ip - MCOUNT_AUIPC_SIZE; + unsigned int nops[2], offset; + int ret; + + offset = (unsigned long) &ftrace_caller - pc; + nops[0] = to_auipc_t0(offset); + nops[1] = RISCV_INSN_NOP4; mutex_lock(&text_mutex); - out = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + ret = patch_insn_write((void *)pc, nops, 2 * MCOUNT_INSN_SIZE); mutex_unlock(&text_mutex); - return out; + return ret; } +ftrace_func_t ftrace_call_dest = ftrace_stub; int ftrace_update_ftrace_func(ftrace_func_t func) { - int ret = __ftrace_modify_call((unsigned long)&ftrace_call, - (unsigned long)func, true, true); - - return ret; + WRITE_ONCE(ftrace_call_dest, func); + /* + * The data fence ensure that the update to ftrace_call_dest happens + * before the write to function_trace_op later in the generic ftrace. + * If the sequence is not enforced, then an old ftrace_call_dest may + * race loading a new function_trace_op set in ftrace_modify_all_code + * + * If we are in stop_machine, then we don't need to call remote fence + * as there is no concurrent read-side of ftrace_call_dest. + */ + smp_wmb(); + if (!irqs_disabled()) + smp_call_function(ftrace_sync_ipi, NULL, 1); + return 0; } struct ftrace_modify_param { @@ -166,23 +174,22 @@ void arch_ftrace_update_code(int command) stop_machine(__ftrace_modify_code, ¶m, cpu_online_mask); } -#endif +#else /* CONFIG_DYNAMIC_FTRACE */ +unsigned long ftrace_call_adjust(unsigned long addr) +{ + return addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { + unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE; unsigned int call[2]; - unsigned long caller = rec->ip; - int ret; make_call_t0(caller, old_addr, call); - ret = ftrace_check_current_call(caller, call); - - if (ret) - return ret; - - return __ftrace_modify_call(caller, addr, true, false); + return __ftrace_modify_call(caller, addr, true); } #endif diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 3f06b40bb6c8..8aa554d56096 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -13,7 +13,6 @@ .text -#define FENTRY_RA_OFFSET 8 #define ABI_SIZE_ON_STACK 80 #define ABI_A0 0 #define ABI_A1 8 @@ -62,8 +61,7 @@ * After the stack is established, * * 0(sp) stores the PC of the traced function which can be accessed -* by &(fregs)->epc in tracing function. Note that the real -* function entry address should be computed with -FENTRY_RA_OFFSET. +* by &(fregs)->epc in tracing function. * * 8(sp) stores the function return address (i.e. parent IP) that * can be accessed by &(fregs)->ra in tracing function. @@ -140,7 +138,7 @@ .endm .macro PREPARE_ARGS - addi a0, t0, -FENTRY_RA_OFFSET + addi a0, t0, -MCOUNT_JALR_SIZE // ip (callsite's jalr insn) la a1, function_trace_op REG_L a2, 0(a1) mv a1, ra @@ -153,7 +151,8 @@ SYM_FUNC_START(ftrace_caller) PREPARE_ARGS SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) - call ftrace_stub + REG_L ra, ftrace_call_dest + jalr ra, 0(ra) RESTORE_ABI_REGS bnez t1, .Ldirect -- cgit v1.2.3 From 5aa4ef95588456df5a5563dd23892827f97fb14f Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:30 +0800 Subject: riscv: ftrace: do not use stop_machine to update code Now it is safe to remove dependency from stop_machine() for us to patch code in ftrace. Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20250407180838.42877-6-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/ftrace.c | 64 ++++++++-------------------------------------- 1 file changed, 10 insertions(+), 54 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index ea04f09f9d4d..b133c60808fe 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -24,23 +24,13 @@ unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip) return fentry_ip - MCOUNT_AUIPC_SIZE; } -void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) +void arch_ftrace_update_code(int command) { mutex_lock(&text_mutex); - - /* - * The code sequences we use for ftrace can't be patched while the - * kernel is running, so we need to use stop_machine() to modify them - * for now. This doesn't play nice with text_mutex, we use this flag - * to elide the check. - */ - riscv_patch_in_stop_machine = true; -} - -void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex) -{ - riscv_patch_in_stop_machine = false; + command |= FTRACE_MAY_SLEEP; + ftrace_modify_all_code(command); mutex_unlock(&text_mutex); + flush_icache_all(); } static int __ftrace_modify_call(unsigned long source, unsigned long target, bool validate) @@ -129,51 +119,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func) * before the write to function_trace_op later in the generic ftrace. * If the sequence is not enforced, then an old ftrace_call_dest may * race loading a new function_trace_op set in ftrace_modify_all_code - * - * If we are in stop_machine, then we don't need to call remote fence - * as there is no concurrent read-side of ftrace_call_dest. */ smp_wmb(); - if (!irqs_disabled()) - smp_call_function(ftrace_sync_ipi, NULL, 1); - return 0; -} - -struct ftrace_modify_param { - int command; - atomic_t cpu_count; -}; - -static int __ftrace_modify_code(void *data) -{ - struct ftrace_modify_param *param = data; - - if (atomic_inc_return(¶m->cpu_count) == num_online_cpus()) { - ftrace_modify_all_code(param->command); - /* - * Make sure the patching store is effective *before* we - * increment the counter which releases all waiting CPUs - * by using the release variant of atomic increment. The - * release pairs with the call to local_flush_icache_all() - * on the waiting CPU. - */ - atomic_inc_return_release(¶m->cpu_count); - } else { - while (atomic_read(¶m->cpu_count) <= num_online_cpus()) - cpu_relax(); - - local_flush_icache_all(); - } - + /* + * Updating ftrace dpes not take stop_machine path, so irqs should not + * be disabled. + */ + WARN_ON(irqs_disabled()); + smp_call_function(ftrace_sync_ipi, NULL, 1); return 0; } -void arch_ftrace_update_code(int command) -{ - struct ftrace_modify_param param = { command, ATOMIC_INIT(0) }; - - stop_machine(__ftrace_modify_code, ¶m, cpu_online_mask); -} #else /* CONFIG_DYNAMIC_FTRACE */ unsigned long ftrace_call_adjust(unsigned long addr) { -- cgit v1.2.3 From d1049fc0de81bca3abbb35e8d4b8794170498b78 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:31 +0800 Subject: riscv: vector: Support calling schedule() for preemptible Vector Each function entry implies a call to ftrace infrastructure. And it may call into schedule in some cases. So, it is possible for preemptible kernel-mode Vector to implicitly call into schedule. Since all V-regs are caller-saved, it is possible to drop all V context when a thread voluntarily call schedule(). Besides, we currently don't pass argument through vector register, so we don't have to save/restore V-regs in ftrace trampoline. Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20250407180838.42877-7-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 5 +++++ arch/riscv/include/asm/vector.h | 22 +++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 5f56eb9d114a..9c1cc716b891 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -79,6 +79,10 @@ struct pt_regs; * Thus, the task does not own preempt_v. Any use of Vector will have to * save preempt_v, if dirty, and fallback to non-preemptible kernel-mode * Vector. + * - bit 29: The thread voluntarily calls schedule() while holding an active + * preempt_v. All preempt_v context should be dropped in such case because + * V-regs are caller-saved. Only sstatus.VS=ON is persisted across a + * schedule() call. * - bit 30: The in-kernel preempt_v context is saved, and requries to be * restored when returning to the context that owns the preempt_v. * - bit 31: The in-kernel preempt_v context is dirty, as signaled by the @@ -93,6 +97,7 @@ struct pt_regs; #define RISCV_PREEMPT_V 0x00000100 #define RISCV_PREEMPT_V_DIRTY 0x80000000 #define RISCV_PREEMPT_V_NEED_RESTORE 0x40000000 +#define RISCV_PREEMPT_V_IN_SCHEDULE 0x20000000 /* CPU-specific state of a task */ struct thread_struct { diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h index e8a83f55be2b..45c9b426fcc5 100644 --- a/arch/riscv/include/asm/vector.h +++ b/arch/riscv/include/asm/vector.h @@ -120,6 +120,11 @@ static __always_inline void riscv_v_disable(void) csr_clear(CSR_SSTATUS, SR_VS); } +static __always_inline bool riscv_v_is_on(void) +{ + return !!(csr_read(CSR_SSTATUS) & SR_VS); +} + static __always_inline void __vstate_csr_save(struct __riscv_v_ext_state *dest) { asm volatile ( @@ -366,6 +371,11 @@ static inline void __switch_to_vector(struct task_struct *prev, struct pt_regs *regs; if (riscv_preempt_v_started(prev)) { + if (riscv_v_is_on()) { + WARN_ON(prev->thread.riscv_v_flags & RISCV_V_CTX_DEPTH_MASK); + riscv_v_disable(); + prev->thread.riscv_v_flags |= RISCV_PREEMPT_V_IN_SCHEDULE; + } if (riscv_preempt_v_dirty(prev)) { __riscv_v_vstate_save(&prev->thread.kernel_vstate, prev->thread.kernel_vstate.datap); @@ -376,10 +386,16 @@ static inline void __switch_to_vector(struct task_struct *prev, riscv_v_vstate_save(&prev->thread.vstate, regs); } - if (riscv_preempt_v_started(next)) - riscv_preempt_v_set_restore(next); - else + if (riscv_preempt_v_started(next)) { + if (next->thread.riscv_v_flags & RISCV_PREEMPT_V_IN_SCHEDULE) { + next->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_IN_SCHEDULE; + riscv_v_enable(); + } else { + riscv_preempt_v_set_restore(next); + } + } else { riscv_v_vstate_set_restore(next, task_pt_regs(next)); + } } void riscv_v_vstate_ctrl_init(struct task_struct *tsk); -- cgit v1.2.3 From ca358692de41b273468e625f96926fa53e13bd8c Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:32 +0800 Subject: riscv: add a data fence for CMODX in the kernel mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RISC-V spec explicitly calls out that a local fence.i is not enough for the code modification to be visble from a remote hart. In fact, it states: To make a store to instruction memory visible to all RISC-V harts, the writing hart also has to execute a data FENCE before requesting that all remote RISC-V harts execute a FENCE.I. Although current riscv drivers for IPI use ordered MMIO when sending IPIs in order to synchronize the action between previous csd writes, riscv does not restrict itself to any particular flavor of IPI. Any driver or firmware implementation that does not order data writes before the IPI may pose a risk for code-modifying race. Thus, add a fence here to order data writes before making the IPI. Signed-off-by: Andy Chiu Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20250407180838.42877-8-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/cacheflush.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index b81672729887..b2e4b81763f8 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -24,7 +24,20 @@ void flush_icache_all(void) if (num_online_cpus() < 2) return; - else if (riscv_use_sbi_for_rfence()) + + /* + * Make sure all previous writes to the D$ are ordered before making + * the IPI. The RISC-V spec states that a hart must execute a data fence + * before triggering a remote fence.i in order to make the modification + * visable for remote harts. + * + * IPIs on RISC-V are triggered by MMIO writes to either CLINT or + * S-IMSIC, so the fence ensures previous data writes "happen before" + * the MMIO. + */ + RISCV_FENCE(w, o); + + if (riscv_use_sbi_for_rfence()) sbi_remote_fence_i(NULL); else on_each_cpu(ipi_remote_fence_i, NULL, 1); -- cgit v1.2.3 From d0262e907e2991ae09ca476281fc8cae3ec57850 Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:33 +0800 Subject: riscv: ftrace: support PREEMPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now, we can safely enable dynamic ftrace with kernel preemption. Signed-off-by: Andy Chiu Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20250407180838.42877-9-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7dbed10843d2..dc0fc11b6e96 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -157,7 +157,7 @@ config RISCV select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS select HAVE_FUNCTION_GRAPH_FREGS - select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION + select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_EBPF_JIT if MMU select HAVE_GUP_FAST if MMU select HAVE_FUNCTION_ARG_ACCESS_API -- cgit v1.2.3 From c217157bcd1df434fdeaeb24b7c25ec31e15cf4a Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 8 Apr 2025 02:08:34 +0800 Subject: riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V. This allows each ftrace callsite to provide an ftrace_ops to the common ftrace trampoline, allowing each callsite to invoke distinct tracer functions without the need to fall back to list processing or to allocate custom trampolines for each callsite. This significantly speeds up cases where multiple distinct trace functions are used and callsites are mostly traced by a single tracer. The idea and most of the implementation is taken from the ARM64's implementation of the same feature. The idea is to place a pointer to the ftrace_ops as a literal at a fixed offset from the function entry point, which can be recovered by the common ftrace trampoline. We use -fpatchable-function-entry to reserve 8 bytes above the function entry by emitting 2 4 byte or 4 2 byte nops depending on the presence of CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer to the associated ftrace_ops for that callsite. Functions are aligned to 8 bytes to make sure that the accesses to this literal are atomic. This approach allows for directly invoking ftrace_ops::func even for ftrace_ops which are dynamically-allocated (or part of a module), without going via ftrace_ops_list_func. We've benchamrked this with the ftrace_ops sample module on Spacemit K1 Jupiter: Without this patch: baseline (Linux rivos 6.14.0-09584-g7d06015d936c #3 SMP Sat Mar 29 +-----------------------+-----------------+----------------------------+ | Number of tracers | Total time (ns) | Per-call average time | |-----------------------+-----------------+----------------------------| | Relevant | Irrelevant | 100000 calls | Total (ns) | Overhead (ns) | |----------+------------+-----------------+------------+---------------| | 0 | 0 | 1357958 | 13 | - | | 0 | 1 | 1302375 | 13 | - | | 0 | 2 | 1302375 | 13 | - | | 0 | 10 | 1379084 | 13 | - | | 0 | 100 | 1302458 | 13 | - | | 0 | 200 | 1302333 | 13 | - | |----------+------------+-----------------+------------+---------------| | 1 | 0 | 13677833 | 136 | 123 | | 1 | 1 | 18500916 | 185 | 172 | | 1 | 2 | 22856459 | 228 | 215 | | 1 | 10 | 58824709 | 588 | 575 | | 1 | 100 | 505141584 | 5051 | 5038 | | 1 | 200 | 1580473126 | 15804 | 15791 | |----------+------------+-----------------+------------+---------------| | 1 | 0 | 13561000 | 135 | 122 | | 2 | 0 | 19707292 | 197 | 184 | | 10 | 0 | 67774750 | 677 | 664 | | 100 | 0 | 714123125 | 7141 | 7128 | | 200 | 0 | 1918065668 | 19180 | 19167 | +----------+------------+-----------------+------------+---------------+ Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers. With this patch: v4-rc4 (Linux rivos 6.14.0-09598-gd75747611c93 #4 SMP Sat Mar 29 +-----------------------+-----------------+----------------------------+ | Number of tracers | Total time (ns) | Per-call average time | |-----------------------+-----------------+----------------------------| | Relevant | Irrelevant | 100000 calls | Total (ns) | Overhead (ns) | |----------+------------+-----------------+------------+---------------| | 0 | 0 | 1459917 | 14 | - | | 0 | 1 | 1408000 | 14 | - | | 0 | 2 | 1383792 | 13 | - | | 0 | 10 | 1430709 | 14 | - | | 0 | 100 | 1383791 | 13 | - | | 0 | 200 | 1383750 | 13 | - | |----------+------------+-----------------+------------+---------------| | 1 | 0 | 5238041 | 52 | 38 | | 1 | 1 | 5228542 | 52 | 38 | | 1 | 2 | 5325917 | 53 | 40 | | 1 | 10 | 5299667 | 52 | 38 | | 1 | 100 | 5245250 | 52 | 39 | | 1 | 200 | 5238459 | 52 | 39 | |----------+------------+-----------------+------------+---------------| | 1 | 0 | 5239083 | 52 | 38 | | 2 | 0 | 19449417 | 194 | 181 | | 10 | 0 | 67718584 | 677 | 663 | | 100 | 0 | 709840708 | 7098 | 7085 | | 200 | 0 | 2203580626 | 22035 | 22022 | +----------+------------+-----------------+------------+---------------+ Note: per-call overhead is estimated relative to the baseline case with 0 relevant tracers and 0 irrelevant tracers. As can be seen from the above: a) Whenever there is a single relevant tracer function associated with a tracee, the overhead of invoking the tracer is constant, and does not scale with the number of tracers which are *not* associated with that tracee. b) The overhead for a single relevant tracer has dropped to ~1/3 of the overhead prior to this series (from 122ns to 38ns). This is largely due to permitting calls to dynamically-allocated ftrace_ops without going through ftrace_ops_list_func. Signed-off-by: Puranjay Mohan [update kconfig, asm, refactor] Signed-off-by: Andy Chiu Tested-by: Björn Töpel Link: https://lore.kernel.org/r/20250407180838.42877-10-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 ++ arch/riscv/Makefile | 4 +-- arch/riscv/kernel/asm-offsets.c | 3 ++ arch/riscv/kernel/ftrace.c | 67 +++++++++++++++++++++++++++++++++++++++++ arch/riscv/kernel/mcount-dyn.S | 35 ++++++++++++++++++--- 5 files changed, 105 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index dc0fc11b6e96..ec986c9120e3 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -99,6 +99,7 @@ config RISCV select EDAC_SUPPORT select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE + select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS select GENERIC_ARCH_TOPOLOGY select GENERIC_ATOMIC64 if !64BIT select GENERIC_CLOCKEVENTS_BROADCAST if SMP @@ -152,6 +153,7 @@ config RISCV select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 539d2aef5cab..df57654a615e 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -15,9 +15,9 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE),y) LDFLAGS_vmlinux += --no-relax KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY ifeq ($(CONFIG_RISCV_ISA_C),y) - CC_FLAGS_FTRACE := -fpatchable-function-entry=4 + CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4 else - CC_FLAGS_FTRACE := -fpatchable-function-entry=2 + CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2 endif endif diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 7c43c8e26ae7..2d96197a8abf 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -493,6 +493,9 @@ void asm_offsets(void) DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN)); OFFSET(STACKFRAME_FP, stackframe, fp); OFFSET(STACKFRAME_RA, stackframe, ra); +#ifdef CONFIG_FUNCTION_TRACER + DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct __arch_ftrace_regs), STACK_ALIGN)); diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index b133c60808fe..d56fc6e9fba0 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -16,6 +16,9 @@ #ifdef CONFIG_DYNAMIC_FTRACE unsigned long ftrace_call_adjust(unsigned long addr) { + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return addr + 8; + return addr + MCOUNT_AUIPC_SIZE; } @@ -64,9 +67,52 @@ static int __ftrace_modify_call(unsigned long source, unsigned long target, bool return 0; } +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS +static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec) +{ + const struct ftrace_ops *ops = NULL; + + if (rec->flags & FTRACE_FL_CALL_OPS_EN) { + ops = ftrace_find_unique_ops(rec); + WARN_ON_ONCE(!ops); + } + + if (!ops) + ops = &ftrace_list_ops; + + return ops; +} + +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, + const struct ftrace_ops *ops) +{ + unsigned long literal = rec->ip - 8; + + return patch_text_nosync((void *)literal, &ops, sizeof(ops)); +} + +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, &ftrace_nop_ops); +} + +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) +{ + return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec)); +} +#else +static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; } +static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; } +#endif + int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) { unsigned long distance, orig_addr, pc = rec->ip - MCOUNT_AUIPC_SIZE; + int ret; + + ret = ftrace_rec_update_ops(rec); + if (ret) + return ret; orig_addr = (unsigned long)&ftrace_caller; distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr; @@ -79,6 +125,11 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { u32 nop4 = RISCV_INSN_NOP4; + int ret; + + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; if (patch_insn_write((void *)rec->ip, &nop4, MCOUNT_NOP4_SIZE)) return -EPERM; @@ -99,6 +150,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) unsigned int nops[2], offset; int ret; + ret = ftrace_rec_set_nop_ops(rec); + if (ret) + return ret; + offset = (unsigned long) &ftrace_caller - pc; nops[0] = to_auipc_t0(offset); nops[1] = RISCV_INSN_NOP4; @@ -113,6 +168,13 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) ftrace_func_t ftrace_call_dest = ftrace_stub; int ftrace_update_ftrace_func(ftrace_func_t func) { + /* + * When using CALL_OPS, the function to call is associated with the + * call site, and we don't have a global function pointer to update. + */ + if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) + return 0; + WRITE_ONCE(ftrace_call_dest, func); /* * The data fence ensure that the update to ftrace_call_dest happens @@ -143,8 +205,13 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, { unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE; unsigned int call[2]; + int ret; make_call_t0(caller, old_addr, call); + ret = ftrace_rec_update_ops(rec); + if (ret) + return ret; + return __ftrace_modify_call(caller, addr, true); } #endif diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 8aa554d56096..699684eea7f0 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -139,10 +139,34 @@ .macro PREPARE_ARGS addi a0, t0, -MCOUNT_JALR_SIZE // ip (callsite's jalr insn) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + /* + * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the + * function entry, these are later overwritten with the pointer to the + * associated struct ftrace_ops. + * + * -8: &ftrace_ops of the associated tracer function. + *: + * 0: auipc t0/ra, 0x? + * 4: jalr t0/ra, ?(t0/ra) + * + * -8: &ftrace_nop_ops + *: + * 0: nop + * 4: nop + * + * t0 is set to ip+8 after the jalr is executed at the callsite, + * so we find the associated op at t0-16. + */ + mv a1, ra // parent_ip + REG_L a2, -16(t0) // op + REG_L ra, FTRACE_OPS_FUNC(a2) // op->func +#else la a1, function_trace_op - REG_L a2, 0(a1) - mv a1, ra - mv a3, sp + REG_L a2, 0(a1) // op + mv a1, ra // parent_ip +#endif + mv a3, sp // regs .endm SYM_FUNC_START(ftrace_caller) @@ -150,10 +174,13 @@ SYM_FUNC_START(ftrace_caller) SAVE_ABI_REGS PREPARE_ARGS +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + jalr ra +#else SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) REG_L ra, ftrace_call_dest jalr ra, 0(ra) - +#endif RESTORE_ABI_REGS bnez t1, .Ldirect jr t0 -- cgit v1.2.3 From b21cdb9523e5561b97fd534dbb75d132c5c938ff Mon Sep 17 00:00:00 2001 From: Andy Chiu Date: Tue, 8 Apr 2025 02:08:35 +0800 Subject: riscv: ftrace: support direct call using call_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit jump to FTRACE_ADDR if distance is out of reach Co-developed-by: Björn Töpel Signed-off-by: Björn Töpel Signed-off-by: Andy Chiu Link: https://lore.kernel.org/r/20250407180838.42877-11-andybnac@gmail.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- arch/riscv/include/asm/ftrace.h | 6 +++++ arch/riscv/kernel/asm-offsets.c | 3 +++ arch/riscv/kernel/ftrace.c | 13 ++++------- arch/riscv/kernel/mcount-dyn.S | 51 ++++++++++++++++++++++++++--------------- 5 files changed, 48 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index ec986c9120e3..8fdca6345fa3 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -152,7 +152,7 @@ config RISCV select HAVE_DMA_CONTIGUOUS if MMU select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE) select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C - select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG) select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_GRAPH_FUNC diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h index 6a5c0a7fb826..22ebea3c2b26 100644 --- a/arch/riscv/include/asm/ftrace.h +++ b/arch/riscv/include/asm/ftrace.h @@ -130,6 +130,9 @@ struct __arch_ftrace_regs { unsigned long sp; unsigned long s0; unsigned long t1; +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + unsigned long direct_tramp; +#endif union { unsigned long args[8]; struct { @@ -223,10 +226,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct ftrace_regs *fregs); #define ftrace_graph_func ftrace_graph_func +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { arch_ftrace_regs(fregs)->t1 = addr; } +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ + #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 2d96197a8abf..b26334075697 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -495,6 +495,9 @@ void asm_offsets(void) OFFSET(STACKFRAME_RA, stackframe, ra); #ifdef CONFIG_FUNCTION_TRACER DEFINE(FTRACE_OPS_FUNC, offsetof(struct ftrace_ops, func)); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + DEFINE(FTRACE_OPS_DIRECT_CALL, offsetof(struct ftrace_ops, direct_call)); +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index d56fc6e9fba0..4c6c24380cfd 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -17,7 +17,7 @@ unsigned long ftrace_call_adjust(unsigned long addr) { if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS)) - return addr + 8; + return addr + 8 + MCOUNT_AUIPC_SIZE; return addr + MCOUNT_AUIPC_SIZE; } @@ -83,10 +83,9 @@ static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec) return ops; } -static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, - const struct ftrace_ops *ops) +static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, const struct ftrace_ops *ops) { - unsigned long literal = rec->ip - 8; + unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8); return patch_text_nosync((void *)literal, &ops, sizeof(ops)); } @@ -117,7 +116,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) orig_addr = (unsigned long)&ftrace_caller; distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr; if (distance > JALR_RANGE) - return -EINVAL; + addr = FTRACE_ADDR; return __ftrace_modify_call(pc, addr, false); } @@ -204,15 +203,13 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, unsigned long addr) { unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE; - unsigned int call[2]; int ret; - make_call_t0(caller, old_addr, call); ret = ftrace_rec_update_ops(rec); if (ret) return ret; - return __ftrace_modify_call(caller, addr, true); + return __ftrace_modify_call(caller, FTRACE_ADDR, true); } #endif diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S index 699684eea7f0..48f6c4f7dca0 100644 --- a/arch/riscv/kernel/mcount-dyn.S +++ b/arch/riscv/kernel/mcount-dyn.S @@ -82,12 +82,9 @@ * +++++++++ **/ .macro SAVE_ABI_REGS - mv t4, sp // Save original SP in T4 addi sp, sp, -FREGS_SIZE_ON_STACK - REG_S t0, FREGS_EPC(sp) REG_S x1, FREGS_RA(sp) - REG_S t4, FREGS_SP(sp) // Put original SP on stack #ifdef HAVE_FUNCTION_GRAPH_FP_TEST REG_S x8, FREGS_S0(sp) #endif @@ -108,9 +105,12 @@ REG_S x15, FREGS_A5(sp) REG_S x16, FREGS_A6(sp) REG_S x17, FREGS_A7(sp) + mv a0, sp + addi a0, a0, FREGS_SIZE_ON_STACK + REG_S a0, FREGS_SP(sp) // Put original SP on stack .endm - .macro RESTORE_ABI_REGS, all=0 + .macro RESTORE_ABI_REGS REG_L t0, FREGS_EPC(sp) REG_L x1, FREGS_RA(sp) #ifdef HAVE_FUNCTION_GRAPH_FP_TEST @@ -139,6 +139,19 @@ .macro PREPARE_ARGS addi a0, t0, -MCOUNT_JALR_SIZE // ip (callsite's jalr insn) +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS + mv a1, ra // parent_ip + REG_L a2, -16(t0) // op + REG_L ra, FTRACE_OPS_FUNC(a2) // op->func +#else + la a1, function_trace_op + REG_L a2, 0(a1) // op + mv a1, ra // parent_ip +#endif + mv a3, sp // regs + .endm + +SYM_FUNC_START(ftrace_caller) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS /* * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the @@ -158,19 +171,17 @@ * t0 is set to ip+8 after the jalr is executed at the callsite, * so we find the associated op at t0-16. */ - mv a1, ra // parent_ip - REG_L a2, -16(t0) // op - REG_L ra, FTRACE_OPS_FUNC(a2) // op->func -#else - la a1, function_trace_op - REG_L a2, 0(a1) // op - mv a1, ra // parent_ip -#endif - mv a3, sp // regs - .endm + REG_L t1, -16(t0) // op Should be SZ_REG instead of 16 -SYM_FUNC_START(ftrace_caller) - mv t1, zero +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + /* + * If the op has a direct call, handle it immediately without + * saving/restoring registers. + */ + REG_L t1, FTRACE_OPS_DIRECT_CALL(t1) + bnez t1, ftrace_caller_direct +#endif +#endif SAVE_ABI_REGS PREPARE_ARGS @@ -182,10 +193,14 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) jalr ra, 0(ra) #endif RESTORE_ABI_REGS - bnez t1, .Ldirect +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS + bnez t1, ftrace_caller_direct +#endif jr t0 -.Ldirect: +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS +SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL) jr t1 +#endif SYM_FUNC_END(ftrace_caller) #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS -- cgit v1.2.3 From 1df45f8a9fea5a7513bd1bad98604ce1fbefcaaf Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Wed, 9 Apr 2025 21:29:58 +0200 Subject: riscv: kexec_file: Split the loading of kernel and others MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the preparative patch for kexec_file_load Image support. It separates the elf_kexec_load() as two parts: - the first part loads the vmlinux (or Image) - the second part loads other segments (e.g. initrd,fdt,purgatory) And the second part is exported as the load_extra_segments() function which would be used in both kexec-elf.c and kexec-image.c. No functional change intended. Signed-off-by: Song Shuai Signed-off-by: Björn Töpel Link: https://lore.kernel.org/r/20250409193004.643839-2-bjorn@kernel.org Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/kexec.h | 5 + arch/riscv/kernel/Makefile | 2 +- arch/riscv/kernel/elf_kexec.c | 485 --------------------------------- arch/riscv/kernel/kexec_elf.c | 144 ++++++++++ arch/riscv/kernel/machine_kexec_file.c | 360 ++++++++++++++++++++++++ 5 files changed, 510 insertions(+), 486 deletions(-) delete mode 100644 arch/riscv/kernel/elf_kexec.c create mode 100644 arch/riscv/kernel/kexec_elf.c (limited to 'arch') diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h index 2b56769cb530..518825fe4160 100644 --- a/arch/riscv/include/asm/kexec.h +++ b/arch/riscv/include/asm/kexec.h @@ -67,6 +67,11 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, struct kimage; int arch_kimage_file_post_load_cleanup(struct kimage *image); #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup + +int load_extra_segments(struct kimage *image, unsigned long kernel_start, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); #endif #endif diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 8d186bfced45..d56305c8e631 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -107,7 +107,7 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o -obj-$(CONFIG_KEXEC_FILE) += elf_kexec.o machine_kexec_file.o +obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c deleted file mode 100644 index e783a72d051f..000000000000 --- a/arch/riscv/kernel/elf_kexec.c +++ /dev/null @@ -1,485 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Load ELF vmlinux file for the kexec_file_load syscall. - * - * Copyright (C) 2021 Huawei Technologies Co, Ltd. - * - * Author: Liao Chang (liaochang1@huawei.com) - * - * Based on kexec-tools' kexec-elf-riscv.c, heavily modified - * for kernel. - */ - -#define pr_fmt(fmt) "kexec_image: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int arch_kimage_file_post_load_cleanup(struct kimage *image) -{ - kvfree(image->arch.fdt); - image->arch.fdt = NULL; - - vfree(image->elf_headers); - image->elf_headers = NULL; - image->elf_headers_sz = 0; - - return kexec_image_post_load_cleanup_default(image); -} - -static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, - struct kexec_elf_info *elf_info, unsigned long old_pbase, - unsigned long new_pbase) -{ - int i; - int ret = 0; - size_t size; - struct kexec_buf kbuf; - const struct elf_phdr *phdr; - - kbuf.image = image; - - for (i = 0; i < ehdr->e_phnum; i++) { - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - size = phdr->p_filesz; - if (size > phdr->p_memsz) - size = phdr->p_memsz; - - kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; - kbuf.bufsz = size; - kbuf.buf_align = phdr->p_align; - kbuf.mem = phdr->p_paddr - old_pbase + new_pbase; - kbuf.memsz = phdr->p_memsz; - kbuf.top_down = false; - ret = kexec_add_buffer(&kbuf); - if (ret) - break; - } - - return ret; -} - -/* - * Go through the available phsyical memory regions and find one that hold - * an image of the specified size. - */ -static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, - struct elfhdr *ehdr, struct kexec_elf_info *elf_info, - unsigned long *old_pbase, unsigned long *new_pbase) -{ - int i; - int ret; - struct kexec_buf kbuf; - const struct elf_phdr *phdr; - unsigned long lowest_paddr = ULONG_MAX; - unsigned long lowest_vaddr = ULONG_MAX; - - for (i = 0; i < ehdr->e_phnum; i++) { - phdr = &elf_info->proghdrs[i]; - if (phdr->p_type != PT_LOAD) - continue; - - if (lowest_paddr > phdr->p_paddr) - lowest_paddr = phdr->p_paddr; - - if (lowest_vaddr > phdr->p_vaddr) - lowest_vaddr = phdr->p_vaddr; - } - - kbuf.image = image; - kbuf.buf_min = lowest_paddr; - kbuf.buf_max = ULONG_MAX; - - /* - * Current riscv boot protocol requires 2MB alignment for - * RV64 and 4MB alignment for RV32 - * - */ - kbuf.buf_align = PMD_SIZE; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); - kbuf.top_down = false; - ret = arch_kexec_locate_mem_hole(&kbuf); - if (!ret) { - *old_pbase = lowest_paddr; - *new_pbase = kbuf.mem; - image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem; - } - return ret; -} - -#ifdef CONFIG_CRASH_DUMP -static int get_nr_ram_ranges_callback(struct resource *res, void *arg) -{ - unsigned int *nr_ranges = arg; - - (*nr_ranges)++; - return 0; -} - -static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) -{ - struct crash_mem *cmem = arg; - - cmem->ranges[cmem->nr_ranges].start = res->start; - cmem->ranges[cmem->nr_ranges].end = res->end; - cmem->nr_ranges++; - - return 0; -} - -static int prepare_elf_headers(void **addr, unsigned long *sz) -{ - struct crash_mem *cmem; - unsigned int nr_ranges; - int ret; - - nr_ranges = 1; /* For exclusion of crashkernel region */ - walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); - - cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); - if (!cmem) - return -ENOMEM; - - cmem->max_nr_ranges = nr_ranges; - cmem->nr_ranges = 0; - ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); - if (ret) - goto out; - - /* Exclude crashkernel region */ - ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); - if (!ret) - ret = crash_prepare_elf64_headers(cmem, true, addr, sz); - -out: - kfree(cmem); - return ret; -} - -static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, - unsigned long cmdline_len) -{ - int elfcorehdr_strlen; - char *cmdline_ptr; - - cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL); - if (!cmdline_ptr) - return NULL; - - elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", - image->elf_load_addr); - - if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) { - pr_err("Appending elfcorehdr= exceeds cmdline size\n"); - kfree(cmdline_ptr); - return NULL; - } - - memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len); - /* Ensure it's nul terminated */ - cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; - return cmdline_ptr; -} -#endif - -static void *elf_kexec_load(struct kimage *image, char *kernel_buf, - unsigned long kernel_len, char *initrd, - unsigned long initrd_len, char *cmdline, - unsigned long cmdline_len) -{ - int ret; - void *fdt; - unsigned long old_kernel_pbase = ULONG_MAX; - unsigned long new_kernel_pbase = 0UL; - unsigned long initrd_pbase = 0UL; - unsigned long kernel_start; - struct elfhdr ehdr; - struct kexec_buf kbuf; - struct kexec_elf_info elf_info; - char *modified_cmdline = NULL; - - ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info); - if (ret) - return ERR_PTR(ret); - - ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info, - &old_kernel_pbase, &new_kernel_pbase); - if (ret) - goto out; - kernel_start = image->start; - - /* Add the kernel binary to the image */ - ret = riscv_kexec_elf_load(image, &ehdr, &elf_info, - old_kernel_pbase, new_kernel_pbase); - if (ret) - goto out; - - kbuf.image = image; - kbuf.buf_min = new_kernel_pbase + kernel_len; - kbuf.buf_max = ULONG_MAX; - -#ifdef CONFIG_CRASH_DUMP - /* Add elfcorehdr */ - if (image->type == KEXEC_TYPE_CRASH) { - void *headers; - unsigned long headers_sz; - ret = prepare_elf_headers(&headers, &headers_sz); - if (ret) { - pr_err("Preparing elf core header failed\n"); - goto out; - } - - kbuf.buffer = headers; - kbuf.bufsz = headers_sz; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.memsz = headers_sz; - kbuf.buf_align = ELF_CORE_HEADER_ALIGN; - kbuf.top_down = true; - - ret = kexec_add_buffer(&kbuf); - if (ret) { - vfree(headers); - goto out; - } - image->elf_headers = headers; - image->elf_load_addr = kbuf.mem; - image->elf_headers_sz = headers_sz; - - kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->elf_load_addr, kbuf.bufsz, kbuf.memsz); - - /* Setup cmdline for kdump kernel case */ - modified_cmdline = setup_kdump_cmdline(image, cmdline, - cmdline_len); - if (!modified_cmdline) { - pr_err("Setting up cmdline for kdump kernel failed\n"); - ret = -EINVAL; - goto out; - } - cmdline = modified_cmdline; - } -#endif - -#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY - /* Add purgatory to the image */ - kbuf.top_down = true; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_load_purgatory(image, &kbuf); - if (ret) { - pr_err("Error loading purgatory ret=%d\n", ret); - goto out; - } - kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem); - - ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry", - &kernel_start, - sizeof(kernel_start), 0); - if (ret) - pr_err("Error update purgatory ret=%d\n", ret); -#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */ - - /* Add the initrd to the image */ - if (initrd != NULL) { - kbuf.buffer = initrd; - kbuf.bufsz = kbuf.memsz = initrd_len; - kbuf.buf_align = PAGE_SIZE; - kbuf.top_down = true; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - ret = kexec_add_buffer(&kbuf); - if (ret) - goto out; - initrd_pbase = kbuf.mem; - kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase); - } - - /* Add the DTB to the image */ - fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase, - initrd_len, cmdline, 0); - if (!fdt) { - pr_err("Error setting up the new device tree.\n"); - ret = -EINVAL; - goto out; - } - - fdt_pack(fdt); - kbuf.buffer = fdt; - kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt); - kbuf.buf_align = PAGE_SIZE; - kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; - kbuf.top_down = true; - ret = kexec_add_buffer(&kbuf); - if (ret) { - pr_err("Error add DTB kbuf ret=%d\n", ret); - goto out_free_fdt; - } - /* Cache the fdt buffer address for memory cleanup */ - image->arch.fdt = fdt; - kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem); - goto out; - -out_free_fdt: - kvfree(fdt); -out: - kfree(modified_cmdline); - kexec_free_elf_info(&elf_info); - return ret ? ERR_PTR(ret) : NULL; -} - -#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) -#define RISCV_IMM_BITS 12 -#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) -#define RISCV_CONST_HIGH_PART(x) \ - (((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1)) -#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x)) - -#define ENCODE_ITYPE_IMM(x) \ - (RV_X(x, 0, 12) << 20) -#define ENCODE_BTYPE_IMM(x) \ - ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \ - (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31)) -#define ENCODE_UTYPE_IMM(x) \ - (RV_X(x, 12, 20) << 12) -#define ENCODE_JTYPE_IMM(x) \ - ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \ - (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31)) -#define ENCODE_CBTYPE_IMM(x) \ - ((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \ - (RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12)) -#define ENCODE_CJTYPE_IMM(x) \ - ((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \ - (RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \ - (RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12)) -#define ENCODE_UJTYPE_IMM(x) \ - (ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \ - (ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32)) -#define ENCODE_UITYPE_IMM(x) \ - (ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32)) - -#define CLEAN_IMM(type, x) \ - ((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x)) - -int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab) -{ - const char *strtab, *name, *shstrtab; - const Elf_Shdr *sechdrs; - Elf64_Rela *relas; - int i, r_type; - - /* String & section header string table */ - sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; - strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; - shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; - - relas = (void *)pi->ehdr + relsec->sh_offset; - - for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { - const Elf_Sym *sym; /* symbol to relocate */ - unsigned long addr; /* final location after relocation */ - unsigned long val; /* relocated symbol value */ - unsigned long sec_base; /* relocated symbol value */ - void *loc; /* tmp location to modify */ - - sym = (void *)pi->ehdr + symtab->sh_offset; - sym += ELF64_R_SYM(relas[i].r_info); - - if (sym->st_name) - name = strtab + sym->st_name; - else - name = shstrtab + sechdrs[sym->st_shndx].sh_name; - - loc = pi->purgatory_buf; - loc += section->sh_offset; - loc += relas[i].r_offset; - - if (sym->st_shndx == SHN_ABS) - sec_base = 0; - else if (sym->st_shndx >= pi->ehdr->e_shnum) { - pr_err("Invalid section %d for symbol %s\n", - sym->st_shndx, name); - return -ENOEXEC; - } else - sec_base = pi->sechdrs[sym->st_shndx].sh_addr; - - val = sym->st_value; - val += sec_base; - val += relas[i].r_addend; - - addr = section->sh_addr + relas[i].r_offset; - - r_type = ELF64_R_TYPE(relas[i].r_info); - - switch (r_type) { - case R_RISCV_BRANCH: - *(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) | - ENCODE_BTYPE_IMM(val - addr); - break; - case R_RISCV_JAL: - *(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) | - ENCODE_JTYPE_IMM(val - addr); - break; - /* - * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I - * sym is expected to be next to R_RISCV_PCREL_HI20 - * in purgatory relsec. Handle it like R_RISCV_CALL - * sym, instead of searching the whole relsec. - */ - case R_RISCV_PCREL_HI20: - case R_RISCV_CALL_PLT: - case R_RISCV_CALL: - *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) | - ENCODE_UJTYPE_IMM(val - addr); - break; - case R_RISCV_RVC_BRANCH: - *(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) | - ENCODE_CBTYPE_IMM(val - addr); - break; - case R_RISCV_RVC_JUMP: - *(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) | - ENCODE_CJTYPE_IMM(val - addr); - break; - case R_RISCV_ADD16: - *(u16 *)loc += val; - break; - case R_RISCV_SUB16: - *(u16 *)loc -= val; - break; - case R_RISCV_ADD32: - *(u32 *)loc += val; - break; - case R_RISCV_SUB32: - *(u32 *)loc -= val; - break; - /* It has been applied by R_RISCV_PCREL_HI20 sym */ - case R_RISCV_PCREL_LO12_I: - case R_RISCV_ALIGN: - case R_RISCV_RELAX: - break; - case R_RISCV_64: - *(u64 *)loc = val; - break; - default: - pr_err("Unknown rela relocation: %d\n", r_type); - return -ENOEXEC; - } - } - return 0; -} - -const struct kexec_file_ops elf_kexec_ops = { - .probe = kexec_elf_probe, - .load = elf_kexec_load, -}; diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c new file mode 100644 index 000000000000..f4755d49b89e --- /dev/null +++ b/arch/riscv/kernel/kexec_elf.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Load ELF vmlinux file for the kexec_file_load syscall. + * + * Copyright (C) 2021 Huawei Technologies Co, Ltd. + * + * Author: Liao Chang (liaochang1@huawei.com) + * + * Based on kexec-tools' kexec-elf-riscv.c, heavily modified + * for kernel. + */ + +#define pr_fmt(fmt) "kexec_image: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, + struct kexec_elf_info *elf_info, unsigned long old_pbase, + unsigned long new_pbase) +{ + int i; + int ret = 0; + size_t size; + struct kexec_buf kbuf; + const struct elf_phdr *phdr; + + kbuf.image = image; + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + size = phdr->p_filesz; + if (size > phdr->p_memsz) + size = phdr->p_memsz; + + kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset; + kbuf.bufsz = size; + kbuf.buf_align = phdr->p_align; + kbuf.mem = phdr->p_paddr - old_pbase + new_pbase; + kbuf.memsz = phdr->p_memsz; + kbuf.top_down = false; + ret = kexec_add_buffer(&kbuf); + if (ret) + break; + } + + return ret; +} + +/* + * Go through the available phsyical memory regions and find one that hold + * an image of the specified size. + */ +static int elf_find_pbase(struct kimage *image, unsigned long kernel_len, + struct elfhdr *ehdr, struct kexec_elf_info *elf_info, + unsigned long *old_pbase, unsigned long *new_pbase) +{ + int i; + int ret; + struct kexec_buf kbuf; + const struct elf_phdr *phdr; + unsigned long lowest_paddr = ULONG_MAX; + unsigned long lowest_vaddr = ULONG_MAX; + + for (i = 0; i < ehdr->e_phnum; i++) { + phdr = &elf_info->proghdrs[i]; + if (phdr->p_type != PT_LOAD) + continue; + + if (lowest_paddr > phdr->p_paddr) + lowest_paddr = phdr->p_paddr; + + if (lowest_vaddr > phdr->p_vaddr) + lowest_vaddr = phdr->p_vaddr; + } + + kbuf.image = image; + kbuf.buf_min = lowest_paddr; + kbuf.buf_max = ULONG_MAX; + + /* + * Current riscv boot protocol requires 2MB alignment for + * RV64 and 4MB alignment for RV32 + * + */ + kbuf.buf_align = PMD_SIZE; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE); + kbuf.top_down = false; + ret = arch_kexec_locate_mem_hole(&kbuf); + if (!ret) { + *old_pbase = lowest_paddr; + *new_pbase = kbuf.mem; + image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem; + } + return ret; +} + +static void *elf_kexec_load(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + int ret; + unsigned long old_kernel_pbase = ULONG_MAX; + unsigned long new_kernel_pbase = 0UL; + struct elfhdr ehdr; + struct kexec_elf_info elf_info; + + ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info); + if (ret) + return ERR_PTR(ret); + + ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info, + &old_kernel_pbase, &new_kernel_pbase); + if (ret) + goto out; + + /* Add the kernel binary to the image */ + ret = riscv_kexec_elf_load(image, &ehdr, &elf_info, + old_kernel_pbase, new_kernel_pbase); + if (ret) + goto out; + + ret = load_extra_segments(image, image->start, kernel_len, + initrd, initrd_len, cmdline, cmdline_len); +out: + kexec_free_elf_info(&elf_info); + return ret ? ERR_PTR(ret) : NULL; +} + +const struct kexec_file_ops elf_kexec_ops = { + .probe = kexec_elf_probe, + .load = elf_kexec_load, +}; diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index b0bf8c1722c0..99bd5a5f4234 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -7,8 +7,368 @@ * Author: Liao Chang (liaochang1@huawei.com) */ #include +#include +#include +#include +#include +#include +#include +#include +#include const struct kexec_file_ops * const kexec_file_loaders[] = { &elf_kexec_ops, NULL }; + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + kvfree(image->arch.fdt); + image->arch.fdt = NULL; + + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; + + return kexec_image_post_load_cleanup_default(image); +} + +#ifdef CONFIG_CRASH_DUMP +static int get_nr_ram_ranges_callback(struct resource *res, void *arg) +{ + unsigned int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg) +{ + struct crash_mem *cmem = arg; + + cmem->ranges[cmem->nr_ranges].start = res->start; + cmem->ranges[cmem->nr_ranges].end = res->end; + cmem->nr_ranges++; + + return 0; +} + +static int prepare_elf_headers(void **addr, unsigned long *sz) +{ + struct crash_mem *cmem; + unsigned int nr_ranges; + int ret; + + nr_ranges = 1; /* For exclusion of crashkernel region */ + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); + + cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL); + if (!cmem) + return -ENOMEM; + + cmem->max_nr_ranges = nr_ranges; + cmem->nr_ranges = 0; + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); + if (ret) + goto out; + + /* Exclude crashkernel region */ + ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (!ret) + ret = crash_prepare_elf64_headers(cmem, true, addr, sz); + +out: + kfree(cmem); + return ret; +} + +static char *setup_kdump_cmdline(struct kimage *image, char *cmdline, + unsigned long cmdline_len) +{ + int elfcorehdr_strlen; + char *cmdline_ptr; + + cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL); + if (!cmdline_ptr) + return NULL; + + elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ", + image->elf_load_addr); + + if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) { + pr_err("Appending elfcorehdr= exceeds cmdline size\n"); + kfree(cmdline_ptr); + return NULL; + } + + memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len); + /* Ensure it's nul terminated */ + cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0'; + return cmdline_ptr; +} +#endif + +#define RV_X(x, s, n) (((x) >> (s)) & ((1 << (n)) - 1)) +#define RISCV_IMM_BITS 12 +#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS) +#define RISCV_CONST_HIGH_PART(x) \ + (((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1)) +#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x)) + +#define ENCODE_ITYPE_IMM(x) \ + (RV_X(x, 0, 12) << 20) +#define ENCODE_BTYPE_IMM(x) \ + ((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \ + (RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31)) +#define ENCODE_UTYPE_IMM(x) \ + (RV_X(x, 12, 20) << 12) +#define ENCODE_JTYPE_IMM(x) \ + ((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \ + (RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31)) +#define ENCODE_CBTYPE_IMM(x) \ + ((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \ + (RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12)) +#define ENCODE_CJTYPE_IMM(x) \ + ((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \ + (RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \ + (RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12)) +#define ENCODE_UJTYPE_IMM(x) \ + (ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \ + (ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32)) +#define ENCODE_UITYPE_IMM(x) \ + (ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32)) + +#define CLEAN_IMM(type, x) \ + ((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x)) + +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab) +{ + const char *strtab, *name, *shstrtab; + const Elf_Shdr *sechdrs; + Elf64_Rela *relas; + int i, r_type; + + /* String & section header string table */ + sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; + strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset; + shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; + + relas = (void *)pi->ehdr + relsec->sh_offset; + + for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) { + const Elf_Sym *sym; /* symbol to relocate */ + unsigned long addr; /* final location after relocation */ + unsigned long val; /* relocated symbol value */ + unsigned long sec_base; /* relocated symbol value */ + void *loc; /* tmp location to modify */ + + sym = (void *)pi->ehdr + symtab->sh_offset; + sym += ELF64_R_SYM(relas[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + loc = pi->purgatory_buf; + loc += section->sh_offset; + loc += relas[i].r_offset; + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= pi->ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = pi->sechdrs[sym->st_shndx].sh_addr; + + val = sym->st_value; + val += sec_base; + val += relas[i].r_addend; + + addr = section->sh_addr + relas[i].r_offset; + + r_type = ELF64_R_TYPE(relas[i].r_info); + + switch (r_type) { + case R_RISCV_BRANCH: + *(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) | + ENCODE_BTYPE_IMM(val - addr); + break; + case R_RISCV_JAL: + *(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) | + ENCODE_JTYPE_IMM(val - addr); + break; + /* + * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I + * sym is expected to be next to R_RISCV_PCREL_HI20 + * in purgatory relsec. Handle it like R_RISCV_CALL + * sym, instead of searching the whole relsec. + */ + case R_RISCV_PCREL_HI20: + case R_RISCV_CALL_PLT: + case R_RISCV_CALL: + *(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) | + ENCODE_UJTYPE_IMM(val - addr); + break; + case R_RISCV_RVC_BRANCH: + *(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) | + ENCODE_CBTYPE_IMM(val - addr); + break; + case R_RISCV_RVC_JUMP: + *(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) | + ENCODE_CJTYPE_IMM(val - addr); + break; + case R_RISCV_ADD16: + *(u16 *)loc += val; + break; + case R_RISCV_SUB16: + *(u16 *)loc -= val; + break; + case R_RISCV_ADD32: + *(u32 *)loc += val; + break; + case R_RISCV_SUB32: + *(u32 *)loc -= val; + break; + /* It has been applied by R_RISCV_PCREL_HI20 sym */ + case R_RISCV_PCREL_LO12_I: + case R_RISCV_ALIGN: + case R_RISCV_RELAX: + break; + case R_RISCV_64: + *(u64 *)loc = val; + break; + default: + pr_err("Unknown rela relocation: %d\n", r_type); + return -ENOEXEC; + } + } + return 0; +} + + +int load_extra_segments(struct kimage *image, unsigned long kernel_start, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + int ret; + void *fdt; + unsigned long initrd_pbase = 0UL; + struct kexec_buf kbuf; + char *modified_cmdline = NULL; + + kbuf.image = image; + kbuf.buf_min = kernel_start + kernel_len; + kbuf.buf_max = ULONG_MAX; + +#ifdef CONFIG_CRASH_DUMP + /* Add elfcorehdr */ + if (image->type == KEXEC_TYPE_CRASH) { + void *headers; + unsigned long headers_sz; + ret = prepare_elf_headers(&headers, &headers_sz); + if (ret) { + pr_err("Preparing elf core header failed\n"); + goto out; + } + + kbuf.buffer = headers; + kbuf.bufsz = headers_sz; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = headers_sz; + kbuf.buf_align = ELF_CORE_HEADER_ALIGN; + kbuf.top_down = true; + + ret = kexec_add_buffer(&kbuf); + if (ret) { + vfree(headers); + goto out; + } + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; + + kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); + + /* Setup cmdline for kdump kernel case */ + modified_cmdline = setup_kdump_cmdline(image, cmdline, + cmdline_len); + if (!modified_cmdline) { + pr_err("Setting up cmdline for kdump kernel failed\n"); + ret = -EINVAL; + goto out; + } + cmdline = modified_cmdline; + } +#endif + +#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY + /* Add purgatory to the image */ + kbuf.top_down = true; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_load_purgatory(image, &kbuf); + if (ret) { + pr_err("Error loading purgatory ret=%d\n", ret); + goto out; + } + kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem); + + ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry", + &kernel_start, + sizeof(kernel_start), 0); + if (ret) + pr_err("Error update purgatory ret=%d\n", ret); +#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */ + + /* Add the initrd to the image */ + if (initrd != NULL) { + kbuf.buffer = initrd; + kbuf.bufsz = kbuf.memsz = initrd_len; + kbuf.buf_align = PAGE_SIZE; + kbuf.top_down = true; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + ret = kexec_add_buffer(&kbuf); + if (ret) + goto out; + initrd_pbase = kbuf.mem; + kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase); + } + + /* Add the DTB to the image */ + fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase, + initrd_len, cmdline, 0); + if (!fdt) { + pr_err("Error setting up the new device tree.\n"); + ret = -EINVAL; + goto out; + } + + fdt_pack(fdt); + kbuf.buffer = fdt; + kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt); + kbuf.buf_align = PAGE_SIZE; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.top_down = true; + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error add DTB kbuf ret=%d\n", ret); + goto out_free_fdt; + } + /* Cache the fdt buffer address for memory cleanup */ + image->arch.fdt = fdt; + kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem); + goto out; + +out_free_fdt: + kvfree(fdt); +out: + kfree(modified_cmdline); + return ret; +} -- cgit v1.2.3 From 809a11eea8e8c80491e3ba3a286af25409c072d5 Mon Sep 17 00:00:00 2001 From: Song Shuai Date: Wed, 9 Apr 2025 21:29:59 +0200 Subject: riscv: kexec_file: Support loading Image binary file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch creates image_kexec_ops to load Image binary file for kexec_file_load() syscall. Signed-off-by: Song Shuai Signed-off-by: Björn Töpel Link: https://lore.kernel.org/r/20250409193004.643839-3-bjorn@kernel.org Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/image.h | 2 + arch/riscv/include/asm/kexec.h | 1 + arch/riscv/kernel/Makefile | 2 +- arch/riscv/kernel/kexec_image.c | 96 ++++++++++++++++++++++++++++++++++ arch/riscv/kernel/machine_kexec_file.c | 1 + 5 files changed, 101 insertions(+), 1 deletion(-) create mode 100644 arch/riscv/kernel/kexec_image.c (limited to 'arch') diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h index e0b319af3681..8927a6ea1127 100644 --- a/arch/riscv/include/asm/image.h +++ b/arch/riscv/include/asm/image.h @@ -30,6 +30,8 @@ RISCV_HEADER_VERSION_MINOR) #ifndef __ASSEMBLY__ +#define riscv_image_flag_field(flags, field)\ + (((flags) >> field##_SHIFT) & field##_MASK) /** * struct riscv_image_header - riscv kernel image header * @code0: Executable code diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h index 518825fe4160..b9ee8346cc8c 100644 --- a/arch/riscv/include/asm/kexec.h +++ b/arch/riscv/include/asm/kexec.h @@ -56,6 +56,7 @@ extern riscv_kexec_method riscv_kexec_norelocate; #ifdef CONFIG_KEXEC_FILE extern const struct kexec_file_ops elf_kexec_ops; +extern const struct kexec_file_ops image_kexec_ops; struct purgatory_info; int arch_kexec_apply_relocations_add(struct purgatory_info *pi, diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index d56305c8e631..0ead29826419 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -107,7 +107,7 @@ obj-$(CONFIG_HOTPLUG_CPU) += cpu-hotplug.o obj-$(CONFIG_PARAVIRT) += paravirt.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_KEXEC_CORE) += kexec_relocate.o crash_save_regs.o machine_kexec.o -obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o machine_kexec_file.o +obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o kexec_image.o machine_kexec_file.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_VMCORE_INFO) += vmcore_info.o diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c new file mode 100644 index 000000000000..26a81774a78a --- /dev/null +++ b/arch/riscv/kernel/kexec_image.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RISC-V Kexec image loader + * + */ + +#define pr_fmt(fmt) "kexec_file(Image): " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +static int image_probe(const char *kernel_buf, unsigned long kernel_len) +{ + const struct riscv_image_header *h = (const struct riscv_image_header *)kernel_buf; + + if (!h || kernel_len < sizeof(*h)) + return -EINVAL; + + /* According to Documentation/riscv/boot-image-header.rst, + * use "magic2" field to check when version >= 0.2. + */ + + if (h->version >= RISCV_HEADER_VERSION && + memcmp(&h->magic2, RISCV_IMAGE_MAGIC2, sizeof(h->magic2))) + return -EINVAL; + + return 0; +} + +static void *image_load(struct kimage *image, + char *kernel, unsigned long kernel_len, + char *initrd, unsigned long initrd_len, + char *cmdline, unsigned long cmdline_len) +{ + struct riscv_image_header *h; + u64 flags; + bool be_image, be_kernel; + struct kexec_buf kbuf; + int ret; + + /* Check Image header */ + h = (struct riscv_image_header *)kernel; + if (!h->image_size) { + ret = -EINVAL; + goto out; + } + + /* Check endianness */ + flags = le64_to_cpu(h->flags); + be_image = riscv_image_flag_field(flags, RISCV_IMAGE_FLAG_BE); + be_kernel = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + if (be_image != be_kernel) { + ret = -EINVAL; + goto out; + } + + /* Load the kernel image */ + kbuf.image = image; + kbuf.buf_min = 0; + kbuf.buf_max = ULONG_MAX; + kbuf.top_down = false; + + kbuf.buffer = kernel; + kbuf.bufsz = kernel_len; + kbuf.mem = KEXEC_BUF_MEM_UNKNOWN; + kbuf.memsz = le64_to_cpu(h->image_size); + kbuf.buf_align = le64_to_cpu(h->text_offset); + + ret = kexec_add_buffer(&kbuf); + if (ret) { + pr_err("Error add kernel image ret=%d\n", ret); + goto out; + } + + image->start = kbuf.mem; + + pr_info("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kbuf.mem, kbuf.bufsz, kbuf.memsz); + + ret = load_extra_segments(image, kbuf.mem, kbuf.memsz, + initrd, initrd_len, cmdline, cmdline_len); + +out: + return ret ? ERR_PTR(ret) : NULL; +} + +const struct kexec_file_ops image_kexec_ops = { + .probe = image_probe, + .load = image_load, +}; diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c index 99bd5a5f4234..e36104af2e24 100644 --- a/arch/riscv/kernel/machine_kexec_file.c +++ b/arch/riscv/kernel/machine_kexec_file.c @@ -18,6 +18,7 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { &elf_kexec_ops, + &image_kexec_ops, NULL }; -- cgit v1.2.3 From 850d7b14c8f77407b7d2a1edeb83d3e36c46e7a8 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Wed, 26 Mar 2025 07:34:51 +0000 Subject: riscv/kexec_file: Fix comment in purgatory relocator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apparently sec_base doesn't mean relocated symbol value, which seems a copy-pasting error in the comment. Assigned with the address of section indexed by sym->st_shndx, it should represent base address of the relevant section. Let's fix the comment to avoid possible confusion. Fixes: 838b3e28488f ("RISC-V: Load purgatory in kexec_file") Signed-off-by: Yao Zi Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20250326073450.57648-2-ziyao@disroot.org Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/elf_kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c index e783a72d051f..0dc5450f2c7f 100644 --- a/arch/riscv/kernel/elf_kexec.c +++ b/arch/riscv/kernel/elf_kexec.c @@ -390,7 +390,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Sym *sym; /* symbol to relocate */ unsigned long addr; /* final location after relocation */ unsigned long val; /* relocated symbol value */ - unsigned long sec_base; /* relocated symbol value */ + unsigned long sec_base; /* relocated section base address */ void *loc; /* tmp location to modify */ sym = (void *)pi->ehdr + symtab->sh_offset; -- cgit v1.2.3 From f0f4e64b9e3527c11dc6dcb005e577d661eb9ab5 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 21 Apr 2025 16:24:38 +0200 Subject: riscv: Introduce Zicbop instructions The S-type instructions are first introduced and then used to define the encoding of the Zicbop prefetching instructions. Co-developed-by: Guo Ren Signed-off-by: Guo Ren Tested-by: Andrea Parri Link: https://lore.kernel.org/r/20250421142441.395849-2-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index 71060a2f838e..02c92c1657d2 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -18,6 +18,13 @@ #define INSN_I_RD_SHIFT 7 #define INSN_I_OPCODE_SHIFT 0 +#define INSN_S_SIMM7_SHIFT 25 +#define INSN_S_RS2_SHIFT 20 +#define INSN_S_RS1_SHIFT 15 +#define INSN_S_FUNC3_SHIFT 12 +#define INSN_S_SIMM5_SHIFT 7 +#define INSN_S_OPCODE_SHIFT 0 + #ifdef __ASSEMBLY__ #ifdef CONFIG_AS_HAS_INSN @@ -30,6 +37,10 @@ .insn i \opcode, \func3, \rd, \rs1, \simm12 .endm + .macro insn_s, opcode, func3, rs2, simm12, rs1 + .insn s \opcode, \func3, \rs2, \simm12(\rs1) + .endm + #else #include @@ -51,10 +62,20 @@ (\simm12 << INSN_I_SIMM12_SHIFT)) .endm + .macro insn_s, opcode, func3, rs2, simm12, rs1 + .4byte ((\opcode << INSN_S_OPCODE_SHIFT) | \ + (\func3 << INSN_S_FUNC3_SHIFT) | \ + (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) | \ + (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) | \ + ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) | \ + (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT)) + .endm + #endif #define __INSN_R(...) insn_r __VA_ARGS__ #define __INSN_I(...) insn_i __VA_ARGS__ +#define __INSN_S(...) insn_s __VA_ARGS__ #else /* ! __ASSEMBLY__ */ @@ -66,6 +87,9 @@ #define __INSN_I(opcode, func3, rd, rs1, simm12) \ ".insn i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \ + ".insn s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n" + #else #include @@ -92,12 +116,26 @@ " (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n" \ " .endm\n" +#define DEFINE_INSN_S \ + __DEFINE_ASM_GPR_NUMS \ +" .macro insn_s, opcode, func3, rs2, simm12, rs1\n" \ +" .4byte ((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |" \ +" (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |" \ +" (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \ +" (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \ +" ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \ +" (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \ +" .endm\n" + #define UNDEFINE_INSN_R \ " .purgem insn_r\n" #define UNDEFINE_INSN_I \ " .purgem insn_i\n" +#define UNDEFINE_INSN_S \ +" .purgem insn_s\n" + #define __INSN_R(opcode, func3, func7, rd, rs1, rs2) \ DEFINE_INSN_R \ "insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \ @@ -108,6 +146,11 @@ "insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \ UNDEFINE_INSN_I +#define __INSN_S(opcode, func3, rs2, simm12, rs1) \ + DEFINE_INSN_S \ + "insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n" \ + UNDEFINE_INSN_S + #endif #endif /* ! __ASSEMBLY__ */ @@ -120,6 +163,10 @@ __INSN_I(RV_##opcode, RV_##func3, RV_##rd, \ RV_##rs1, RV_##simm12) +#define INSN_S(opcode, func3, rs2, simm12, rs1) \ + __INSN_S(RV_##opcode, RV_##func3, RV_##rs2, \ + RV_##simm12, RV_##rs1) + #define RV_OPCODE(v) __ASM_STR(v) #define RV_FUNC3(v) __ASM_STR(v) #define RV_FUNC7(v) __ASM_STR(v) @@ -133,6 +180,7 @@ #define RV___RS2(v) __RV_REG(v) #define RV_OPCODE_MISC_MEM RV_OPCODE(15) +#define RV_OPCODE_OP_IMM RV_OPCODE(19) #define RV_OPCODE_SYSTEM RV_OPCODE(115) #define HFENCE_VVMA(vaddr, asid) \ @@ -196,6 +244,18 @@ INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \ RS1(base), SIMM12(4)) +#define PREFETCH_I(base, offset) \ + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0), \ + SIMM12((offset) & 0xfe0), RS1(base)) + +#define PREFETCH_R(base, offset) \ + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1), \ + SIMM12((offset) & 0xfe0), RS1(base)) + +#define PREFETCH_W(base, offset) \ + INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3), \ + SIMM12((offset) & 0xfe0), RS1(base)) + #define RISCV_PAUSE ".4byte 0x100000f" #define ZAWRS_WRS_NTO ".4byte 0x00d00073" #define ZAWRS_WRS_STO ".4byte 0x01d00073" -- cgit v1.2.3 From 8d496b5a989120c1bce1ad8eb48ebae0350722d7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 21 Apr 2025 16:24:39 +0200 Subject: riscv: Add support for Zicbop Zicbop introduces cache blocks prefetching instructions, add the necessary support for the kernel to use it in the coming commits. Co-developed-by: Guo Ren Signed-off-by: Guo Ren Tested-by: Andrea Parri Link: https://lore.kernel.org/r/20250421142441.395849-3-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 15 +++++++++++++++ arch/riscv/include/asm/barrier.h | 5 ----- arch/riscv/include/asm/cacheflush.h | 1 + arch/riscv/include/asm/hwcap.h | 1 + arch/riscv/include/asm/insn-def.h | 6 ++++++ arch/riscv/include/asm/processor.h | 1 - arch/riscv/kernel/cpufeature.c | 21 +++++++++++++++++++++ arch/riscv/mm/cacheflush.c | 14 +++++++++++--- 8 files changed, 55 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bbec87b79309..28765ce563de 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -842,6 +842,21 @@ config RISCV_ISA_ZICBOZ If you don't know what to do here, say Y. +config RISCV_ISA_ZICBOP + bool "Zicbop extension support for cache block prefetch" + depends on MMU + depends on RISCV_ALTERNATIVE + default y + help + Adds support to dynamically detect the presence of the ZICBOP + extension (Cache Block Prefetch Operations) and enable its + usage. + + The Zicbop extension can be used to prefetch cache blocks for + read/write fetch. + + If you don't know what to do here, say Y. + config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI def_bool y # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=aed44286efa8ae8717a77d94b51ac3614e2ca6dc diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h index e1d9bf1deca6..b8c5726d86ac 100644 --- a/arch/riscv/include/asm/barrier.h +++ b/arch/riscv/include/asm/barrier.h @@ -14,11 +14,6 @@ #include #include -#define nop() __asm__ __volatile__ ("nop") -#define __nops(n) ".rept " #n "\nnop\n.endr\n" -#define nops(n) __asm__ __volatile__ (__nops(n)) - - /* These barriers need to enforce ordering on both devices or memory. */ #define __mb() RISCV_FENCE(iorw, iorw) #define __rmb() RISCV_FENCE(ir, ir) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8de73f91bfa3..effa02c2e682 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -80,6 +80,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local); extern unsigned int riscv_cbom_block_size; extern unsigned int riscv_cboz_block_size; +extern unsigned int riscv_cbop_block_size; void riscv_init_cbo_blocksizes(void); #ifdef CONFIG_RISCV_DMA_NONCOHERENT diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index e3cbf203cdde..affd63e11b0a 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -105,6 +105,7 @@ #define RISCV_ISA_EXT_ZVFBFWMA 96 #define RISCV_ISA_EXT_ZAAMO 97 #define RISCV_ISA_EXT_ZALRSC 98 +#define RISCV_ISA_EXT_ZICBOP 99 #define RISCV_ISA_EXT_XLINUXENVCFG 127 diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h index 02c92c1657d2..d5adbaec1d01 100644 --- a/arch/riscv/include/asm/insn-def.h +++ b/arch/riscv/include/asm/insn-def.h @@ -263,4 +263,10 @@ #define RISCV_INSN_NOP4 _AC(0x00000013, U) +#ifndef __ASSEMBLY__ +#define nop() __asm__ __volatile__ ("nop") +#define __nops(n) ".rept " #n "\nnop\n.endr\n" +#define nops(n) __asm__ __volatile__ (__nops(n)) +#endif + #endif /* __ASM_INSN_DEF_H */ diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 5f56eb9d114a..09d4c963399a 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -52,7 +52,6 @@ #endif #ifndef __ASSEMBLY__ -#include struct task_struct; struct pt_regs; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 2054f6c4b0ae..743d53415572 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -32,6 +32,7 @@ #define NUM_ALPHA_EXTS ('z' - 'a' + 1) static bool any_cpu_has_zicboz; +static bool any_cpu_has_zicbop; static bool any_cpu_has_zicbom; unsigned long elf_hwcap __read_mostly; @@ -119,6 +120,21 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data, return 0; } +static int riscv_ext_zicbop_validate(const struct riscv_isa_ext_data *data, + const unsigned long *isa_bitmap) +{ + if (!riscv_cbop_block_size) { + pr_err("Zicbop detected in ISA string, disabling as no cbop-block-size found\n"); + return -EINVAL; + } + if (!is_power_of_2(riscv_cbop_block_size)) { + pr_err("Zicbop disabled as cbop-block-size present, but is not a power-of-2\n"); + return -EINVAL; + } + any_cpu_has_zicbop = true; + return 0; +} + static int riscv_ext_f_validate(const struct riscv_isa_ext_data *data, const unsigned long *isa_bitmap) { @@ -442,6 +458,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = { __RISCV_ISA_EXT_SUPERSET_VALIDATE(v, RISCV_ISA_EXT_v, riscv_v_exts, riscv_ext_vector_float_validate), __RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h), __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicbom, RISCV_ISA_EXT_ZICBOM, riscv_xlinuxenvcfg_exts, riscv_ext_zicbom_validate), + __RISCV_ISA_EXT_DATA_VALIDATE(zicbop, RISCV_ISA_EXT_ZICBOP, riscv_ext_zicbop_validate), __RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts, riscv_ext_zicboz_validate), __RISCV_ISA_EXT_DATA(ziccrse, RISCV_ISA_EXT_ZICCRSE), __RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR), @@ -1112,6 +1129,10 @@ void __init riscv_user_isa_enable(void) current->thread.envcfg |= ENVCFG_CBCFE; else if (any_cpu_has_zicbom) pr_warn("Zicbom disabled as it is unavailable on some harts\n"); + + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOP) && + any_cpu_has_zicbop) + pr_warn("Zicbop disabled as it is unavailable on some harts\n"); } #ifdef CONFIG_RISCV_ALTERNATIVE diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index b81672729887..6265052ef8b6 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -101,6 +101,9 @@ EXPORT_SYMBOL_GPL(riscv_cbom_block_size); unsigned int riscv_cboz_block_size; EXPORT_SYMBOL_GPL(riscv_cboz_block_size); +unsigned int riscv_cbop_block_size; +EXPORT_SYMBOL_GPL(riscv_cbop_block_size); + static void __init cbo_get_block_size(struct device_node *node, const char *name, u32 *block_size, unsigned long *first_hartid) @@ -125,8 +128,8 @@ static void __init cbo_get_block_size(struct device_node *node, void __init riscv_init_cbo_blocksizes(void) { - unsigned long cbom_hartid, cboz_hartid; - u32 cbom_block_size = 0, cboz_block_size = 0; + unsigned long cbom_hartid, cboz_hartid, cbop_hartid; + u32 cbom_block_size = 0, cboz_block_size = 0, cbop_block_size = 0; struct device_node *node; struct acpi_table_header *rhct; acpi_status status; @@ -138,13 +141,15 @@ void __init riscv_init_cbo_blocksizes(void) &cbom_block_size, &cbom_hartid); cbo_get_block_size(node, "riscv,cboz-block-size", &cboz_block_size, &cboz_hartid); + cbo_get_block_size(node, "riscv,cbop-block-size", + &cbop_block_size, &cbop_hartid); } } else { status = acpi_get_table(ACPI_SIG_RHCT, 0, &rhct); if (ACPI_FAILURE(status)) return; - acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, NULL); + acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, &cbop_block_size); acpi_put_table((struct acpi_table_header *)rhct); } @@ -153,6 +158,9 @@ void __init riscv_init_cbo_blocksizes(void) if (cboz_block_size) riscv_cboz_block_size = cboz_block_size; + + if (cbop_block_size) + riscv_cbop_block_size = cbop_block_size; } #ifdef CONFIG_SMP -- cgit v1.2.3 From a5f947c73115efb6fb0d9579e71ce1ee5cf706aa Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 21 Apr 2025 16:24:40 +0200 Subject: riscv: Add ARCH_HAS_PREFETCH[W] support with Zicbop Enable Linux prefetch and prefetchw primitives using Zicbop. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20231231082955.16516-3-guoren@kernel.org Tested-by: Andrea Parri Link: https://lore.kernel.org/r/20250421142441.395849-4-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 09d4c963399a..39dfab495a4c 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -13,6 +13,9 @@ #include #include +#include +#include +#include #define arch_get_mmap_end(addr, len, flags) \ ({ \ @@ -135,6 +138,27 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset, #define KSTK_EIP(tsk) (task_pt_regs(tsk)->epc) #define KSTK_ESP(tsk) (task_pt_regs(tsk)->sp) +#define PREFETCH_ASM(x) \ + ALTERNATIVE(__nops(1), PREFETCH_R(x, 0), 0, \ + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP) + +#define PREFETCHW_ASM(x) \ + ALTERNATIVE(__nops(1), PREFETCH_W(x, 0), 0, \ + RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP) + +#ifdef CONFIG_RISCV_ISA_ZICBOP +#define ARCH_HAS_PREFETCH +static inline void prefetch(const void *x) +{ + __asm__ __volatile__(PREFETCH_ASM(%0) : : "r" (x) : "memory"); +} + +#define ARCH_HAS_PREFETCHW +static inline void prefetchw(const void *x) +{ + __asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory"); +} +#endif /* CONFIG_RISCV_ISA_ZICBOP */ /* Do necessary setup to start up a newly executed thread. */ extern void start_thread(struct pt_regs *regs, -- cgit v1.2.3 From eb87e56d651d0a72009842bc0d8eeae7b605c97d Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Mon, 21 Apr 2025 16:24:41 +0200 Subject: riscv: xchg: Prefetch the destination word for sc.w The cost of changing a cacheline from shared to exclusive state can be significant, especially when this is triggered by an exclusive store, since it may result in having to retry the transaction. This patch makes use of prefetch.w to prefetch cachelines for write prior to lr/sc loops when using the xchg_small atomic routine. This patch is inspired by commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word for write prior to stxr"). Signed-off-by: Guo Ren Signed-off-by: Guo Ren Link: https://lore.kernel.org/r/20231231082955.16516-4-guoren@kernel.org Tested-by: Andrea Parri Link: https://lore.kernel.org/r/20250421142441.395849-5-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cmpxchg.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h index 2ec119eb147b..0b749e710216 100644 --- a/arch/riscv/include/asm/cmpxchg.h +++ b/arch/riscv/include/asm/cmpxchg.h @@ -13,6 +13,7 @@ #include #include #include +#include #define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append, \ swap_append, r, p, n) \ @@ -37,6 +38,7 @@ \ __asm__ __volatile__ ( \ prepend \ + PREFETCHW_ASM(%5) \ "0: lr.w %0, %2\n" \ " and %1, %0, %z4\n" \ " or %1, %1, %z3\n" \ @@ -44,7 +46,7 @@ " bnez %1, 0b\n" \ sc_append \ : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b)) \ - : "rJ" (__newx), "rJ" (~__mask) \ + : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b) \ : "memory"); \ \ r = (__typeof__(*(p)))((__retx & __mask) >> __s); \ -- cgit v1.2.3 From c3cc2a4a3a23faf6b88459471c1c16ab3837cc2f Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 21 Mar 2025 13:39:54 +0100 Subject: riscv: Add support for PUD THP Add the necessary page table functions to deal with PUD THP, this enables the use of PUD pfnmap. Link: https://lore.kernel.org/r/20250321123954.225097-1-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable-64.h | 5 +- arch/riscv/include/asm/pgtable.h | 97 +++++++++++++++++++++++++++++++++++++ arch/riscv/include/asm/tlbflush.h | 2 + arch/riscv/mm/pgtable.c | 10 ++++ arch/riscv/mm/tlbflush.c | 7 +++ 6 files changed, 120 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bbec87b79309..63ef4aa03506 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -143,6 +143,7 @@ config RISCV select HAVE_ARCH_THREAD_STRUCT_WHITELIST select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if 64BIT && MMU select HAVE_ARCH_USERFAULTFD_MINOR if 64BIT && USERFAULTFD select HAVE_ARCH_VMAP_STACK if MMU && 64BIT select HAVE_ASM_MODVERSIONS diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h index 0897dd99ab8d..a2c00235c447 100644 --- a/arch/riscv/include/asm/pgtable-64.h +++ b/arch/riscv/include/asm/pgtable-64.h @@ -184,7 +184,7 @@ static inline int pud_none(pud_t pud) static inline int pud_bad(pud_t pud) { - return !pud_present(pud); + return !pud_present(pud) || (pud_val(pud) & _PAGE_LEAF); } #define pud_leaf pud_leaf @@ -401,6 +401,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address); #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pte_devmap(pte_t pte); static inline pte_t pmd_pte(pmd_t pmd); +static inline pte_t pud_pte(pud_t pud); static inline int pmd_devmap(pmd_t pmd) { @@ -409,7 +410,7 @@ static inline int pmd_devmap(pmd_t pmd) static inline int pud_devmap(pud_t pud) { - return 0; + return pte_devmap(pud_pte(pud)); } static inline int pgd_devmap(pgd_t pgd) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 428e48e5f57d..b84e2ff83cb7 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -902,6 +902,103 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, #define pmdp_collapse_flush pmdp_collapse_flush extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); + +static inline pud_t pud_wrprotect(pud_t pud) +{ + return pte_pud(pte_wrprotect(pud_pte(pud))); +} + +static inline int pud_trans_huge(pud_t pud) +{ + return pud_leaf(pud); +} + +static inline int pud_dirty(pud_t pud) +{ + return pte_dirty(pud_pte(pud)); +} + +static inline pud_t pud_mkyoung(pud_t pud) +{ + return pte_pud(pte_mkyoung(pud_pte(pud))); +} + +static inline pud_t pud_mkold(pud_t pud) +{ + return pte_pud(pte_mkold(pud_pte(pud))); +} + +static inline pud_t pud_mkdirty(pud_t pud) +{ + return pte_pud(pte_mkdirty(pud_pte(pud))); +} + +static inline pud_t pud_mkclean(pud_t pud) +{ + return pte_pud(pte_mkclean(pud_pte(pud))); +} + +static inline pud_t pud_mkwrite(pud_t pud) +{ + return pte_pud(pte_mkwrite_novma(pud_pte(pud))); +} + +static inline pud_t pud_mkhuge(pud_t pud) +{ + return pud; +} + +static inline pud_t pud_mkdevmap(pud_t pud) +{ + return pte_pud(pte_mkdevmap(pud_pte(pud))); +} + +static inline int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty) +{ + return ptep_set_access_flags(vma, address, (pte_t *)pudp, pud_pte(entry), dirty); +} + +static inline int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return ptep_test_and_clear_young(vma, address, (pte_t *)pudp); +} + +static inline int pud_young(pud_t pud) +{ + return pte_young(pud_pte(pud)); +} + +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + pte_t *ptep = (pte_t *)pudp; + + update_mmu_cache(vma, address, ptep); +} + +static inline pud_t pudp_establish(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, pud_t pud) +{ + page_table_check_pud_set(vma->vm_mm, pudp, pud); + return __pud(atomic_long_xchg((atomic_long_t *)pudp, pud_val(pud))); +} + +static inline pud_t pud_mkinvalid(pud_t pud) +{ + return __pud(pud_val(pud) & ~(_PAGE_PRESENT | _PAGE_PROT_NONE)); +} + +extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp); + +static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) +{ + return pte_pud(pte_modify(pud_pte(pud), newprot)); +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h index ce0dd0fed764..1a20dd746a49 100644 --- a/arch/riscv/include/asm/tlbflush.h +++ b/arch/riscv/include/asm/tlbflush.h @@ -56,6 +56,8 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end); #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end); #endif bool arch_tlbbatch_should_defer(struct mm_struct *mm); diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c index 4ae67324f992..8b6c0a112a8d 100644 --- a/arch/riscv/mm/pgtable.c +++ b/arch/riscv/mm/pgtable.c @@ -154,4 +154,14 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, flush_tlb_mm(vma->vm_mm); return pmd; } + +pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp) +{ + VM_WARN_ON_ONCE(!pud_present(*pudp)); + pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); + + flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); + return old; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index f9e27ba1df99..97c8fde3cbfe 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -182,6 +182,13 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm), start, end - start, PMD_SIZE); } + +void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start, + unsigned long end) +{ + __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm), + start, end - start, PUD_SIZE); +} #endif bool arch_tlbbatch_should_defer(struct mm_struct *mm) -- cgit v1.2.3 From be17c0df67959fe4f88dac75dc26ed9252d4b133 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Wed, 9 Apr 2025 10:14:51 -0700 Subject: riscv: module: Optimize PLT/GOT entry counting perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent inside module_frob_arch_sections(). This is because amdgpu.ko contains about 300000 relocations in its .rela.text section, and the algorithm in count_max_entries() takes quadratic time. Apply two optimizations from the arm64 code, which together reduce the total execution time by 99.58%. First, sort the relocations so duplicate entries are adjacent. Second, reduce the number of relocations that must be sorted by filtering to only relocations that need PLT/GOT entries, as done in commit d4e0340919fb ("arm64/module: Optimize module load time by optimizing PLT counting"). Unlike the arm64 code, here the filtering and sorting is done in a scratch buffer, because the HI20 relocation search optimization in apply_relocate_add() depends on the original order of the relocations. This allows accumulating PLT/GOT relocations across sections so sorting and counting is only done once per module. Signed-off-by: Samuel Holland Reviewed-by: Andrew Jones Link: https://lore.kernel.org/r/20250409171526.862481-3-samuel.holland@sifive.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++++++++-------- 1 file changed, 65 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c index 91d0b355ceef..75551ac6504c 100644 --- a/arch/riscv/kernel/module-sections.c +++ b/arch/riscv/kernel/module-sections.c @@ -9,6 +9,7 @@ #include #include #include +#include unsigned long module_emit_got_entry(struct module *mod, unsigned long val) { @@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val) return (unsigned long)&plt[i]; } -static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y) +#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) { - return x->r_info == y->r_info && x->r_addend == y->r_addend; + const Elf_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(x->r_info, y->r_info); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; } static bool duplicate_rela(const Elf_Rela *rela, int idx) { - int i; - for (i = 0; i < idx; i++) { - if (is_rela_equal(&rela[i], &rela[idx])) - return true; - } - return false; + /* + * Entries are sorted by type, symbol index and addend. That means + * that, if a duplicate entry exists, it must be in the preceding slot. + */ + return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0; } -static void count_max_entries(Elf_Rela *relas, int num, +static void count_max_entries(const Elf_Rela *relas, size_t num, unsigned int *plts, unsigned int *gots) { - for (int i = 0; i < num; i++) { + for (size_t i = 0; i < num; i++) { + if (duplicate_rela(relas, i)) + continue; + switch (ELF_R_TYPE(relas[i].r_info)) { case R_RISCV_CALL_PLT: case R_RISCV_PLT32: - if (!duplicate_rela(relas, i)) - (*plts)++; + (*plts)++; break; case R_RISCV_GOT_HI20: - if (!duplicate_rela(relas, i)) - (*gots)++; + (*gots)++; break; + default: + unreachable(); } } } +static bool rela_needs_plt_got_entry(const Elf_Rela *rela) +{ + switch (ELF_R_TYPE(rela->r_info)) { + case R_RISCV_CALL_PLT: + case R_RISCV_GOT_HI20: + case R_RISCV_PLT32: + return true; + default: + return false; + } +} + int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { + size_t num_scratch_relas = 0; unsigned int num_plts = 0; unsigned int num_gots = 0; + Elf_Rela *scratch = NULL; + size_t scratch_size = 0; int i; /* @@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, /* Calculate the maxinum number of entries */ for (i = 0; i < ehdr->e_shnum; i++) { + size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela); Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset; - int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela); Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info; + size_t scratch_size_needed; if (sechdrs[i].sh_type != SHT_RELA) continue; @@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, if (!(dst_sec->sh_flags & SHF_EXECINSTR)) continue; - count_max_entries(relas, num_rela, &num_plts, &num_gots); + /* + * apply_relocate_add() relies on HI20 and LO12 relocation pairs being + * close together, so sort a copy of the section to avoid interfering. + */ + scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch); + if (scratch_size_needed > scratch_size) { + scratch_size = scratch_size_needed; + scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL); + if (!scratch) + return -ENOMEM; + } + + for (size_t j = 0; j < num_relas; j++) + if (rela_needs_plt_got_entry(&relas[j])) + scratch[num_scratch_relas++] = relas[j]; + } + + if (scratch) { + /* sort the accumulated PLT/GOT relocations so duplicates are adjacent */ + sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL); + count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots); + kvfree(scratch); } mod->arch.plt.shdr->sh_type = SHT_NOBITS; -- cgit v1.2.3 From 48d9aabf2dc5d93b402ce55e3187e95698f2b9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E8=87=B4=E9=82=A6=20=28XIE=20Zhibang=29?= Date: Fri, 28 Mar 2025 10:14:22 +0000 Subject: RISC-V: Kconfig: Fix help text of CMDLINE_EXTEND MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is the built-in command line appended to the bootloader command line, not the bootloader command line appended to the built-in command line. Fixes: 3aed8c43267e ("RISC-V: Update Kconfig to better handle CMDLINE") Signed-off-by: 谢致邦 (XIE Zhibang) Link: https://lore.kernel.org/r/tencent_A93C7FB46BFD20054AD2FEF4645913FF550A@qq.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 674cf6ff7188..78640cd353fd 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -1176,8 +1176,8 @@ config CMDLINE_FALLBACK config CMDLINE_EXTEND bool "Extend bootloader kernel arguments" help - The command-line arguments provided during boot will be - appended to the built-in command line. This is useful in + The built-in command line will be appended to the command- + line arguments provided during boot. This is useful in cases where the provided arguments are insufficient and you don't want to or cannot modify them. -- cgit v1.2.3 From d7e0cce103663a60b14c52a0bcad62c7e8c0e6e8 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 22 Apr 2025 19:31:56 +0800 Subject: riscv: Make regs_irqs_disabled() more clear The return value of regs_irqs_disabled() is true or false, so change its type to reflect that and also make it always inline. Suggested-by: Huacai Chen Signed-off-by: Tiezhu Yang Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250422113156.25742-1-yangtiezhu@loongson.cn Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h index 2910231977cb..a7dc0e330757 100644 --- a/arch/riscv/include/asm/ptrace.h +++ b/arch/riscv/include/asm/ptrace.h @@ -175,7 +175,7 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, return 0; } -static inline int regs_irqs_disabled(struct pt_regs *regs) +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { return !(regs->status & SR_PIE); } -- cgit v1.2.3 From 415a8c81da3dab0a585bd4f8d505a11ad5a171a7 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 21 Apr 2025 16:14:13 +0200 Subject: riscv: hwprobe: export Zabha extension MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export Zabha through the hwprobe syscall. Reviewed-by: Clément Léger Link: https://lore.kernel.org/r/20250421141413.394444-1-alexghiti@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/include/uapi/asm/hwprobe.h | 1 + arch/riscv/kernel/sys_hwprobe.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h index 3c2fce939673..fca15f2bf6f3 100644 --- a/arch/riscv/include/uapi/asm/hwprobe.h +++ b/arch/riscv/include/uapi/asm/hwprobe.h @@ -81,6 +81,7 @@ struct riscv_hwprobe { #define RISCV_HWPROBE_EXT_ZICBOM (1ULL << 55) #define RISCV_HWPROBE_EXT_ZAAMO (1ULL << 56) #define RISCV_HWPROBE_EXT_ZALRSC (1ULL << 57) +#define RISCV_HWPROBE_EXT_ZABHA (1ULL << 58) #define RISCV_HWPROBE_KEY_CPUPERF_0 5 #define RISCV_HWPROBE_MISALIGNED_UNKNOWN (0 << 0) #define RISCV_HWPROBE_MISALIGNED_EMULATED (1 << 0) diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c index 249aec8594a9..ed3123396a96 100644 --- a/arch/riscv/kernel/sys_hwprobe.c +++ b/arch/riscv/kernel/sys_hwprobe.c @@ -96,6 +96,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair, * presence in the hart_isa bitmap, are made. */ EXT_KEY(ZAAMO); + EXT_KEY(ZABHA); EXT_KEY(ZACAS); EXT_KEY(ZALRSC); EXT_KEY(ZAWRS); -- cgit v1.2.3 From a4348546332c9fae12b29acb514535e0a52b9b3c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 2 Jun 2025 21:39:14 +0200 Subject: riscv: make unsafe user copy routines use existing assembly routines The current implementation is underperforming and in addition, it triggers misaligned access traps on platforms which do not handle misaligned accesses in hardware. Use the existing assembly routines to solve both problems at once. Signed-off-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250602193918.868962-2-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/asm-prototypes.h | 2 +- arch/riscv/include/asm/uaccess.h | 33 ++++++---------------- arch/riscv/lib/riscv_v_helpers.c | 11 ++++++-- arch/riscv/lib/uaccess.S | 50 ++++++++++++++++++++++----------- arch/riscv/lib/uaccess_vector.S | 15 ++++++++-- 5 files changed, 63 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h index cd627ec289f1..5d10edde6d17 100644 --- a/arch/riscv/include/asm/asm-prototypes.h +++ b/arch/riscv/include/asm/asm-prototypes.h @@ -12,7 +12,7 @@ long long __ashlti3(long long a, int b); #ifdef CONFIG_RISCV_ISA_V #ifdef CONFIG_MMU -asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n); +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n, bool enable_sum); #endif /* CONFIG_MMU */ void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1, diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 87d01168f80a..046de7ced09c 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -450,35 +450,18 @@ static inline void user_access_restore(unsigned long enabled) { } (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) -#define unsafe_copy_loop(dst, src, len, type, op, label) \ - while (len >= sizeof(type)) { \ - op(*(type *)(src), (type __user *)(dst), label); \ - dst += sizeof(type); \ - src += sizeof(type); \ - len -= sizeof(type); \ - } +unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to, + const void *from, unsigned long n); +unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to, + const void __user *from, unsigned long n); #define unsafe_copy_to_user(_dst, _src, _len, label) \ -do { \ - char __user *__ucu_dst = (_dst); \ - const char *__ucu_src = (_src); \ - size_t __ucu_len = (_len); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, unsafe_put_user, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, unsafe_put_user, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, unsafe_put_user, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, unsafe_put_user, label); \ -} while (0) + if (__asm_copy_to_user_sum_enabled(_dst, _src, _len)) \ + goto label; #define unsafe_copy_from_user(_dst, _src, _len, label) \ -do { \ - char *__ucu_dst = (_dst); \ - const char __user *__ucu_src = (_src); \ - size_t __ucu_len = (_len); \ - unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u64, unsafe_get_user, label); \ - unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u32, unsafe_get_user, label); \ - unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u16, unsafe_get_user, label); \ - unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u8, unsafe_get_user, label); \ -} while (0) + if (__asm_copy_from_user_sum_enabled(_dst, _src, _len)) \ + goto label; #else /* CONFIG_MMU */ #include diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c index be38a93cedae..7bbdfc6d4552 100644 --- a/arch/riscv/lib/riscv_v_helpers.c +++ b/arch/riscv/lib/riscv_v_helpers.c @@ -16,8 +16,11 @@ #ifdef CONFIG_MMU size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD; int __asm_vector_usercopy(void *dst, void *src, size_t n); +int __asm_vector_usercopy_sum_enabled(void *dst, void *src, size_t n); int fallback_scalar_usercopy(void *dst, void *src, size_t n); -asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) +int fallback_scalar_usercopy_sum_enabled(void *dst, void *src, size_t n); +asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n, + bool enable_sum) { size_t remain, copied; @@ -26,7 +29,8 @@ asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) goto fallback; kernel_vector_begin(); - remain = __asm_vector_usercopy(dst, src, n); + remain = enable_sum ? __asm_vector_usercopy(dst, src, n) : + __asm_vector_usercopy_sum_enabled(dst, src, n); kernel_vector_end(); if (remain) { @@ -40,6 +44,7 @@ asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n) return remain; fallback: - return fallback_scalar_usercopy(dst, src, n); + return enable_sum ? fallback_scalar_usercopy(dst, src, n) : + fallback_scalar_usercopy_sum_enabled(dst, src, n); } #endif diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S index 6a9f116bb545..4efea1b3326c 100644 --- a/arch/riscv/lib/uaccess.S +++ b/arch/riscv/lib/uaccess.S @@ -17,14 +17,43 @@ SYM_FUNC_START(__asm_copy_to_user) ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V) REG_L t0, riscv_v_usercopy_threshold bltu a2, t0, fallback_scalar_usercopy - tail enter_vector_usercopy + li a3, 1 + tail enter_vector_usercopy #endif -SYM_FUNC_START(fallback_scalar_usercopy) +SYM_FUNC_END(__asm_copy_to_user) +EXPORT_SYMBOL(__asm_copy_to_user) +SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) +EXPORT_SYMBOL(__asm_copy_from_user) +SYM_FUNC_START(fallback_scalar_usercopy) /* Enable access to user memory */ - li t6, SR_SUM - csrs CSR_STATUS, t6 + li t6, SR_SUM + csrs CSR_STATUS, t6 + mv t6, ra + call fallback_scalar_usercopy_sum_enabled + + /* Disable access to user memory */ + mv ra, t6 + li t6, SR_SUM + csrc CSR_STATUS, t6 + ret +SYM_FUNC_END(fallback_scalar_usercopy) + +SYM_FUNC_START(__asm_copy_to_user_sum_enabled) +#ifdef CONFIG_RISCV_ISA_V + ALTERNATIVE("j fallback_scalar_usercopy_sum_enabled", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V) + REG_L t0, riscv_v_usercopy_threshold + bltu a2, t0, fallback_scalar_usercopy_sum_enabled + li a3, 0 + tail enter_vector_usercopy +#endif +SYM_FUNC_END(__asm_copy_to_user_sum_enabled) +SYM_FUNC_ALIAS(__asm_copy_from_user_sum_enabled, __asm_copy_to_user_sum_enabled) +EXPORT_SYMBOL(__asm_copy_from_user_sum_enabled) +EXPORT_SYMBOL(__asm_copy_to_user_sum_enabled) + +SYM_FUNC_START(fallback_scalar_usercopy_sum_enabled) /* * Save the terminal address which will be used to compute the number * of bytes copied in case of a fixup exception. @@ -178,23 +207,12 @@ SYM_FUNC_START(fallback_scalar_usercopy) bltu a0, t0, 4b /* t0 - end of dst */ .Lout_copy_user: - /* Disable access to user memory */ - csrc CSR_STATUS, t6 li a0, 0 ret - - /* Exception fixup code */ 10: - /* Disable access to user memory */ - csrc CSR_STATUS, t6 sub a0, t5, a0 ret -SYM_FUNC_END(__asm_copy_to_user) -SYM_FUNC_END(fallback_scalar_usercopy) -EXPORT_SYMBOL(__asm_copy_to_user) -SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user) -EXPORT_SYMBOL(__asm_copy_from_user) - +SYM_FUNC_END(fallback_scalar_usercopy_sum_enabled) SYM_FUNC_START(__clear_user) diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S index 7c45f26de4f7..03b5560609a2 100644 --- a/arch/riscv/lib/uaccess_vector.S +++ b/arch/riscv/lib/uaccess_vector.S @@ -24,7 +24,18 @@ SYM_FUNC_START(__asm_vector_usercopy) /* Enable access to user memory */ li t6, SR_SUM csrs CSR_STATUS, t6 + mv t6, ra + call __asm_vector_usercopy_sum_enabled + + /* Disable access to user memory */ + mv ra, t6 + li t6, SR_SUM + csrc CSR_STATUS, t6 + ret +SYM_FUNC_END(__asm_vector_usercopy) + +SYM_FUNC_START(__asm_vector_usercopy_sum_enabled) loop: vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma fixup vle8.v vData, (pSrc), 10f @@ -36,8 +47,6 @@ loop: /* Exception fixup for vector load is shared with normal exit */ 10: - /* Disable access to user memory */ - csrc CSR_STATUS, t6 mv a0, iNum ret @@ -49,4 +58,4 @@ loop: csrr t2, CSR_VSTART sub iNum, iNum, t2 j 10b -SYM_FUNC_END(__asm_vector_usercopy) +SYM_FUNC_END(__asm_vector_usercopy_sum_enabled) -- cgit v1.2.3 From 020667d661f9be65167174d28a6eda7102f7293f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 2 Jun 2025 21:39:15 +0200 Subject: riscv: process: use unsigned int instead of unsigned long for put_user() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The specification of prctl() for GET_UNALIGN_CTL states that the value is returned in an unsigned int * address passed as an unsigned long. Change the type to match that and avoid an unaligned access as well. Signed-off-by: Clément Léger Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250602193918.868962-3-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 7c244de77180..fe10e326f44e 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -57,7 +57,7 @@ int get_unalign_ctl(struct task_struct *tsk, unsigned long adr) if (!unaligned_ctl_available()) return -EINVAL; - return put_user(tsk->thread.align_ctl, (unsigned long __user *)adr); + return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr); } void __show_regs(struct pt_regs *regs) -- cgit v1.2.3 From ca1a66cdd685030738cf077e3955fdedfe39fbb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 2 Jun 2025 21:39:16 +0200 Subject: riscv: uaccess: do not do misaligned accesses in get/put_user() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doing misaligned access to userspace memory would make a trap on platform where it is emulated. Latest fixes removed the kernel capability to do unaligned accesses to userspace memory safely since interrupts are kept disabled at all time during that. Thus doing so would crash the kernel. Such behavior was detected with GET_UNALIGN_CTL() that was doing a put_user() with an unsigned long* address that should have been an unsigned int*. Reenabling kernel misaligned access emulation is a bit risky and it would also degrade performances. Rather than doing that, we will try to avoid any misaligned accessed by using copy_from/to_user() which does not do any misaligned accesses. This can be done only for !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS and thus allows to only generate a bit more code for this config. Signed-off-by: Clément Léger Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250602193918.868962-4-cleger@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 046de7ced09c..d472da4450e6 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -169,8 +169,19 @@ do { \ #endif /* CONFIG_64BIT */ +unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to, + const void *from, unsigned long n); +unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to, + const void __user *from, unsigned long n); + #define __get_user_nocheck(x, __gu_ptr, label) \ do { \ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ + !IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) { \ + if (__asm_copy_from_user_sum_enabled(&(x), __gu_ptr, sizeof(*__gu_ptr))) \ + goto label; \ + break; \ + } \ switch (sizeof(*__gu_ptr)) { \ case 1: \ __get_user_asm("lb", (x), __gu_ptr, label); \ @@ -297,6 +308,13 @@ do { \ #define __put_user_nocheck(x, __gu_ptr, label) \ do { \ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ + !IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) { \ + __inttype(x) val = (__inttype(x))x; \ + if (__asm_copy_to_user_sum_enabled(__gu_ptr, &(val), sizeof(*__gu_ptr))) \ + goto label; \ + break; \ + } \ switch (sizeof(*__gu_ptr)) { \ case 1: \ __put_user_asm("sb", (x), __gu_ptr, label); \ @@ -450,11 +468,6 @@ static inline void user_access_restore(unsigned long enabled) { } (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) -unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to, - const void *from, unsigned long n); -unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to, - const void __user *from, unsigned long n); - #define unsafe_copy_to_user(_dst, _src, _len, label) \ if (__asm_copy_to_user_sum_enabled(_dst, _src, _len)) \ goto label; -- cgit v1.2.3 From c39d53750ff96b282c869a0184a7c3ecfd298ca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= Date: Thu, 1 May 2025 15:03:09 +0200 Subject: riscv: Improve Kconfig help for RISCV_ISA_V_PREEMPTIVE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a couple of spelling issues plus some minor details on the grammar. Signed-off-by: Miquel Sabaté Solà Link: https://lore.kernel.org/r/20250501130309.14803-1-mikisabate@gmail.com Reviewed-by: Alexandre Ghiti Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a93af30727ee..98a3ecdc65f6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -669,12 +669,12 @@ config RISCV_ISA_V_PREEMPTIVE default y help Usually, in-kernel SIMD routines are run with preemption disabled. - Functions which envoke long running SIMD thus must yield core's + Functions which invoke long running SIMD thus must yield the core's vector unit to prevent blocking other tasks for too long. - This config allows kernel to run SIMD without explicitly disable - preemption. Enabling this config will result in higher memory - consumption due to the allocation of per-task's kernel Vector context. + This config allows the kernel to run SIMD without explicitly disabling + preemption. Enabling this config will result in higher memory consumption + due to the allocation of per-task's kernel Vector context. config RISCV_ISA_ZAWRS bool "Zawrs extension support for more efficient busy waiting" -- cgit v1.2.3 From a56972698810089d8f1bdc296cd709726db7176b Mon Sep 17 00:00:00 2001 From: Mayuresh Chitale Date: Tue, 2 Jul 2024 15:56:37 +0530 Subject: riscv: mm: Add support for Svinval extension The Svinval extension splits SFENCE.VMA instruction into finer-grained invalidation and ordering operations and is mandatory for RVA23S64 profile. When Svinval is enabled the local_flush_tlb_range_threshold_asid function should use the following sequence to optimize the tlb flushes instead of a simple sfence.vma: sfence.w.inval svinval.vma . . svinval.vma sfence.inval.ir The maximum number of consecutive svinval.vma instructions that can be executed in local_flush_tlb_range_threshold_asid function is limited to 64. This is required to avoid soft lockups and the approach is similar to that used in arm64. Signed-off-by: Mayuresh Chitale Reviewed-by: Andrew Jones Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20240702102637.9074-1-mchitale@ventanamicro.com Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/tlbflush.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'arch') diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index f9e27ba1df99..6289ed5c7eb4 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -7,6 +7,27 @@ #include #include #include +#include + +#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) + +static inline void local_sfence_inval_ir(void) +{ + asm volatile(SFENCE_INVAL_IR() ::: "memory"); +} + +static inline void local_sfence_w_inval(void) +{ + asm volatile(SFENCE_W_INVAL() ::: "memory"); +} + +static inline void local_sinval_vma(unsigned long vma, unsigned long asid) +{ + if (asid != FLUSH_TLB_NO_ASID) + asm volatile(SINVAL_VMA(%0, %1) : : "r" (vma), "r" (asid) : "memory"); + else + asm volatile(SINVAL_VMA(%0, zero) : : "r" (vma) : "memory"); +} /* * Flush entire TLB if number of entries to be flushed is greater @@ -27,6 +48,16 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start, return; } + if (has_svinval()) { + local_sfence_w_inval(); + for (i = 0; i < nr_ptes_in_range; ++i) { + local_sinval_vma(start, asid); + start += stride; + } + local_sfence_inval_ir(); + return; + } + for (i = 0; i < nr_ptes_in_range; ++i) { local_flush_tlb_page_asid(start, asid); start += stride; -- cgit v1.2.3 From a869b8c29f864b9530a6473a30c09546333b571a Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Sat, 26 Apr 2025 21:59:54 +0800 Subject: riscv: enable mseal sysmap for RV64 Provide support for CONFIG_MSEAL_SYSTEM_MAPPINGS for RV64, covering the vdso, vvar. Passed sysmap_is_sealed and mseal_test self tests. Passed booting a buildroot rootfs image and a cli debian rootfs image. Signed-off-by: Jisheng Zhang Cc: Jeff Xu Link: https://lore.kernel.org/r/20250426135954.5614-1-jszhang@kernel.org Tested-by: Alexandre Ghiti Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/kernel/vdso.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index bbec87b79309..3cb0b05eef62 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -70,6 +70,7 @@ config RISCV # LLD >= 14: https://github.com/llvm/llvm-project/issues/50505 select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000 select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000 + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS if 64BIT && MMU select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index cc2895d1fbc2..3a8e038b10a2 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -136,7 +136,7 @@ static int __setup_additional_pages(struct mm_struct *mm, ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), + (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_SEALED_SYSMAP), vdso_info->cm); if (IS_ERR(ret)) -- cgit v1.2.3 From ee0d03053e7009a3a3532fb37f6c94bfa0a8cca3 Mon Sep 17 00:00:00 2001 From: Xi Ruoyao Date: Fri, 11 Apr 2025 10:46:00 +0800 Subject: RISC-V: vDSO: Wire up getrandom() vDSO implementation Hook up the generic vDSO implementation to the generic vDSO getrandom implementation by providing the required __arch_chacha20_blocks_nostack and getrandom_syscall implementations. Also wire up the selftests. The benchmark result: vdso: 25000000 times in 2.466341333 seconds libc: 25000000 times in 41.447720005 seconds syscall: 25000000 times in 41.043926672 seconds vdso: 25000000 x 256 times in 162.286219353 seconds libc: 25000000 x 256 times in 2953.855018685 seconds syscall: 25000000 x 256 times in 2796.268546000 seconds [ alex: - Fix dynamic relocation - Squash Nathan's fix https://lore.kernel.org/all/20250423-riscv-fix-compat_vdso-lld-v2-1-b7bbbc244501@kernel.org/ - Add comment from Loongarch ] Signed-off-by: Xi Ruoyao Link: https://lore.kernel.org/r/20250411024600.16045-1-xry111@xry111.site Tested-by: Alexandre Ghiti Signed-off-by: Alexandre Ghiti Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/vdso/getrandom.h | 30 ++++ arch/riscv/kernel/vdso/Makefile | 13 ++ arch/riscv/kernel/vdso/getrandom.c | 10 ++ arch/riscv/kernel/vdso/vdso.lds.S | 3 + arch/riscv/kernel/vdso/vgetrandom-chacha.S | 249 +++++++++++++++++++++++++++++ 6 files changed, 306 insertions(+) create mode 100644 arch/riscv/include/asm/vdso/getrandom.h create mode 100644 arch/riscv/kernel/vdso/getrandom.c create mode 100644 arch/riscv/kernel/vdso/vgetrandom-chacha.S (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 3cb0b05eef62..b5b3ead92f64 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -219,6 +219,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..8dc92441702a --- /dev/null +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include + +static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags) +{ + register long ret asm("a0"); + register long nr asm("a7") = __NR_getrandom; + register void *buffer asm("a0") = _buffer; + register size_t len asm("a1") = _len; + register unsigned int flags asm("a2") = _flags; + + asm volatile ("ecall\n" + : "+r" (ret) + : "r" (nr), "r" (buffer), "r" (len), "r" (flags) + : "memory"); + + return ret; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index ad73607abc28..dca888852d93 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -13,9 +13,17 @@ vdso-syms += flush_icache vdso-syms += hwprobe vdso-syms += sys_hwprobe +ifdef CONFIG_VDSO_GETRANDOM +vdso-syms += getrandom +endif + # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o +ifdef CONFIG_VDSO_GETRANDOM +obj-vdso += vgetrandom-chacha.o +endif + ccflags-y := -fno-stack-protector ccflags-y += -DDISABLE_BRANCH_PROFILING ccflags-y += -fno-builtin @@ -24,6 +32,10 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif +ifneq ($(c-getrandom-y),) + CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y) +endif + CFLAGS_hwprobe.o += -fPIC # Build rules @@ -38,6 +50,7 @@ endif # Disable -pg to prevent insert call site CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_getrandom.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) # Force dependency diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c new file mode 100644 index 000000000000..f21922e8cebd --- /dev/null +++ b/arch/riscv/kernel/vdso/getrandom.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. + */ +#include + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 8e86965a8aae..7c15b0f4ee3b 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -79,6 +79,9 @@ VERSION __vdso_flush_icache; #ifndef COMPAT_VDSO __vdso_riscv_hwprobe; +#endif +#if defined(CONFIG_VDSO_GETRANDOM) && !defined(COMPAT_VDSO) + __vdso_getrandom; #endif local: *; }; diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..5f0dad8f2373 --- /dev/null +++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Xi Ruoyao . All Rights Reserved. + * + * Based on arch/loongarch/vdso/vgetrandom-chacha.S. + */ + +#include +#include + +.text + +.macro ROTRI rd rs imm + slliw t0, \rs, 32 - \imm + srliw \rd, \rs, \imm + or \rd, \rd, t0 +.endm + +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s1 + \op \d2, \d2, \s2 + \op \d3, \d3, \s3 +.endm + +/* + * a0: output bytes + * a1: 32-byte key input + * a2: 8-byte counter input/output + * a3: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +#define output a0 +#define key a1 +#define counter a2 +#define nblocks a3 +#define i a4 +#define state0 s0 +#define state1 s1 +#define state2 s2 +#define state3 s3 +#define state4 s4 +#define state5 s5 +#define state6 s6 +#define state7 s7 +#define state8 s8 +#define state9 s9 +#define state10 s10 +#define state11 s11 +#define state12 a5 +#define state13 a6 +#define state14 a7 +#define state15 t1 +#define cnt t2 +#define copy0 t3 +#define copy1 t4 +#define copy2 t5 +#define copy3 t6 + +/* Packs to be used with OP_4REG */ +#define line0 state0, state1, state2, state3 +#define line1 state4, state5, state6, state7 +#define line2 state8, state9, state10, state11 +#define line3 state12, state13, state14, state15 + +#define line1_perm state5, state6, state7, state4 +#define line2_perm state10, state11, state8, state9 +#define line3_perm state15, state12, state13, state14 + +#define copy copy0, copy1, copy2, copy3 + +#define _16 16, 16, 16, 16 +#define _20 20, 20, 20, 20 +#define _24 24, 24, 24, 24 +#define _25 25, 25, 25, 25 + + /* + * The ABI requires s0-s9 saved. + * This does not violate the stack-less requirement: no sensitive data + * is spilled onto the stack. + */ + addi sp, sp, -12*SZREG + REG_S s0, (sp) + REG_S s1, SZREG(sp) + REG_S s2, 2*SZREG(sp) + REG_S s3, 3*SZREG(sp) + REG_S s4, 4*SZREG(sp) + REG_S s5, 5*SZREG(sp) + REG_S s6, 6*SZREG(sp) + REG_S s7, 7*SZREG(sp) + REG_S s8, 8*SZREG(sp) + REG_S s9, 9*SZREG(sp) + REG_S s10, 10*SZREG(sp) + REG_S s11, 11*SZREG(sp) + + ld cnt, (counter) + + li copy0, 0x61707865 + li copy1, 0x3320646e + li copy2, 0x79622d32 + li copy3, 0x6b206574 + +.Lblock: + /* state[0,1,2,3] = "expand 32-byte k" */ + mv state0, copy0 + mv state1, copy1 + mv state2, copy2 + mv state3, copy3 + + /* state[4,5,..,11] = key */ + lw state4, (key) + lw state5, 4(key) + lw state6, 8(key) + lw state7, 12(key) + lw state8, 16(key) + lw state9, 20(key) + lw state10, 24(key) + lw state11, 28(key) + + /* state[12,13] = counter */ + mv state12, cnt + srli state13, cnt, 32 + + /* state[14,15] = 0 */ + mv state14, zero + mv state15, zero + + li i, 10 +.Lpermute: + /* odd round */ + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _16 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _20 + + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _24 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _25 + + /* even round */ + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _16 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _20 + + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _24 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _25 + + addi i, i, -1 + bnez i, .Lpermute + + /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ + OP_4REG addw line0, copy + sw state0, (output) + sw state1, 4(output) + sw state2, 8(output) + sw state3, 12(output) + + /* from now on state[0,1,2,3] are scratch registers */ + + /* state[0,1,2,3] = lo(key) */ + lw state0, (key) + lw state1, 4(key) + lw state2, 8(key) + lw state3, 12(key) + + /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ + OP_4REG addw line1, line0 + sw state4, 16(output) + sw state5, 20(output) + sw state6, 24(output) + sw state7, 28(output) + + /* state[0,1,2,3] = hi(key) */ + lw state0, 16(key) + lw state1, 20(key) + lw state2, 24(key) + lw state3, 28(key) + + /* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */ + OP_4REG addw line2, line0 + sw state8, 32(output) + sw state9, 36(output) + sw state10, 40(output) + sw state11, 44(output) + + /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ + addw state12, state12, cnt + srli state0, cnt, 32 + addw state13, state13, state0 + sw state12, 48(output) + sw state13, 52(output) + sw state14, 56(output) + sw state15, 60(output) + + /* ++counter */ + addi cnt, cnt, 1 + + /* output += 64 */ + addi output, output, 64 + /* --nblocks */ + addi nblocks, nblocks, -1 + bnez nblocks, .Lblock + + /* counter = [cnt_lo, cnt_hi] */ + sd cnt, (counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these + * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and + * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we + * only need to zero state[12,...,15]. + */ + mv state12, zero + mv state13, zero + mv state14, zero + mv state15, zero + + REG_L s0, (sp) + REG_L s1, SZREG(sp) + REG_L s2, 2*SZREG(sp) + REG_L s3, 3*SZREG(sp) + REG_L s4, 4*SZREG(sp) + REG_L s5, 5*SZREG(sp) + REG_L s6, 6*SZREG(sp) + REG_L s7, 7*SZREG(sp) + REG_L s8, 8*SZREG(sp) + REG_L s9, 9*SZREG(sp) + REG_L s10, 10*SZREG(sp) + REG_L s11, 11*SZREG(sp) + addi sp, sp, 12*SZREG + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) -- cgit v1.2.3 From 265d6aba165c500389c80d394ac247460c443ef5 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Mon, 2 Jun 2025 12:15:43 +0000 Subject: riscv: uaccess: Only restore the CSR_STATUS SUM bit During switch to csrs will OR the value of the register into the corresponding csr. In this case we're only interested in restoring the SUM bit not the entire register. Signed-off-by: Cyril Bur Link: https://lore.kernel.org/r/20250522160954.429333-1-cyrilbur@tenstorrent.com Co-developed-by: Alexandre Ghiti Signed-off-by: Alexandre Ghiti Fixes: 788aa64c01f1 ("riscv: save the SR_SUM status over switches") Link: https://lore.kernel.org/r/20250602121543.1544278-1-alexghiti@rivosinc.com Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/processor.h | 2 +- arch/riscv/kernel/asm-offsets.c | 6 +++--- arch/riscv/kernel/entry.S | 9 +++++---- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h index 8111250f3c1b..24d3af4d3807 100644 --- a/arch/riscv/include/asm/processor.h +++ b/arch/riscv/include/asm/processor.h @@ -110,7 +110,7 @@ struct thread_struct { struct __riscv_d_ext_state fstate; unsigned long bad_cause; unsigned long envcfg; - unsigned long status; + unsigned long sum; u32 riscv_v_flags; u32 vstate_ctrl; struct __riscv_v_ext_state vstate; diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c index 9420ec6a50fd..6e8c0d6feae9 100644 --- a/arch/riscv/kernel/asm-offsets.c +++ b/arch/riscv/kernel/asm-offsets.c @@ -34,7 +34,7 @@ void asm_offsets(void) OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]); OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]); OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]); - OFFSET(TASK_THREAD_STATUS, task_struct, thread.status); + OFFSET(TASK_THREAD_SUM, task_struct, thread.sum); OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu); OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count); @@ -347,8 +347,8 @@ void asm_offsets(void) offsetof(struct task_struct, thread.s[11]) - offsetof(struct task_struct, thread.ra) ); - DEFINE(TASK_THREAD_STATUS_RA, - offsetof(struct task_struct, thread.status) + DEFINE(TASK_THREAD_SUM_RA, + offsetof(struct task_struct, thread.sum) - offsetof(struct task_struct, thread.ra) ); diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 00bd0de9faa2..a49e19ce3a97 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -399,14 +399,15 @@ SYM_FUNC_START(__switch_to) REG_S s11, TASK_THREAD_S11_RA(a3) /* save the user space access flag */ - li s0, SR_SUM - csrr s1, CSR_STATUS - REG_S s1, TASK_THREAD_STATUS_RA(a3) + csrr s0, CSR_STATUS + REG_S s0, TASK_THREAD_SUM_RA(a3) /* Save the kernel shadow call stack pointer */ scs_save_current /* Restore context from next->thread */ - REG_L s0, TASK_THREAD_STATUS_RA(a4) + REG_L s0, TASK_THREAD_SUM_RA(a4) + li s1, SR_SUM + and s0, s0, s1 csrs CSR_STATUS, s0 REG_L ra, TASK_THREAD_RA_RA(a4) REG_L sp, TASK_THREAD_SP_RA(a4) -- cgit v1.2.3 From 15ac613f124e51a6623975efad9657b1f3ee47e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 19 May 2025 15:56:57 +0100 Subject: KVM: s390: rename PROT_NONE to PROT_TYPE_DUMMY The enum type prot_type declared in arch/s390/kvm/gaccess.c declares an unfortunate identifier within it - PROT_NONE. This clashes with the protection bit define from the uapi for mmap() declared in include/uapi/asm-generic/mman-common.h, which is indeed what those casually reading this code would assume this to refer to. This means that any changes which subsequently alter headers in any way which results in the uapi header being imported here will cause build errors. Resolve the issue by renaming PROT_NONE to PROT_TYPE_DUMMY. Link: https://lkml.kernel.org/r/20250519145657.178365-1-lorenzo.stoakes@oracle.com Fixes: b3cefd6bf16e ("KVM: s390: Pass initialized arg even if unused") Signed-off-by: Lorenzo Stoakes Suggested-by: Ignacio Moreno Gonzalez Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505140943.IgHDa9s7-lkp@intel.com/ Acked-by: Christian Borntraeger Acked-by: Ignacio Moreno Gonzalez Acked-by: Yang Shi Reviewed-by: David Hildenbrand Acked-by: Liam R. Howlett Reviewed-by: Oscar Salvador Reviewed-by: Claudio Imbrenda Cc: Cc: Alexander Gordeev Cc: Heiko Carstens Cc: James Houghton Cc: Janosch Frank Cc: Matthew Wilcox (Oracle) Cc: Paolo Bonzini Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/kvm/gaccess.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index e23670e1949c..21c2e61fece4 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -319,7 +319,7 @@ enum prot_type { PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ - PROT_NONE, + PROT_TYPE_DUMMY, }; static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, @@ -335,7 +335,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { - case PROT_NONE: + case PROT_TYPE_DUMMY: /* We should never get here, acts like termination */ WARN_ON_ONCE(1); break; @@ -805,7 +805,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, gpa = kvm_s390_real_to_abs(vcpu, ga); if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; } } if (rc) @@ -963,7 +963,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc == PGM_PROTECTION) prot = PROT_TYPE_KEYC; else - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } out_unlock: -- cgit v1.2.3 From e242bbbb6d7ac7556aa1e358294dc7e3c82cc902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Thu, 5 Jun 2025 20:34:18 +0800 Subject: LoongArch: vDSO: Correctly use asm parameters in syscall wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The syscall wrappers use the "a0" register for two different register variables, both the first argument and the return value. Here the "ret" variable is used as both input and output while the argument register is only used as input. Clang treats the conflicting input parameters as an undefined behaviour and optimizes away the argument assignment. The code seems to work by chance for the most part today but that may change in the future. Specifically clock_gettime_fallback() fails with clockids from 16 to 23, as implemented by the upcoming auxiliary clocks. Switch the "ret" register variable to a pure output, similar to the other architectures' vDSO code. This works in both clang and GCC. Link: https://lore.kernel.org/lkml/20250602102825-42aa84f0-23f1-4d10-89fc-e8bbaffd291a@linutronix.de/ Link: https://lore.kernel.org/lkml/20250519082042.742926976@linutronix.de/ Fixes: c6b99bed6b8f ("LoongArch: Add VDSO and VSYSCALL support") Fixes: 18efd0b10e0f ("LoongArch: vDSO: Wire up getrandom() vDSO implementation") Cc: stable@vger.kernel.org Reviewed-by: Nathan Chancellor Reviewed-by: Yanteng Si Reviewed-by: WANG Xuerui Reviewed-by: Xi Ruoyao Signed-off-by: Thomas Weißschuh Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/vdso/getrandom.h | 2 +- arch/loongarch/include/asm/vdso/gettimeofday.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h index 48c43f55b039..a81724b69f29 100644 --- a/arch/loongarch/include/asm/vdso/getrandom.h +++ b/arch/loongarch/include/asm/vdso/getrandom.h @@ -20,7 +20,7 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns asm volatile( " syscall 0\n" - : "+r" (ret) + : "=r" (ret) : "r" (nr), "r" (buffer), "r" (len), "r" (flags) : "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8", "memory"); diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h index 88cfcf133116..f15503e3336c 100644 --- a/arch/loongarch/include/asm/vdso/gettimeofday.h +++ b/arch/loongarch/include/asm/vdso/gettimeofday.h @@ -25,7 +25,7 @@ static __always_inline long gettimeofday_fallback( asm volatile( " syscall 0\n" - : "+r" (ret) + : "=r" (ret) : "r" (nr), "r" (tv), "r" (tz) : "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8", "memory"); @@ -44,7 +44,7 @@ static __always_inline long clock_gettime_fallback( asm volatile( " syscall 0\n" - : "+r" (ret) + : "=r" (ret) : "r" (nr), "r" (clkid), "r" (ts) : "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8", "memory"); @@ -63,7 +63,7 @@ static __always_inline int clock_getres_fallback( asm volatile( " syscall 0\n" - : "+r" (ret) + : "=r" (ret) : "r" (nr), "r" (clkid), "r" (ts) : "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8", "memory"); -- cgit v1.2.3 From 07aeb50e6c74e59c268ad6460c6b51411bea1759 Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 5 Jun 2025 20:34:34 +0800 Subject: LoongArch: dts: Add PWM support to Loongson-2K0500 The module is supported, enable it. Reviewed-by: Yanteng Si Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k0500.dtsi | 160 +++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) (limited to 'arch') diff --git a/arch/loongarch/boot/dts/loongson-2k0500.dtsi b/arch/loongarch/boot/dts/loongson-2k0500.dtsi index 3b38ff8853a7..760c60eebb89 100644 --- a/arch/loongarch/boot/dts/loongson-2k0500.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k0500.dtsi @@ -169,6 +169,166 @@ interrupts = <3>; }; + pwm@1ff5c000 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c000 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c010 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c010 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c020 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c020 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c030 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c030 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c040 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c040 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <25 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c050 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c050 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <25 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c060 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c060 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <25 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c070 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c070 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <25 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c080 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c080 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <26 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c090 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c090 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <26 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c0a0 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c0a0 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <26 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c0b0 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c0b0 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <26 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c0c0 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c0c0 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c0d0 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c0d0 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c0e0 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c0e0 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1ff5c0f0 { + compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1ff5c0f0 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + gmac0: ethernet@1f020000 { compatible = "snps,dwmac-3.70a"; reg = <0x0 0x1f020000 0x0 0x10000>; -- cgit v1.2.3 From abd000dbaee45fa620ddaeb13757b9a8985ce50f Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 5 Jun 2025 20:34:34 +0800 Subject: LoongArch: dts: Add PWM support to Loongson-2K1000 The module is supported, enable it. Also, add the pwm-fan and cooling-maps associated with it. Reviewed-by: Yanteng Si Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k1000-ref.dts | 24 ++++++++++++++ arch/loongarch/boot/dts/loongson-2k1000.dtsi | 42 ++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts index 3514ea78f525..78ea995abf1c 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts +++ b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts @@ -5,6 +5,7 @@ /dts-v1/; +#include "dt-bindings/thermal/thermal.h" #include "loongson-2k1000.dtsi" / { @@ -38,6 +39,13 @@ linux,cma-default; }; }; + + fan0: pwm-fan { + compatible = "pwm-fan"; + cooling-levels = <255 153 85 25>; + pwms = <&pwm1 0 100000 0>; + #cooling-cells = <2>; + }; }; &gmac0 { @@ -92,6 +100,22 @@ #size-cells = <0>; }; +&pwm1 { + status = "okay"; + + pinctrl-0 = <&pwm1_pins_default>; + pinctrl-names = "default"; +}; + +&cpu_thermal { + cooling-maps { + map0 { + trip = <&cpu_alert>; + cooling-device = <&fan0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>; + }; + }; +}; + &ehci0 { status = "okay"; }; diff --git a/arch/loongarch/boot/dts/loongson-2k1000.dtsi b/arch/loongarch/boot/dts/loongson-2k1000.dtsi index 8dff2aa52417..1da3beb00f0e 100644 --- a/arch/loongarch/boot/dts/loongson-2k1000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k1000.dtsi @@ -68,7 +68,7 @@ }; thermal-zones { - cpu-thermal { + cpu_thermal: cpu-thermal { polling-delay-passive = <1000>; polling-delay = <5000>; thermal-sensors = <&tsensor 0>; @@ -322,6 +322,46 @@ status = "disabled"; }; + pwm@1fe22000 { + compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1fe22000 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm1: pwm@1fe22010 { + compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1fe22010 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <25 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1fe22020 { + compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1fe22020 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <26 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@1fe22030 { + compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x1fe22030 0x0 0x10>; + interrupt-parent = <&liointc0>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_APB_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + pmc: power-management@1fe27000 { compatible = "loongson,ls2k1000-pmc", "loongson,ls2k0500-pmc", "syscon"; reg = <0x0 0x1fe27000 0x0 0x58>; -- cgit v1.2.3 From 1cf806006574b0e874fb39a18b1b26559770fb1c Mon Sep 17 00:00:00 2001 From: Binbin Zhou Date: Thu, 5 Jun 2025 20:34:34 +0800 Subject: LoongArch: dts: Add PWM support to Loongson-2K2000 The module is supported, enable it. Reviewed-by: Yanteng Si Signed-off-by: Binbin Zhou Signed-off-by: Huacai Chen --- arch/loongarch/boot/dts/loongson-2k2000.dtsi | 60 ++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) (limited to 'arch') diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi index b4ff55a33e90..9e0411f2754c 100644 --- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi +++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi @@ -165,6 +165,66 @@ interrupt-parent = <&eiointc>; }; + pwm@100a0000 { + compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x100a0000 0x0 0x10>; + interrupt-parent = <&pic>; + interrupts = <24 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_MISC_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@100a0100 { + compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x100a0100 0x0 0x10>; + interrupt-parent = <&pic>; + interrupts = <25 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_MISC_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@100a0200 { + compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x100a0200 0x0 0x10>; + interrupt-parent = <&pic>; + interrupts = <26 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_MISC_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@100a0300 { + compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x100a0300 0x0 0x10>; + interrupt-parent = <&pic>; + interrupts = <27 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_MISC_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@100a0400 { + compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x100a0400 0x0 0x10>; + interrupt-parent = <&pic>; + interrupts = <38 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_MISC_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + + pwm@100a0500 { + compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm"; + reg = <0x0 0x100a0500 0x0 0x10>; + interrupt-parent = <&pic>; + interrupts = <39 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&clk LOONGSON2_MISC_CLK>; + #pwm-cells = <3>; + status = "disabled"; + }; + rtc0: rtc@100d0100 { compatible = "loongson,ls2k2000-rtc", "loongson,ls7a-rtc"; reg = <0x0 0x100d0100 0x0 0x100>; -- cgit v1.2.3 From e21efe833eae4e2a56c2c2a11caae870a65926fa Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 3 Jun 2025 03:12:54 +0900 Subject: arch: use always-$(KBUILD_BUILTIN) for vmlinux.lds The extra-y syntax is deprecated. Instead, use always-$(KBUILD_BUILTIN), which behaves equivalently. Signed-off-by: Masahiro Yamada Acked-by: Johannes Berg Reviewed-by: Nicolas Schier --- arch/alpha/kernel/Makefile | 2 +- arch/arc/kernel/Makefile | 2 +- arch/arm/kernel/Makefile | 2 +- arch/arm64/kernel/Makefile | 2 +- arch/csky/kernel/Makefile | 2 +- arch/hexagon/kernel/Makefile | 2 +- arch/loongarch/kernel/Makefile | 2 +- arch/m68k/kernel/Makefile | 2 +- arch/microblaze/kernel/Makefile | 2 +- arch/mips/kernel/Makefile | 2 +- arch/nios2/kernel/Makefile | 2 +- arch/openrisc/kernel/Makefile | 2 +- arch/parisc/kernel/Makefile | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/riscv/kernel/Makefile | 2 +- arch/s390/kernel/Makefile | 2 +- arch/sh/kernel/Makefile | 2 +- arch/sparc/kernel/Makefile | 2 +- arch/um/kernel/Makefile | 2 +- arch/x86/kernel/Makefile | 2 +- arch/xtensa/kernel/Makefile | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile index b6c862dff1f6..187cd8df2faf 100644 --- a/arch/alpha/kernel/Makefile +++ b/arch/alpha/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds asflags-y := $(KBUILD_CFLAGS) ccflags-y := -Wno-sign-compare diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile index 95fbf9364c67..fa94fff02419 100644 --- a/arch/arc/kernel/Makefile +++ b/arch/arc/kernel/Makefile @@ -26,4 +26,4 @@ ifdef CONFIG_ISA_ARCOMPACT CFLAGS_fpu.o += -mdpfp endif -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index b3333d070390..afc9de7ef9a1 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -104,4 +104,4 @@ obj-$(CONFIG_HAVE_ARM_SMCCC) += smccc-call.o obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += spectre.o -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 71c29a2a2f19..2920b0a51403 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -78,7 +78,7 @@ $(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so obj-y += probes/ obj-y += head.o -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds ifeq ($(CONFIG_DEBUG_EFI),y) AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\"" diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile index de1c3472e8f0..a406a4ac2378 100644 --- a/arch/csky/kernel/Makefile +++ b/arch/csky/kernel/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y += head.o entry.o atomic.o signal.o traps.o irq.o time.o vdso.o vdso/ obj-y += power.o syscall.o syscall_table.o setup.o diff --git a/arch/hexagon/kernel/Makefile b/arch/hexagon/kernel/Makefile index 3fdf937eb572..8e0fb4a62315 100644 --- a/arch/hexagon/kernel/Makefile +++ b/arch/hexagon/kernel/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y += head.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index f9dcaa60033d..6f5a4574a911 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -5,7 +5,7 @@ OBJECT_FILES_NON_STANDARD_head.o := y -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y += head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \ traps.o irq.o idle.o process.o dma.o mem.o reset.o switch.o \ diff --git a/arch/m68k/kernel/Makefile b/arch/m68k/kernel/Makefile index 6c732ed3998b..57c1b3e8d60e 100644 --- a/arch/m68k/kernel/Makefile +++ b/arch/m68k/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-$(CONFIG_MMU_MOTOROLA) := head.o obj-$(CONFIG_SUN3) := sun3-head.o diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile index 85c4d29ef43e..241e466e7333 100644 --- a/arch/microblaze/kernel/Makefile +++ b/arch/microblaze/kernel/Makefile @@ -11,7 +11,7 @@ CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_process.o = -pg endif -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y += head.o dma.o exceptions.o \ hw_exception_handler.o irq.o \ diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile index ecf3278a32f7..95a1e674fd67 100644 --- a/arch/mips/kernel/Makefile +++ b/arch/mips/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux/MIPS kernel. # -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y += head.o branch.o cmpxchg.o elf.o entry.o genex.o idle.o irq.o \ process.o prom.o ptrace.o reset.o setup.o signal.o \ diff --git a/arch/nios2/kernel/Makefile b/arch/nios2/kernel/Makefile index 78a913181fa1..4dce965a7b73 100644 --- a/arch/nios2/kernel/Makefile +++ b/arch/nios2/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the nios2 linux kernel. # -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-y += head.o obj-y += cpuinfo.o diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile index e4c7d9bdd598..58e6a1b525b7 100644 --- a/arch/openrisc/kernel/Makefile +++ b/arch/openrisc/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y := head.o setup.o or32_ksyms.o process.o dma.o \ traps.o time.o irq.o entry.o ptrace.o signal.o \ diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile index 5ab0467be70a..d5055ba33722 100644 --- a/arch/parisc/kernel/Makefile +++ b/arch/parisc/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for arch/parisc/kernel # -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y := head.o cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \ syscall.o entry.o sys_parisc.o firmware.o \ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 6ac621155ec3..4d2daa8e7bca 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -126,7 +126,7 @@ obj-$(CONFIG_PPC_BOOK3S_32) += head_book3s_32.o obj-$(CONFIG_44x) += head_44x.o obj-$(CONFIG_PPC_8xx) += head_8xx.o obj-$(CONFIG_PPC_85xx) += head_85xx.o -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index f7480c9c6f8d..48dcaf2efae8 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -43,7 +43,7 @@ CFLAGS_sbi_ecall.o += -D__NO_FORTIFY endif endif -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-y += head.o obj-y += soc.o diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index db5f3a3faefb..ea5ed6654050 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -46,7 +46,7 @@ obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o obj-y += smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o obj-y += diag/ -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile index 7b453592adaf..5ef123bc63f8 100644 --- a/arch/sh/kernel/Makefile +++ b/arch/sh/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux/SuperH kernel. # -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 58ea4ef9b622..2859842d6bb7 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -9,7 +9,7 @@ asflags-y := -ansi # Undefine sparc when processing vmlinux.lds - it is used # And teach CPP we are doing $(BITS) builds (for this case) CPPFLAGS_vmlinux.lds := -Usparc -m$(BITS) -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 4df1cd0d2017..821bf4027a23 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -12,7 +12,7 @@ CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START) \ -DELF_ARCH=$(LDS_ELF_ARCH) \ -DELF_FORMAT=$(LDS_ELF_FORMAT) \ $(LDS_EXTRA) -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 84cfa179802c..9a30c9816b16 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the linux kernel. # -extra-y += vmlinux.lds +always-$(KBUILD_BUILTIN) += vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile index f28b8e3d717e..d3ef0407401f 100644 --- a/arch/xtensa/kernel/Makefile +++ b/arch/xtensa/kernel/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux/Xtensa kernel. # -extra-y := vmlinux.lds +always-$(KBUILD_BUILTIN) := vmlinux.lds obj-y := head.o align.o coprocessor.o entry.o irq.o platform.o process.o \ ptrace.o setup.o signal.o stacktrace.o syscall.o time.o traps.o \ -- cgit v1.2.3 From 4c529a4a7260776bb4abe264498857b4537aa70d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 7 Jun 2025 14:22:56 +0200 Subject: x86/smp: PM/hibernate: Split arch_resume_nosmt() Move the inner part of the arch_resume_nosmt() code into a separate function called arch_cpu_rescan_dead_smt_siblings(), so it can be used in other places where "dead" SMT siblings may need to be taken online and offline again in order to get into deep idle states. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/3361688.44csPzL39Z@rjwysocki.net [ rjw: Prevent build issues with CONFIG_SMP unset ] Signed-off-by: Rafael J. Wysocki --- arch/x86/kernel/smp.c | 24 ++++++++++++++++++++++++ arch/x86/power/hibernate.c | 19 ++++++------------- 2 files changed, 30 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18266cc3d98c..b014e6d229f9 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -299,3 +299,27 @@ struct smp_ops smp_ops = { .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); + +int arch_cpu_rescan_dead_smt_siblings(void) +{ + enum cpuhp_smt_control old = cpu_smt_control; + int ret; + + /* + * If SMT has been disabled and SMT siblings are in HLT, bring them back + * online and offline them again so that they end up in MWAIT proper. + * + * Called with hotplug enabled. + */ + if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED) + return 0; + + ret = cpuhp_smt_enable(); + if (ret) + return ret; + + ret = cpuhp_smt_disable(old); + + return ret; +} +EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings); diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index a7c23f2a58c9..a2294c1649f6 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -192,7 +192,8 @@ out: int arch_resume_nosmt(void) { - int ret = 0; + int ret; + /* * We reached this while coming out of hibernation. This means * that SMT siblings are sleeping in hlt, as mwait is not safe @@ -206,18 +207,10 @@ int arch_resume_nosmt(void) * Called with hotplug disabled. */ cpu_hotplug_enable(); - if (cpu_smt_control == CPU_SMT_DISABLED || - cpu_smt_control == CPU_SMT_FORCE_DISABLED) { - enum cpuhp_smt_control old = cpu_smt_control; - - ret = cpuhp_smt_enable(); - if (ret) - goto out; - ret = cpuhp_smt_disable(old); - if (ret) - goto out; - } -out: + + ret = arch_cpu_rescan_dead_smt_siblings(); + cpu_hotplug_disable(); + return ret; } -- cgit v1.2.3 From a18d098f2aab82abde1d59c56aef9beca01c892b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 5 Jun 2025 17:09:35 +0200 Subject: Reapply "x86/smp: Eliminate mwait_play_dead_cpuid_hint()" Revert commit 70523f335734 ("Revert "x86/smp: Eliminate mwait_play_dead_cpuid_hint()"") to reapply the changes from commit 96040f7273e2 ("x86/smp: Eliminate mwait_play_dead_cpuid_hint()") reverted by it. Previously, these changes caused idle power to rise on systems booting with "nosmt" in the kernel command line because they effectively caused "dead" SMT siblings to remain in idle state C1 after executing the HLT instruction, which prevented the processor from reaching package idle states deeper than PC2 going forward. Now, the "dead" SMT siblings are rescanned after initializing a proper cpuidle driver for the processor (either intel_idle or ACPI idle), at which point they are able to enter a sufficiently deep idle state in native_play_dead() via cpuidle, so the code changes in question can be reapplied. Signed-off-by: Rafael J. Wysocki Acked-by: Dave Hansen Tested-by: Artem Bityutskiy Link: https://patch.msgid.link/7813065.EvYhyI6sBW@rjwysocki.net --- arch/x86/kernel/smpboot.c | 54 ++++++----------------------------------------- 1 file changed, 7 insertions(+), 47 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fc78c2325fd2..58ede3fa6a75 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1244,6 +1244,10 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ void __noreturn mwait_play_dead(unsigned int eax_hint) { struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead); @@ -1289,50 +1293,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint) } } -/* - * We need to flush the caches before going to sleep, lest we have - * dirty data in our caches when we come back up. - */ -static inline void mwait_play_dead_cpuid_hint(void) -{ - unsigned int eax, ebx, ecx, edx; - unsigned int highest_cstate = 0; - unsigned int highest_subcstate = 0; - int i; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) - return; - if (!this_cpu_has(X86_FEATURE_MWAIT)) - return; - if (!this_cpu_has(X86_FEATURE_CLFLUSH)) - return; - - eax = CPUID_LEAF_MWAIT; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - - /* - * eax will be 0 if EDX enumeration is not valid. - * Initialized below to cstate, sub_cstate value when EDX is valid. - */ - if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { - eax = 0; - } else { - edx >>= MWAIT_SUBSTATE_SIZE; - for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { - if (edx & MWAIT_SUBSTATE_MASK) { - highest_cstate = i; - highest_subcstate = edx & MWAIT_SUBSTATE_MASK; - } - } - eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | - (highest_subcstate - 1); - } - - mwait_play_dead(eax); -} - /* * Kick all "offline" CPUs out of mwait on kexec(). See comment in * mwait_play_dead(). @@ -1383,9 +1343,9 @@ void native_play_dead(void) play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - mwait_play_dead_cpuid_hint(); - if (cpuidle_play_dead()) - hlt_play_dead(); + /* Below returns only on error. */ + cpuidle_play_dead(); + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 From 9cc646950eefda5605111cbc387b00b1f741c239 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:10:03 +0100 Subject: sh: Replace __ASSEMBLY__ with __ASSEMBLER__ in all headers While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's standardize on the __ASSEMBLER__ macro that is provided by the compilers now. This is a completely mechanical patch (done with a simple "sed -i" statement). Cc: Yoshinori Sato Cc: Rich Felker Cc: John Paul Adrian Glaubitz Cc: linux-sh@vger.kernel.org Signed-off-by: Thomas Huth Reviewed-by: John Paul Adrian Glaubitz Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/include/asm/cache.h | 4 ++-- arch/sh/include/asm/dwarf.h | 6 +++--- arch/sh/include/asm/fpu.h | 4 ++-- arch/sh/include/asm/ftrace.h | 8 ++++---- arch/sh/include/asm/mmu.h | 4 ++-- arch/sh/include/asm/page.h | 8 ++++---- arch/sh/include/asm/pgtable.h | 4 ++-- arch/sh/include/asm/pgtable_32.h | 8 ++++---- arch/sh/include/asm/processor.h | 4 ++-- arch/sh/include/asm/smc37c93x.h | 4 ++-- arch/sh/include/asm/suspend.h | 2 +- arch/sh/include/asm/thread_info.h | 10 +++++----- arch/sh/include/asm/tlb.h | 4 ++-- arch/sh/include/asm/types.h | 4 ++-- arch/sh/include/mach-common/mach/romimage.h | 6 +++--- arch/sh/include/mach-ecovec24/mach/romimage.h | 6 +++--- arch/sh/include/mach-kfr2r09/mach/romimage.h | 6 +++--- 17 files changed, 46 insertions(+), 46 deletions(-) (limited to 'arch') diff --git a/arch/sh/include/asm/cache.h b/arch/sh/include/asm/cache.h index b38dbc975581..e7ac9c950275 100644 --- a/arch/sh/include/asm/cache.h +++ b/arch/sh/include/asm/cache.h @@ -22,7 +22,7 @@ #define __read_mostly __section(".data..read_mostly") -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct cache_info { unsigned int ways; /* Number of cache ways */ unsigned int sets; /* Number of cache sets */ @@ -48,5 +48,5 @@ struct cache_info { unsigned long flags; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SH_CACHE_H */ diff --git a/arch/sh/include/asm/dwarf.h b/arch/sh/include/asm/dwarf.h index 571954474122..f46d18b84833 100644 --- a/arch/sh/include/asm/dwarf.h +++ b/arch/sh/include/asm/dwarf.h @@ -189,7 +189,7 @@ */ #define DWARF_ARCH_RA_REG 17 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -379,7 +379,7 @@ extern int module_dwarf_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); extern void module_dwarf_cleanup(struct module *); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #define CFI_STARTPROC .cfi_startproc #define CFI_ENDPROC .cfi_endproc @@ -402,7 +402,7 @@ extern void module_dwarf_cleanup(struct module *); #define CFI_REL_OFFSET CFI_IGNORE #define CFI_UNDEFINED CFI_IGNORE -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ static inline void dwarf_unwinder_init(void) { } diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h index 0379f4cce5ed..a086e38b70ee 100644 --- a/arch/sh/include/asm/fpu.h +++ b/arch/sh/include/asm/fpu.h @@ -2,7 +2,7 @@ #ifndef __ASM_SH_FPU_H #define __ASM_SH_FPU_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -67,6 +67,6 @@ static inline void clear_fpu(struct task_struct *tsk, struct pt_regs *regs) void float_raise(unsigned int flags); int float_rounding_mode(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SH_FPU_H */ diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index 1c10e1066390..d35781ab716e 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -7,7 +7,7 @@ #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #define FTRACE_SYSCALL_MAX NR_syscalls -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern void mcount(void); #define MCOUNT_ADDR ((unsigned long)(mcount)) @@ -35,10 +35,10 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* arch/sh/kernel/return_address.c */ extern void *return_address(unsigned int); @@ -53,6 +53,6 @@ static inline void arch_ftrace_nmi_enter(void) { } static inline void arch_ftrace_nmi_exit(void) { } #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SH_FTRACE_H */ diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h index 172e329fd92d..b9c9f91e6616 100644 --- a/arch/sh/include/asm/mmu.h +++ b/arch/sh/include/asm/mmu.h @@ -33,7 +33,7 @@ #define PMB_NO_ENTRY (-1) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include #include @@ -102,6 +102,6 @@ pmb_remap(phys_addr_t phys, unsigned long size, pgprot_t prot) return pmb_remap_caller(phys, size, prot, __builtin_return_address(0)); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __MMU_H */ diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h index 3990cbd9aa04..def4205491ec 100644 --- a/arch/sh/include/asm/page.h +++ b/arch/sh/include/asm/page.h @@ -30,7 +30,7 @@ #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT-PAGE_SHIFT) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include extern unsigned long shm_align_mask; @@ -85,7 +85,7 @@ typedef struct page *pgtable_t; #define pte_pgprot(x) __pgprot(pte_val(x) & PTE_FLAGS_MASK) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * __MEMORY_START and SIZE are the physical addresses and size of RAM. @@ -126,10 +126,10 @@ typedef struct page *pgtable_t; #define ___va(x) ((x)+PAGE_OFFSET) #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define __pa(x) ___pa((unsigned long)x) #define __va(x) (void *)___va((unsigned long)x) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #ifdef CONFIG_UNCACHED_MAPPING #if defined(CONFIG_29BIT) diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 729f5c6225fb..10fa8f2bb8d1 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -17,7 +17,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -28,7 +28,7 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Effective and physical address definitions, to aid with sign diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index f939f1215232..bb9f9a2fc85c 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -170,7 +170,7 @@ static inline unsigned long copy_ptea_attributes(unsigned long x) (PTE_MASK | _PAGE_ACCESSED | _PAGE_CACHABLE | \ _PAGE_DIRTY | _PAGE_SPECIAL) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #if defined(CONFIG_X2TLB) /* SH-X2 TLB */ #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_CACHABLE | \ @@ -287,9 +287,9 @@ static inline unsigned long copy_ptea_attributes(unsigned long x) __pgprot(0) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Certain architectures need to do special things when PTEs @@ -486,5 +486,5 @@ static inline int pte_swp_exclusive(pte_t pte) PTE_BIT_FUNC(low, swp_mkexclusive, |= _PAGE_SWP_EXCLUSIVE); PTE_BIT_FUNC(low, swp_clear_exclusive, &= ~_PAGE_SWP_EXCLUSIVE); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SH_PGTABLE_32_H */ diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h index 73fba7c922f9..2a0b5713ab80 100644 --- a/arch/sh/include/asm/processor.h +++ b/arch/sh/include/asm/processor.h @@ -5,7 +5,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * CPU type and hardware bug flags. Kept separately for each CPU. * @@ -168,7 +168,7 @@ extern unsigned int instruction_size(unsigned int insn); void select_idle_routine(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #include diff --git a/arch/sh/include/asm/smc37c93x.h b/arch/sh/include/asm/smc37c93x.h index 891f2f8f2fd0..caf4cd8dd241 100644 --- a/arch/sh/include/asm/smc37c93x.h +++ b/arch/sh/include/asm/smc37c93x.h @@ -67,7 +67,7 @@ #define UART_DLL 0x0 /* Divisor Latch (LS) */ #define UART_DLM 0x2 /* Divisor Latch (MS) */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef struct uart_reg { volatile __u16 rbr; volatile __u16 ier; @@ -78,7 +78,7 @@ typedef struct uart_reg { volatile __u16 msr; volatile __u16 scr; } uart_reg; -#endif /* ! __ASSEMBLY__ */ +#endif /* ! __ASSEMBLER__ */ /* Alias for Write Only Register */ diff --git a/arch/sh/include/asm/suspend.h b/arch/sh/include/asm/suspend.h index 47db17520261..0f991babc559 100644 --- a/arch/sh/include/asm/suspend.h +++ b/arch/sh/include/asm/suspend.h @@ -2,7 +2,7 @@ #ifndef _ASM_SH_SUSPEND_H #define _ASM_SH_SUSPEND_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 9f19a682d315..471db5173036 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -21,7 +21,7 @@ #define FAULT_CODE_PROT (1 << 3) /* protection fault */ #define FAULT_CODE_USER (1 << 4) /* user-mode access */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include struct thread_info { @@ -49,7 +49,7 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -86,7 +86,7 @@ static inline struct thread_info *current_thread_info(void) extern void init_thread_xstate(void); -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ /* * Thread information flags @@ -144,7 +144,7 @@ extern void init_thread_xstate(void); */ #define TS_USEDFPU 0x0002 /* FPU used by this task this quantum */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define TI_FLAG_FAULT_CODE_SHIFT 24 @@ -164,5 +164,5 @@ static inline unsigned int get_thread_fault_code(void) return ti->flags >> TI_FLAG_FAULT_CODE_SHIFT; } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_SH_THREAD_INFO_H */ diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index ddf324bfb9a0..39df40d0ebc2 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -2,7 +2,7 @@ #ifndef __ASM_SH_TLB_H #define __ASM_SH_TLB_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -29,5 +29,5 @@ asmlinkage int handle_tlbmiss(struct pt_regs *regs, unsigned long error_code, unsigned long address); #endif /* CONFIG_MMU */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SH_TLB_H */ diff --git a/arch/sh/include/asm/types.h b/arch/sh/include/asm/types.h index 9b3fc923ee28..fec3e89df0b1 100644 --- a/arch/sh/include/asm/types.h +++ b/arch/sh/include/asm/types.h @@ -7,10 +7,10 @@ /* * These aren't exported outside the kernel to avoid name space clashes */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef u16 insn_size_t; typedef u32 reg_size_t; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* __ASM_SH_TYPES_H */ diff --git a/arch/sh/include/mach-common/mach/romimage.h b/arch/sh/include/mach-common/mach/romimage.h index 1915714263aa..22fb47ec9b15 100644 --- a/arch/sh/include/mach-common/mach/romimage.h +++ b/arch/sh/include/mach-common/mach/romimage.h @@ -1,12 +1,12 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* do nothing here by default */ -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ static inline void mmcif_update_progress(int nr) { } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/sh/include/mach-ecovec24/mach/romimage.h b/arch/sh/include/mach-ecovec24/mach/romimage.h index 2da6ff326cbd..f93d494736c3 100644 --- a/arch/sh/include/mach-ecovec24/mach/romimage.h +++ b/arch/sh/include/mach-ecovec24/mach/romimage.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* EcoVec board specific boot code: * converts the "partner-jet-script.txt" script into assembly @@ -22,7 +22,7 @@ 1 : .long 0xa8000000 2 : -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ /* Ecovec board specific information: * @@ -45,4 +45,4 @@ static inline void mmcif_update_progress(int nr) __raw_writeb(1 << (nr - 1), PGDR); } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ diff --git a/arch/sh/include/mach-kfr2r09/mach/romimage.h b/arch/sh/include/mach-kfr2r09/mach/romimage.h index 209275872ff0..f68bb480d378 100644 --- a/arch/sh/include/mach-kfr2r09/mach/romimage.h +++ b/arch/sh/include/mach-kfr2r09/mach/romimage.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* kfr2r09 board specific boot code: * converts the "partner-jet-script.txt" script into assembly @@ -22,10 +22,10 @@ 1: .long 0xa8000000 2: -#else /* __ASSEMBLY__ */ +#else /* __ASSEMBLER__ */ static inline void mmcif_update_progress(int nr) { } -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ -- cgit v1.2.3 From ab0a168fcd984c2fc93eb0fb2367881c6aa62eb3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 2 May 2025 13:13:36 +0200 Subject: sh: ecovec24: Make SPI mode explicit Commit cf9e4784f3bde3e4 ("spi: sh-msiof: Add slave mode support") added a new mode member to the sh_msiof_spi_info structure, but did not update any board files. Hence all users in board files rely on the default being host mode. Make this unambiguous by configuring host mode explicitly. Signed-off-by: Geert Uytterhoeven Reviewed-by: John Paul Adrian Glaubitz Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/boards/mach-ecovec24/setup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 6f13557eecd6..a641e26f8fdf 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -825,6 +825,7 @@ static struct spi_board_info spi_bus[] = { /* MSIOF0 */ static struct sh_msiof_spi_info msiof0_data = { .num_chipselect = 1, + .mode = MSIOF_SPI_HOST, }; static struct resource msiof0_resources[] = { -- cgit v1.2.3 From 8a3682601ddaa4ef0c400f627a7f4b9388bbccef Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sat, 17 May 2025 12:30:48 +0300 Subject: sh: kprobes: Remove unused variables in kprobe_exceptions_notify() kbuild reports the following warning: arch/sh/kernel/kprobes.c: In function 'kprobe_exceptions_notify': >> arch/sh/kernel/kprobes.c:412:24: warning: variable 'p' set but not used [-Wunused-but-set-variable] 412 | struct kprobe *p = NULL; | ^ The variable 'p' is indeed unused since the commit fa5a24b16f94 ("sh/kprobes: Don't call the ->break_handler() in SH kprobes code") Remove that variable along with 'kprobe_opcode_t *addr' which also becomes unused after 'p' is removed. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505151341.EuRFR22l-lkp@intel.com/ Fixes: fa5a24b16f94 ("sh/kprobes: Don't call the ->break_handler() in SH kprobes code") Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: John Paul Adrian Glaubitz Reviewed-by: Geert Uytterhoeven Signed-off-by: John Paul Adrian Glaubitz --- arch/sh/kernel/kprobes.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 49c4ffd782d6..a250fb1b9420 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -404,13 +404,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) int __kprobes kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { - struct kprobe *p = NULL; struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; - kprobe_opcode_t *addr = NULL; struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - addr = (kprobe_opcode_t *) (args->regs->pc); if (val == DIE_TRAP && args->trapnr == (BREAKPOINT_INSTRUCTION & 0xff)) { if (!kprobe_running()) { @@ -421,7 +418,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, ret = NOTIFY_DONE; } } else { - p = get_kprobe(addr); if ((kcb->kprobe_status == KPROBE_HIT_SS) || (kcb->kprobe_status == KPROBE_REENTER)) { if (post_kprobe_handler(args->regs)) -- cgit v1.2.3 From 41cb08555c4164996d67c78b3bf1c658075b75f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 May 2025 07:51:14 +0200 Subject: treewide, timers: Rename from_timer() to timer_container_of() Move this API to the canonical timer_*() namespace. [ tglx: Redone against pre rc1 ] Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/aB2X0jCKQO56WdMt@gmail.com --- arch/alpha/kernel/srmcons.c | 3 ++- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/platforms/powermac/low_i2c.c | 3 ++- arch/sh/drivers/heartbeat.c | 2 +- arch/sh/drivers/pci/common.c | 4 ++-- arch/sh/drivers/push-switch.c | 2 +- arch/sparc/kernel/viohs.c | 2 +- arch/um/drivers/vector_kern.c | 2 +- arch/x86/kvm/xen.c | 3 ++- arch/xtensa/platforms/iss/network.c | 2 +- 10 files changed, 14 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c index a89ce84371f9..d19e51ec711d 100644 --- a/arch/alpha/kernel/srmcons.c +++ b/arch/alpha/kernel/srmcons.c @@ -69,7 +69,8 @@ srmcons_do_receive_chars(struct tty_port *port) static void srmcons_receive_chars(struct timer_list *t) { - struct srmcons_private *srmconsp = from_timer(srmconsp, t, timer); + struct srmcons_private *srmconsp = timer_container_of(srmconsp, t, + timer); struct tty_port *port = &srmconsp->port; unsigned long flags; int incr = 10; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 791d1942a058..3401b96be475 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -628,7 +628,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) static void kvmppc_watchdog_func(struct timer_list *t) { - struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.wdt_timer); + struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, arch.wdt_timer); u32 tsr, new_tsr; int final; diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c index a0ae58636e10..02474e27df9b 100644 --- a/arch/powerpc/platforms/powermac/low_i2c.c +++ b/arch/powerpc/platforms/powermac/low_i2c.c @@ -359,7 +359,8 @@ static irqreturn_t kw_i2c_irq(int irq, void *dev_id) static void kw_i2c_timeout(struct timer_list *t) { - struct pmac_i2c_host_kw *host = from_timer(host, t, timeout_timer); + struct pmac_i2c_host_kw *host = timer_container_of(host, t, + timeout_timer); unsigned long flags; spin_lock_irqsave(&host->lock, flags); diff --git a/arch/sh/drivers/heartbeat.c b/arch/sh/drivers/heartbeat.c index 24391b444b28..42103038a7d0 100644 --- a/arch/sh/drivers/heartbeat.c +++ b/arch/sh/drivers/heartbeat.c @@ -58,7 +58,7 @@ static inline void heartbeat_toggle_bit(struct heartbeat_data *hd, static void heartbeat_timer(struct timer_list *t) { - struct heartbeat_data *hd = from_timer(hd, t, timer); + struct heartbeat_data *hd = timer_container_of(hd, t, timer); static unsigned bit = 0, up = 1; heartbeat_toggle_bit(hd, bit, hd->flags & HEARTBEAT_INVERTED); diff --git a/arch/sh/drivers/pci/common.c b/arch/sh/drivers/pci/common.c index 5442475d132e..9633b6147a05 100644 --- a/arch/sh/drivers/pci/common.c +++ b/arch/sh/drivers/pci/common.c @@ -88,7 +88,7 @@ int __init pci_is_66mhz_capable(struct pci_channel *hose, static void pcibios_enable_err(struct timer_list *t) { - struct pci_channel *hose = from_timer(hose, t, err_timer); + struct pci_channel *hose = timer_container_of(hose, t, err_timer); timer_delete(&hose->err_timer); printk(KERN_DEBUG "PCI: re-enabling error IRQ.\n"); @@ -97,7 +97,7 @@ static void pcibios_enable_err(struct timer_list *t) static void pcibios_enable_serr(struct timer_list *t) { - struct pci_channel *hose = from_timer(hose, t, serr_timer); + struct pci_channel *hose = timer_container_of(hose, t, serr_timer); timer_delete(&hose->serr_timer); printk(KERN_DEBUG "PCI: re-enabling system error IRQ.\n"); diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c index 2b51ad9d5586..443cc6fd26a8 100644 --- a/arch/sh/drivers/push-switch.c +++ b/arch/sh/drivers/push-switch.c @@ -25,7 +25,7 @@ static DEVICE_ATTR_RO(switch); static void switch_timer(struct timer_list *t) { - struct push_switch *psw = from_timer(psw, t, debounce); + struct push_switch *psw = timer_container_of(psw, t, debounce); schedule_work(&psw->work); } diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c index e27afd233bf5..8fb2e7ca5015 100644 --- a/arch/sparc/kernel/viohs.c +++ b/arch/sparc/kernel/viohs.c @@ -804,7 +804,7 @@ EXPORT_SYMBOL(vio_port_up); static void vio_port_timer(struct timer_list *t) { - struct vio_driver_state *vio = from_timer(vio, t, timer); + struct vio_driver_state *vio = timer_container_of(vio, t, timer); vio_port_up(vio); } diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 5226d2c52e6a..f292e0b4ff8b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1534,7 +1534,7 @@ static const struct net_device_ops vector_netdev_ops = { static void vector_timer_expire(struct timer_list *t) { - struct vector_private *vp = from_timer(vp, t, tl); + struct vector_private *vp = timer_container_of(vp, t, tl); vp->estats.tx_kicks++; napi_schedule(&vp->napi); diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 38b33cdd4232..9b029bb29a16 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -1571,7 +1571,8 @@ out: static void cancel_evtchn_poll(struct timer_list *t) { - struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer); + struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, + arch.xen.poll_timer); kvm_make_request(KVM_REQ_UNBLOCK, vcpu); kvm_vcpu_kick(vcpu); diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index c6d8c62695e1..f0a63b2f85cc 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -338,7 +338,7 @@ static int iss_net_poll(struct iss_net_private *lp) static void iss_net_timer(struct timer_list *t) { - struct iss_net_private *lp = from_timer(lp, t, timer); + struct iss_net_private *lp = timer_container_of(lp, t, timer); iss_net_poll(lp); mod_timer(&lp->timer, jiffies + lp->timer_val); -- cgit v1.2.3 From e34dbbc85d64af59176fe59fad7b4122f4330fe2 Mon Sep 17 00:00:00 2001 From: "Xin Li (Intel)" Date: Mon, 9 Jun 2025 01:40:53 -0700 Subject: x86/fred/signal: Prevent immediate repeat of single step trap on return from SIGTRAP handler Clear the software event flag in the augmented SS to prevent immediate repeat of single step trap on return from SIGTRAP handler if the trap flag (TF) is set without an external debugger attached. Following is a typical single-stepping flow for a user process: 1) The user process is prepared for single-stepping by setting RFLAGS.TF = 1. 2) When any instruction in user space completes, a #DB is triggered. 3) The kernel handles the #DB and returns to user space, invoking the SIGTRAP handler with RFLAGS.TF = 0. 4) After the SIGTRAP handler finishes, the user process performs a sigreturn syscall, restoring the original state, including RFLAGS.TF = 1. 5) Goto step 2. According to the FRED specification: A) Bit 17 in the augmented SS is designated as the software event flag, which is set to 1 for FRED event delivery of SYSCALL, SYSENTER, or INT n. B) If bit 17 of the augmented SS is 1 and ERETU would result in RFLAGS.TF = 1, a single-step trap will be pending upon completion of ERETU. In step 4) above, the software event flag is set upon the sigreturn syscall, and its corresponding ERETU would restore RFLAGS.TF = 1. This combination causes a pending single-step trap upon completion of ERETU. Therefore, another #DB is triggered before any user space instruction is executed, which leads to an infinite loop in which the SIGTRAP handler keeps being invoked on the same user space IP. Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code") Suggested-by: H. Peter Anvin (Intel) Signed-off-by: Xin Li (Intel) Signed-off-by: Dave Hansen Tested-by: Sohil Mehta Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250609084054.2083189-2-xin%40zytor.com --- arch/x86/include/asm/sighandling.h | 22 ++++++++++++++++++++++ arch/x86/kernel/signal_32.c | 4 ++++ arch/x86/kernel/signal_64.c | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index e770c4fc47f4..8727c7e21dd1 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs); +/* + * To prevent immediate repeat of single step trap on return from SIGTRAP + * handler if the trap flag (TF) is set without an external debugger attached, + * clear the software event flag in the augmented SS, ensuring no single-step + * trap is pending upon ERETU completion. + * + * Note, this function should be called in sigreturn() before the original + * state is restored to make sure the TF is read from the entry frame. + */ +static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs) +{ + /* + * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction + * is being single-stepped, do not clear the software event flag in the + * augmented SS, thus a debugger won't skip over the following instruction. + */ +#ifdef CONFIG_X86_FRED + if (!(regs->flags & X86_EFLAGS_TF)) + regs->fred_ss.swevent = 0; +#endif +} + #endif /* _ASM_X86_SIGHANDLING_H */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 98123ff10506..42bbc42bd350 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn) struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; + prevent_single_step_upon_eretu(regs); + if (!access_ok(frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) @@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn) struct rt_sigframe_ia32 __user *frame; sigset_t set; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); if (!access_ok(frame, sizeof(*frame))) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ee9453891901..d483b585c6c6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(frame, sizeof(*frame))) goto badframe; @@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn) sigset_t set; unsigned long uc_flags; + prevent_single_step_upon_eretu(regs); + frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); if (!access_ok(frame, sizeof(*frame))) -- cgit v1.2.3 From ea7caffedd011f7d40abe93a884ffbe46f122535 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 8 Apr 2025 14:22:56 -0300 Subject: ARC: atomics: Implement arch_atomic64_cmpxchg using _relaxed The core atomic code has a number of macros where it elaborates architecture primitives into more functions. ARC uses arch_atomic64_cmpxchg() as it's architecture primitive which disable alot of the additional functions. Instead provide arch_cmpxchg64_relaxed() as the primitive and rely on the core macros to create arch_cmpxchg64(). The macros will also provide other functions, for instance, try_cmpxchg64_release(), giving a more complete implementation. Suggested-by: Mark Rutland Link: https://lore.kernel.org/r/Z0747n5bSep4_1VX@J2N7QTR9R3 Signed-off-by: Jason Gunthorpe Signed-off-by: Vineet Gupta --- arch/arc/include/asm/atomic64-arcv2.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/atomic64-arcv2.h b/arch/arc/include/asm/atomic64-arcv2.h index 9b5791b85471..73080a664369 100644 --- a/arch/arc/include/asm/atomic64-arcv2.h +++ b/arch/arc/include/asm/atomic64-arcv2.h @@ -137,12 +137,9 @@ ATOMIC64_OPS(xor, xor, xor) #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP -static inline s64 -arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new) +static inline u64 __arch_cmpxchg64_relaxed(volatile void *ptr, u64 old, u64 new) { - s64 prev; - - smp_mb(); + u64 prev; __asm__ __volatile__( "1: llockd %0, [%1] \n" @@ -152,14 +149,12 @@ arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new) " bnz 1b \n" "2: \n" : "=&r"(prev) - : "r"(ptr), "ir"(expected), "r"(new) - : "cc"); /* memory clobber comes from smp_mb() */ - - smp_mb(); + : "r"(ptr), "ir"(old), "r"(new) + : "memory", "cc"); return prev; } -#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg +#define arch_cmpxchg64_relaxed __arch_cmpxchg64_relaxed static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new) { -- cgit v1.2.3 From 857f4517965b282234e12f6bca0c21ef10eec09b Mon Sep 17 00:00:00 2001 From: Yu-Chun Lin Date: Thu, 10 Apr 2025 01:11:16 +0800 Subject: ARC: unwind: Use built-in sort swap to reduce code size and improve performance The custom swap function used in sort() was identical to the default built-in sort swap. Remove the custom swap function and passes NULL to sort(), allowing it to use the default swap function. This change reduces code size and improves performance, particularly when CONFIG_MITIGATION_RETPOLINE is enabled. With RETPOLINE mitigation, indirect function calls incur significant overhead, and using the default swap function avoids this cost. $ ./scripts/bloat-o-meter ./unwind.o.old ./unwind.o.new add/remove: 0/1 grow/shrink: 0/1 up/down: 0/-22 (-22) Function old new delta init_unwind_hdr.constprop 544 540 -4 swap_eh_frame_hdr_table_entries 18 - -18 Total: Before=4410, After=4388, chg -0.50% Signed-off-by: Yu-Chun Lin Signed-off-by: Vineet Gupta --- arch/arc/kernel/unwind.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c index d8969dab12d4..789cfb9ea14e 100644 --- a/arch/arc/kernel/unwind.c +++ b/arch/arc/kernel/unwind.c @@ -241,15 +241,6 @@ static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) return (e1->start > e2->start) - (e1->start < e2->start); } -static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) -{ - struct eh_frame_hdr_table_entry *e1 = p1; - struct eh_frame_hdr_table_entry *e2 = p2; - - swap(e1->start, e2->start); - swap(e1->fde, e2->fde); -} - static void init_unwind_hdr(struct unwind_table *table, void *(*alloc) (unsigned long)) { @@ -345,7 +336,7 @@ static void init_unwind_hdr(struct unwind_table *table, sort(header->table, n, sizeof(*header->table), - cmp_eh_frame_hdr_table_entries, swap_eh_frame_hdr_table_entries); + cmp_eh_frame_hdr_table_entries, NULL); table->hdrsz = hdrSize; smp_wmb(); -- cgit v1.2.3 From 2cb74be378675c860af0fcaf1ec2801beebdf028 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:09:35 +0100 Subject: ARC: Replace __ASSEMBLY__ with __ASSEMBLER__ in uapi headers __ASSEMBLY__ is only defined by the Makefile of the kernel, so this is not really useful for uapi headers (unless the userspace Makefile defines it, too). Let's switch to __ASSEMBLER__ which gets set automatically by the compiler when compiling assembly code. Cc: linux-snps-arc@lists.infradead.org Signed-off-by: Thomas Huth Signed-off-by: Vineet Gupta --- arch/arc/include/uapi/asm/ptrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/uapi/asm/ptrace.h b/arch/arc/include/uapi/asm/ptrace.h index 2a6eff57f6dd..3ae832db278c 100644 --- a/arch/arc/include/uapi/asm/ptrace.h +++ b/arch/arc/include/uapi/asm/ptrace.h @@ -14,7 +14,7 @@ #define PTRACE_GET_THREAD_AREA 25 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Userspace ABI: Register state needed by * -ptrace (gdbserver) @@ -53,6 +53,6 @@ struct user_regs_arcv2 { unsigned long r30, r58, r59; }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* _UAPI__ASM_ARC_PTRACE_H */ -- cgit v1.2.3 From 179e949719fe81219a3e23f1e716ac2d02eea845 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Fri, 14 Mar 2025 08:09:36 +0100 Subject: ARC: Replace __ASSEMBLY__ with __ASSEMBLER__ in the non-uapi headers While the GCC and Clang compilers already define __ASSEMBLER__ automatically when compiling assembly code, __ASSEMBLY__ is a macro that only gets defined by the Makefiles in the kernel. This can be very confusing when switching between userspace and kernelspace coding, or when dealing with uapi headers that rather should use __ASSEMBLER__ instead. So let's standardize on the __ASSEMBLER__ macro that is provided by the compilers now. This is a completely mechanical patch (done with a simple "sed -i" statement). Cc: linux-snps-arc@lists.infradead.org Signed-off-by: Thomas Huth Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 2 +- arch/arc/include/asm/atomic.h | 4 ++-- arch/arc/include/asm/bitops.h | 4 ++-- arch/arc/include/asm/bug.h | 4 ++-- arch/arc/include/asm/cache.h | 4 ++-- arch/arc/include/asm/current.h | 4 ++-- arch/arc/include/asm/dsp-impl.h | 2 +- arch/arc/include/asm/dsp.h | 4 ++-- arch/arc/include/asm/dwarf.h | 4 ++-- arch/arc/include/asm/entry.h | 4 ++-- arch/arc/include/asm/irqflags-arcv2.h | 4 ++-- arch/arc/include/asm/irqflags-compact.h | 4 ++-- arch/arc/include/asm/jump_label.h | 4 ++-- arch/arc/include/asm/linkage.h | 6 +++--- arch/arc/include/asm/mmu-arcv2.h | 4 ++-- arch/arc/include/asm/mmu.h | 2 +- arch/arc/include/asm/page.h | 4 ++-- arch/arc/include/asm/pgtable-bits-arcv2.h | 4 ++-- arch/arc/include/asm/pgtable-levels.h | 4 ++-- arch/arc/include/asm/pgtable.h | 4 ++-- arch/arc/include/asm/processor.h | 4 ++-- arch/arc/include/asm/ptrace.h | 4 ++-- arch/arc/include/asm/switch_to.h | 2 +- arch/arc/include/asm/thread_info.h | 4 ++-- 24 files changed, 45 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 005d9e4d187a..a31bbf5c8bbc 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -144,7 +144,7 @@ #define ARC_AUX_AGU_MOD2 0x5E2 #define ARC_AUX_AGU_MOD3 0x5E3 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 592d7fffc223..e615c42b93ba 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -6,7 +6,7 @@ #ifndef _ASM_ARC_ATOMIC_H #define _ASM_ARC_ATOMIC_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -31,6 +31,6 @@ #include #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index f5a936496f06..5340c2871392 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -10,7 +10,7 @@ #error only can be included directly #endif -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -192,6 +192,6 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x) #include #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 4c453ba96c51..171c16021f70 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -6,7 +6,7 @@ #ifndef _ASM_ARC_BUG_H #define _ASM_ARC_BUG_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -29,6 +29,6 @@ void die(const char *str, struct pt_regs *regs, unsigned long address); #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index f0f1fc5d62b6..040a97f4dd82 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -23,7 +23,7 @@ */ #define ARC_UNCACHED_ADDR_SPACE 0xc0000000 -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -65,7 +65,7 @@ extern int ioc_enable; extern unsigned long perip_base, perip_end; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* Instruction cache related Auxiliary registers */ #define ARC_REG_IC_BCR 0x77 /* Build Config reg */ diff --git a/arch/arc/include/asm/current.h b/arch/arc/include/asm/current.h index 06be89f6f2f0..03ffd005f3fa 100644 --- a/arch/arc/include/asm/current.h +++ b/arch/arc/include/asm/current.h @@ -9,7 +9,7 @@ #ifndef _ASM_ARC_CURRENT_H #define _ASM_ARC_CURRENT_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #ifdef CONFIG_ARC_CURR_IN_REG @@ -20,6 +20,6 @@ register struct task_struct *curr_arc asm("gp"); #include #endif /* ! CONFIG_ARC_CURR_IN_REG */ -#endif /* ! __ASSEMBLY__ */ +#endif /* ! __ASSEMBLER__ */ #endif /* _ASM_ARC_CURRENT_H */ diff --git a/arch/arc/include/asm/dsp-impl.h b/arch/arc/include/asm/dsp-impl.h index cd5636dfeb6f..fd5fdaad90c1 100644 --- a/arch/arc/include/asm/dsp-impl.h +++ b/arch/arc/include/asm/dsp-impl.h @@ -11,7 +11,7 @@ #define DSP_CTRL_DISABLED_ALL 0 -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ /* clobbers r5 register */ .macro DSP_EARLY_INIT diff --git a/arch/arc/include/asm/dsp.h b/arch/arc/include/asm/dsp.h index f496dbc4640b..eeaaf4e4eabd 100644 --- a/arch/arc/include/asm/dsp.h +++ b/arch/arc/include/asm/dsp.h @@ -7,7 +7,7 @@ #ifndef __ASM_ARC_DSP_H #define __ASM_ARC_DSP_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * DSP-related saved registers - need to be saved only when you are @@ -24,6 +24,6 @@ struct dsp_callee_regs { #endif }; -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_ARC_DSP_H */ diff --git a/arch/arc/include/asm/dwarf.h b/arch/arc/include/asm/dwarf.h index a0d5ebe1bc3f..1524c5cf8b59 100644 --- a/arch/arc/include/asm/dwarf.h +++ b/arch/arc/include/asm/dwarf.h @@ -6,7 +6,7 @@ #ifndef _ASM_ARC_DWARF_H #define _ASM_ARC_DWARF_H -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef ARC_DW2_UNWIND_AS_CFI @@ -38,6 +38,6 @@ #endif /* !ARC_DW2_UNWIND_AS_CFI */ -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif /* _ASM_ARC_DWARF_H */ diff --git a/arch/arc/include/asm/entry.h b/arch/arc/include/asm/entry.h index 38c35722cebf..f453af251a1a 100644 --- a/arch/arc/include/asm/entry.h +++ b/arch/arc/include/asm/entry.h @@ -13,7 +13,7 @@ #include /* For VMALLOC_START */ #include -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ #ifdef CONFIG_ISA_ARCOMPACT #include /* ISA specific bits */ @@ -146,7 +146,7 @@ #endif /* CONFIG_ARC_CURR_IN_REG */ -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ extern void do_signal(struct pt_regs *); extern void do_notify_resume(struct pt_regs *); diff --git a/arch/arc/include/asm/irqflags-arcv2.h b/arch/arc/include/asm/irqflags-arcv2.h index fb3c21f1a238..30aea562f8aa 100644 --- a/arch/arc/include/asm/irqflags-arcv2.h +++ b/arch/arc/include/asm/irqflags-arcv2.h @@ -50,7 +50,7 @@ #define ISA_INIT_STATUS_BITS (STATUS_IE_MASK | __AD_ENB | \ (ARCV2_IRQ_DEF_PRIO << 1)) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /* * Save IRQ state and disable IRQs @@ -170,6 +170,6 @@ static inline void arc_softirq_clear(int irq) seti .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/irqflags-compact.h b/arch/arc/include/asm/irqflags-compact.h index 936a2f21f315..85c2f6bcde0c 100644 --- a/arch/arc/include/asm/irqflags-compact.h +++ b/arch/arc/include/asm/irqflags-compact.h @@ -40,7 +40,7 @@ #define ISA_INIT_STATUS_BITS STATUS_IE_MASK -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ /****************************************************************** * IRQ Control Macros @@ -196,6 +196,6 @@ static inline int arch_irqs_disabled(void) flag \scratch .endm -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h index a339223d9e05..66ead75784d9 100644 --- a/arch/arc/include/asm/jump_label.h +++ b/arch/arc/include/asm/jump_label.h @@ -2,7 +2,7 @@ #ifndef _ASM_ARC_JUMP_LABEL_H #define _ASM_ARC_JUMP_LABEL_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -68,5 +68,5 @@ struct jump_entry { jump_label_t key; }; -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/linkage.h b/arch/arc/include/asm/linkage.h index 8a3fb71e9cfa..ba3cb65b5eaa 100644 --- a/arch/arc/include/asm/linkage.h +++ b/arch/arc/include/asm/linkage.h @@ -12,7 +12,7 @@ #define __ALIGN .align 4 #define __ALIGN_STR __stringify(__ALIGN) -#ifdef __ASSEMBLY__ +#ifdef __ASSEMBLER__ .macro ST2 e, o, off #ifdef CONFIG_ARC_HAS_LL64 @@ -61,7 +61,7 @@ CFI_ENDPROC ASM_NL \ .size name, .-name -#else /* !__ASSEMBLY__ */ +#else /* !__ASSEMBLER__ */ #ifdef CONFIG_ARC_HAS_ICCM #define __arcfp_code __section(".text.arcfp") @@ -75,6 +75,6 @@ #define __arcfp_data __section(".data") #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h index 41412642f279..5e5482026ac9 100644 --- a/arch/arc/include/asm/mmu-arcv2.h +++ b/arch/arc/include/asm/mmu-arcv2.h @@ -69,7 +69,7 @@ #define PTE_BITS_NON_RWX_IN_PD1 (PAGE_MASK_PHYS | _PAGE_CACHEABLE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ struct mm_struct; extern int pae40_exist_but_not_enab(void); @@ -100,6 +100,6 @@ static inline void mmu_setup_pgd(struct mm_struct *mm, void *pgd) sr \reg, [ARC_REG_PID] .endm -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h index 4ae2db59d494..e3b35ceab582 100644 --- a/arch/arc/include/asm/mmu.h +++ b/arch/arc/include/asm/mmu.h @@ -6,7 +6,7 @@ #ifndef _ASM_ARC_MMU_H #define _ASM_ARC_MMU_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include /* NR_CPUS */ diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h index def0dfb95b43..9720fe6b2c24 100644 --- a/arch/arc/include/asm/page.h +++ b/arch/arc/include/asm/page.h @@ -19,7 +19,7 @@ #endif /* CONFIG_ARC_HAS_PAE40 */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define clear_page(paddr) memset((paddr), 0, PAGE_SIZE) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) @@ -136,6 +136,6 @@ static inline unsigned long virt_to_pfn(const void *kaddr) #include /* page_to_pfn, pfn_to_page */ #include -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 8ebec1b21d24..a8fdb4fe1fbd 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -75,7 +75,7 @@ * This is to enable COW mechanism */ /* xwr */ -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #define pte_write(pte) (pte_val(pte) & _PAGE_WRITE) #define pte_dirty(pte) (pte_val(pte) & _PAGE_DIRTY) @@ -142,6 +142,6 @@ PTE_BIT_FUNC(swp_clear_exclusive, &= ~(_PAGE_SWP_EXCLUSIVE)); #include #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h index d1ce4b0f1071..c8f9273372c0 100644 --- a/arch/arc/include/asm/pgtable-levels.h +++ b/arch/arc/include/asm/pgtable-levels.h @@ -85,7 +85,7 @@ #define PTRS_PER_PTE BIT(PMD_SHIFT - PAGE_SHIFT) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #if CONFIG_PGTABLE_LEVELS > 3 #include @@ -181,6 +181,6 @@ #define pmd_leaf(x) (pmd_val(x) & _PAGE_HW_SZ) #endif -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 4cf45a99fd79..bd580e2b62d7 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -19,7 +19,7 @@ */ #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ extern char empty_zero_page[PAGE_SIZE]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) @@ -29,6 +29,6 @@ extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE); /* to cope with aliasing VIPT cache */ #define HAVE_ARCH_UNMAPPED_AREA -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLER__ */ #endif diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index d606658e2fe7..7f7901ac6643 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -11,7 +11,7 @@ #ifndef __ASM_ARC_PROCESSOR_H #define __ASM_ARC_PROCESSOR_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include @@ -66,7 +66,7 @@ extern void start_thread(struct pt_regs * regs, unsigned long pc, extern unsigned int __get_wchan(struct task_struct *p); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * Default System Memory Map on ARC diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h index cf79df0b2570..f6c052af8f4d 100644 --- a/arch/arc/include/asm/ptrace.h +++ b/arch/arc/include/asm/ptrace.h @@ -10,7 +10,7 @@ #include #include -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ typedef union { struct { @@ -172,6 +172,6 @@ static inline unsigned long regs_get_register(struct pt_regs *regs, extern int syscall_trace_enter(struct pt_regs *); extern void syscall_trace_exit(struct pt_regs *); -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ #endif /* __ASM_PTRACE_H */ diff --git a/arch/arc/include/asm/switch_to.h b/arch/arc/include/asm/switch_to.h index 1f85de8288b1..5806106a65f9 100644 --- a/arch/arc/include/asm/switch_to.h +++ b/arch/arc/include/asm/switch_to.h @@ -6,7 +6,7 @@ #ifndef _ASM_ARC_SWITCH_TO_H #define _ASM_ARC_SWITCH_TO_H -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include #include diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 12daaf3a61ea..255d2c774219 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -24,7 +24,7 @@ #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define THREAD_SHIFT (PAGE_SHIFT << THREAD_SIZE_ORDER) -#ifndef __ASSEMBLY__ +#ifndef __ASSEMBLER__ #include @@ -62,7 +62,7 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) return (struct thread_info *)(sp & ~(THREAD_SIZE - 1)); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLER__ */ /* * thread information flags -- cgit v1.2.3 From cd097df4596f3a1e9d75eb8520162de1eb8485b2 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Tue, 10 Jun 2025 07:42:26 +0530 Subject: powerpc/powernv/memtrace: Fix out of bounds issue in memtrace mmap memtrace mmap issue has an out of bounds issue. This patch fixes the by checking that the requested mapping region size should stay within the allocated region size. Reported-by: Jonathan Greental Fixes: 08a022ad3dfa ("powerpc/powernv/memtrace: Allow mmaping trace buffers") Signed-off-by: Ritesh Harjani (IBM) Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250610021227.361980-1-maddy@linux.ibm.com --- arch/powerpc/platforms/powernv/memtrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 4ac9808e55a4..2ea30b343354 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -48,11 +48,15 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf, static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) { struct memtrace_entry *ent = filp->private_data; + unsigned long ent_nrpages = ent->size >> PAGE_SHIFT; + unsigned long vma_nrpages = vma_pages(vma); - if (ent->size < vma->vm_end - vma->vm_start) + /* The requested page offset should be within object's page count */ + if (vma->vm_pgoff >= ent_nrpages) return -EINVAL; - if (vma->vm_pgoff << PAGE_SHIFT >= ent->size) + /* The requested mapping range should remain within the bounds */ + if (vma_nrpages > ent_nrpages - vma->vm_pgoff) return -EINVAL; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); -- cgit v1.2.3 From 0d67f0dee6c9176bc09a5482dd7346e3a0f14d0b Mon Sep 17 00:00:00 2001 From: Haren Myneni Date: Tue, 10 Jun 2025 07:42:27 +0530 Subject: powerpc/vas: Return -EINVAL if the offset is non-zero in mmap() The user space calls mmap() to map VAS window paste address and the kernel returns the complete mapped page for each window. So return -EINVAL if non-zero is passed for offset parameter to mmap(). See Documentation/arch/powerpc/vas-api.rst for mmap() restrictions. Co-developed-by: Jonathan Greental Signed-off-by: Jonathan Greental Reported-by: Jonathan Greental Fixes: dda44eb29c23 ("powerpc/vas: Add VAS user space API") Signed-off-by: Haren Myneni Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250610021227.361980-2-maddy@linux.ibm.com --- arch/powerpc/platforms/book3s/vas-api.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 0b6365d85d11..dc6f75d3ac6e 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -521,6 +521,15 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) return -EINVAL; } + /* + * Map complete page to the paste address. So the user + * space should pass 0ULL to the offset parameter. + */ + if (vma->vm_pgoff) { + pr_debug("Page offset unsupported to map paste address\n"); + return -EINVAL; + } + /* Ensure instance has an open send window */ if (!txwin) { pr_err("No send window open?\n"); -- cgit v1.2.3 From 0b3bc018e86afdc0cbfef61328c63d5c08f8b370 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Sat, 7 Jun 2025 01:07:37 +1200 Subject: x86/virt/tdx: Avoid indirect calls to TDX assembly functions Two 'static inline' TDX helper functions (sc_retry() and sc_retry_prerr()) take function pointer arguments which refer to assembly functions. Normally, the compiler inlines the TDX helper, realizes that the function pointer targets are completely static -- thus can be resolved at compile time -- and generates direct call instructions. But, other times (like when CONFIG_CC_OPTIMIZE_FOR_SIZE=y), the compiler declines to inline the helpers and will instead generate indirect call instructions. Indirect calls to assembly functions require special annotation (for various Control Flow Integrity mechanisms). But TDX assembly functions lack the special annotations and can only be called directly. Annotate both the helpers as '__always_inline' to prod the compiler into maintaining the direct calls. There is no guarantee here, but Peter has volunteered to report the compiler bug if this assumption ever breaks[1]. Fixes: 1e66a7e27539 ("x86/virt/tdx: Handle SEAMCALL no entropy error in common code") Fixes: df01f5ae07dd ("x86/virt/tdx: Add SEAMCALL error printing for module initialization") Signed-off-by: Kai Huang Signed-off-by: Dave Hansen Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/20250605145914.GW39944@noisy.programming.kicks-ass.net/ [1] Link: https://lore.kernel.org/all/20250606130737.30713-1-kai.huang%40intel.com --- arch/x86/include/asm/tdx.h | 2 +- arch/x86/virt/vmx/tdx/tdx.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 8b19294600c4..7ddef3a69866 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -106,7 +106,7 @@ void tdx_init(void); typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args); -static inline u64 sc_retry(sc_func_t func, u64 fn, +static __always_inline u64 sc_retry(sc_func_t func, u64 fn, struct tdx_module_args *args) { int retry = RDRAND_RETRY_LOOPS; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 2457d13c3f9e..c7a9a087ccaf 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -75,8 +75,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err, args->r9, args->r10, args->r11); } -static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func, - u64 fn, struct tdx_module_args *args) +static __always_inline int sc_retry_prerr(sc_func_t func, + sc_err_func_t err_func, + u64 fn, struct tdx_module_args *args) { u64 sret = sc_retry(func, fn, args); -- cgit v1.2.3 From 1dbf30fdb5e57fb2c39f17f35f2b544d5de34397 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 3 Jun 2025 14:14:41 +0300 Subject: x86/mm/pat: don't collapse pages without PSE set Collapsing pages to a leaf PMD or PUD should be done only if X86_FEATURE_PSE is available, which is not the case when running e.g. as a Xen PV guest. Fixes: 41d88484c71c ("x86/mm/pat: restore large ROX pages after fragmentation") Signed-off-by: Juergen Gross Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250528123557.12847-3-jgross@suse.com --- arch/x86/mm/pat/set_memory.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 46edc11726b7..8834c76f91c9 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, pgprot_t pgprot; int i = 0; + if (!cpu_feature_enabled(X86_FEATURE_PSE)) + return 0; + addr &= PMD_MASK; pte = pte_offset_kernel(pmd, addr); first = *pte; -- cgit v1.2.3 From 47410d839fcda6890cb82828f874f97710982f24 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:42 +0300 Subject: x86/Kconfig: only enable ROX cache in execmem when STRICT_MODULE_RWX is set Currently ROX cache in execmem is enabled regardless of STRICT_MODULE_RWX setting. This breaks an assumption that module memory is writable when STRICT_MODULE_RWX is disabled, for instance for kernel debuggin. Only enable ROX cache in execmem when STRICT_MODULE_RWX is set to restore the original behaviour of module text permissions. Fixes: 64f6a4e10c05 ("x86: re-enable EXECMEM_ROX support") Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-3-rppt@kernel.org --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 340e5468980e..71019b3b54ea 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -89,7 +89,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE - select ARCH_HAS_EXECMEM_ROX if X86_64 + select ARCH_HAS_EXECMEM_ROX if X86_64 && STRICT_MODULE_RWX select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL -- cgit v1.2.3 From 0b0cae7119a0ec9449d7261b5e672a5fed765068 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:43 +0300 Subject: x86/its: move its_pages array to struct mod_arch_specific The of pages with ITS thunks allocated for modules are tracked by an array in 'struct module'. Since this is very architecture specific data structure, move it to 'struct mod_arch_specific'. No functional changes. Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org --- arch/x86/include/asm/module.h | 8 ++++++++ arch/x86/kernel/alternative.c | 19 ++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index e988bac0a4a1..3c2de4ce3b10 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,12 +5,20 @@ #include #include +struct its_array { +#ifdef CONFIG_MITIGATION_ITS + void **pages; + int num; +#endif +}; + struct mod_arch_specific { #ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; #endif + struct its_array its_pages; }; #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ecfe7b497cad..b50fe6ce4655 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -173,8 +173,8 @@ void its_fini_mod(struct module *mod) its_page = NULL; mutex_unlock(&text_mutex); - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_restore_rox(page, PAGE_SIZE); } } @@ -184,11 +184,11 @@ void its_free_mod(struct module *mod) if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; - for (int i = 0; i < mod->its_num_pages; i++) { - void *page = mod->its_page_array[i]; + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; execmem_free(page); } - kfree(mod->its_page_array); + kfree(mod->arch.its_pages.pages); } #endif /* CONFIG_MODULES */ @@ -201,14 +201,15 @@ static void *its_alloc(void) #ifdef CONFIG_MODULES if (its_mod) { - void *tmp = krealloc(its_mod->its_page_array, - (its_mod->its_num_pages+1) * sizeof(void *), + struct its_array *pages = &its_mod->arch.its_pages; + void *tmp = krealloc(pages->pages, + (pages->num+1) * sizeof(void *), GFP_KERNEL); if (!tmp) return NULL; - its_mod->its_page_array = tmp; - its_mod->its_page_array[its_mod->its_num_pages++] = page; + pages->pages = tmp; + pages->pages[pages->num++] = page; execmem_make_temp_rw(page, PAGE_SIZE); } -- cgit v1.2.3 From a82b26451de126a5ae130361081986bc459afe9b Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 3 Jun 2025 14:14:44 +0300 Subject: x86/its: explicitly manage permissions for ITS pages execmem_alloc() sets permissions differently depending on the kernel configuration, CPU support for PSE and whether a page is allocated before or after mark_rodata_ro(). Add tracking for pages allocated for ITS when patching the core kernel and make sure the permissions for ITS pages are explicitly managed for both kernel and module allocations. Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Signed-off-by: Peter Zijlstra (Intel) Co-developed-by: Mike Rapoport (Microsoft) Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Nikolay Borisov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-5-rppt@kernel.org --- arch/x86/kernel/alternative.c | 74 ++++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b50fe6ce4655..6455f7f751b3 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -116,6 +116,24 @@ static struct module *its_mod; #endif static void *its_page; static unsigned int its_offset; +struct its_array its_pages; + +static void *__its_alloc(struct its_array *pages) +{ + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + if (!page) + return NULL; + + void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + pages->pages = tmp; + pages->pages[pages->num++] = page; + + return no_free_ptr(page); +} /* Initialize a thunk with the "jmp *reg; int3" instructions. */ static void *its_init_thunk(void *thunk, int reg) @@ -151,6 +169,21 @@ static void *its_init_thunk(void *thunk, int reg) return thunk + offset; } +static void its_pages_protect(struct its_array *pages) +{ + for (int i = 0; i < pages->num; i++) { + void *page = pages->pages[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} + +static void its_fini_core(void) +{ + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + its_pages_protect(&its_pages); + kfree(its_pages.pages); +} + #ifdef CONFIG_MODULES void its_init_mod(struct module *mod) { @@ -173,10 +206,8 @@ void its_fini_mod(struct module *mod) its_page = NULL; mutex_unlock(&text_mutex); - for (int i = 0; i < mod->arch.its_pages.num; i++) { - void *page = mod->arch.its_pages.pages[i]; - execmem_restore_rox(page, PAGE_SIZE); - } + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + its_pages_protect(&mod->arch.its_pages); } void its_free_mod(struct module *mod) @@ -194,28 +225,23 @@ void its_free_mod(struct module *mod) static void *its_alloc(void) { - void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + struct its_array *pages = &its_pages; + void *page; +#ifdef CONFIG_MODULE + if (its_mod) + pages = &its_mod->arch.its_pages; +#endif + + page = __its_alloc(pages); if (!page) return NULL; -#ifdef CONFIG_MODULES - if (its_mod) { - struct its_array *pages = &its_mod->arch.its_pages; - void *tmp = krealloc(pages->pages, - (pages->num+1) * sizeof(void *), - GFP_KERNEL); - if (!tmp) - return NULL; - - pages->pages = tmp; - pages->pages[pages->num++] = page; + execmem_make_temp_rw(page, PAGE_SIZE); + if (pages == &its_pages) + set_memory_x((unsigned long)page, 1); - execmem_make_temp_rw(page, PAGE_SIZE); - } -#endif /* CONFIG_MODULES */ - - return no_free_ptr(page); + return page; } static void *its_allocate_thunk(int reg) @@ -269,7 +295,9 @@ u8 *its_static_thunk(int reg) return thunk; } -#endif +#else +static inline void its_fini_core(void) {} +#endif /* CONFIG_MITIGATION_ITS */ /* * Nomenclature for variable names to simplify and clarify this code and ease @@ -2339,6 +2367,8 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); + its_fini_core(); + /* * Adjust all CALL instructions to point to func()-10, including * those in .altinstr_replacement. -- cgit v1.2.3 From 7cd9a11dd0c3d1dd225795ed1b5b53132888e7b5 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Tue, 3 Jun 2025 14:14:45 +0300 Subject: Revert "mm/execmem: Unify early execmem_cache behaviour" The commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache behaviour") changed early behaviour of execemem ROX cache to allow its usage in early x86 code that allocates text pages when CONFIG_MITGATION_ITS is enabled. The permission management of the pages allocated from execmem for ITS mitigation is now completely contained in arch/x86/kernel/alternatives.c and therefore there is no need to special case early allocations in execmem. This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414. Signed-off-by: Mike Rapoport (Microsoft) Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org --- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 607d6a2e66e2..8a34fff6ab2b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include @@ -749,8 +748,6 @@ void mark_rodata_ro(void) pr_info("Write protecting kernel text and read-only data: %luk\n", size >> 10); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; #ifdef CONFIG_CPA_DEBUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ee66fae9ebcc..fdb6cab524f0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include @@ -1392,8 +1391,6 @@ void mark_rodata_ro(void) (end - start) >> 10); set_memory_ro(start, (end - start) >> PAGE_SHIFT); - execmem_cache_make_ro(); - kernel_set_to_readonly = 1; /* -- cgit v1.2.3 From 179a8427fcbffe36ccfed5e138d7c9b6180caff9 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 12 May 2025 22:16:34 +0000 Subject: KVM: SEV: Disable SEV-SNP support on initialization failure During platform init, SNP initialization may fail for several reasons, such as firmware command failures and incompatible versions. However, the KVM capability may continue to advertise support for it. The platform may have SNP enabled but if SNP_INIT fails then SNP is not supported by KVM. During KVM module initialization query the SNP platform status to obtain the SNP initialization state and use it as an additional condition to determine support for SEV-SNP. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Co-developed-by: Pratik R. Sampat Signed-off-by: Pratik R. Sampat Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Reviewed-by: Pankaj Gupta Reviewed-by: Pavan Kumar Paluri Message-ID: <20250512221634.12045-1-Ashish.Kalra@amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5a69b657dae9..459c3b791fd4 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2871,6 +2871,33 @@ void __init sev_set_cpu_caps(void) } } +static bool is_sev_snp_initialized(void) +{ + struct sev_user_data_snp_status *status; + struct sev_data_snp_addr buf; + bool initialized = false; + int ret, error = 0; + + status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO); + if (!status) + return false; + + buf.address = __psp_pa(status); + ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error); + if (ret) { + pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n", + ret, error, error); + goto out; + } + + initialized = !!status->state; + +out: + snp_free_firmware_page(status); + + return initialized; +} + void __init sev_hardware_setup(void) { unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count; @@ -2975,6 +3002,14 @@ void __init sev_hardware_setup(void) sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP); out: + if (sev_enabled) { + init_args.probe = true; + if (sev_platform_init(&init_args)) + sev_supported = sev_es_supported = sev_snp_supported = false; + else if (sev_snp_supported) + sev_snp_supported = is_sev_snp_initialized(); + } + if (boot_cpu_has(X86_FEATURE_SEV)) pr_info("SEV %s (ASIDs %u - %u)\n", sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" : @@ -3001,15 +3036,6 @@ out: sev_supported_vmsa_features = 0; if (sev_es_debug_swap_enabled) sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP; - - if (!sev_enabled) - return; - - /* - * Do both SNP and SEV initialization at KVM module load. - */ - init_args.probe = true; - sev_platform_init(&init_args); } void sev_hardware_unsetup(void) -- cgit v1.2.3 From 403d1338a4a59cfebb4ded53fa35fbd5119f36b1 Mon Sep 17 00:00:00 2001 From: Magnus Lindholm Date: Tue, 18 Feb 2025 18:55:14 +0100 Subject: mm: pgtable: fix pte_swp_exclusive Make pte_swp_exclusive return bool instead of int. This will better reflect how pte_swp_exclusive is actually used in the code. This fixes swap/swapoff problems on Alpha due pte_swp_exclusive not returning correct values when _PAGE_SWP_EXCLUSIVE bit resides in upper 32-bits of PTE (like on alpha). Suggested-by: Al Viro Signed-off-by: Magnus Lindholm Cc: Sam James Link: https://lore.kernel.org/lkml/20250218175735.19882-2-linmag7@gmail.com/ Link: https://lore.kernel.org/lkml/20250602041118.GA2675383@ZenIV/ [ Applied as the 'sed' script Al suggested - Linus ] Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgtable.h | 2 +- arch/arc/include/asm/pgtable-bits-arcv2.h | 2 +- arch/arm/include/asm/pgtable.h | 2 +- arch/arm64/include/asm/pgtable.h | 2 +- arch/csky/include/asm/pgtable.h | 2 +- arch/hexagon/include/asm/pgtable.h | 2 +- arch/loongarch/include/asm/pgtable.h | 2 +- arch/m68k/include/asm/mcf_pgtable.h | 2 +- arch/m68k/include/asm/motorola_pgtable.h | 2 +- arch/m68k/include/asm/sun3_pgtable.h | 2 +- arch/microblaze/include/asm/pgtable.h | 2 +- arch/mips/include/asm/pgtable.h | 4 ++-- arch/nios2/include/asm/pgtable.h | 2 +- arch/openrisc/include/asm/pgtable.h | 2 +- arch/parisc/include/asm/pgtable.h | 2 +- arch/powerpc/include/asm/book3s/32/pgtable.h | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/s390/include/asm/pgtable.h | 2 +- arch/sh/include/asm/pgtable_32.h | 2 +- arch/sparc/include/asm/pgtable_32.h | 2 +- arch/sparc/include/asm/pgtable_64.h | 2 +- arch/um/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- arch/xtensa/include/asm/pgtable.h | 2 +- 26 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch') diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index 2676017f42f1..44e7aedac6e8 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -327,7 +327,7 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 8ebec1b21d24..3084c53f402d 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -130,7 +130,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 7f1c3b4e3e04..86378eec7757 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -301,7 +301,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(swp) __pte((swp).val) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_isset(pte, L_PTE_SWP_EXCLUSIVE); } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 88db8a0c0b37..192d86e1cc76 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -563,7 +563,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & PTE_SWP_EXCLUSIVE; } diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index b8378431aeff..5a394be09c35 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -200,7 +200,7 @@ static inline pte_t pte_mkyoung(pte_t pte) return pte; } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 9fbdfdbc539f..fbf24d1d1ca6 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -387,7 +387,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) (((type & 0x1f) << 1) | \ ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index a3f17914dbab..b30185302c07 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -301,7 +301,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset) #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) }) #define __swp_entry_to_pmd(x) ((pmd_t) { (x).val | _PAGE_HUGE }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index f5c596b211d4..d79fef609194 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -268,7 +268,7 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) (__pte((x).val)) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index 040ac3bad713..14fee64d3e60 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -185,7 +185,7 @@ extern pgd_t kernel_pg_dir[128]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 73745dc0ec0e..858cbe936f5b 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -169,7 +169,7 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index b1bb2c65dd04..bae1abfa6f6b 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -398,7 +398,7 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 2 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 2 }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 4852b005a72d..ae73ecf4c41a 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -534,7 +534,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #endif #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte.pte_low & _PAGE_SWP_EXCLUSIVE; } @@ -551,7 +551,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } #else -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index e98578e27e26..844dce55569f 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -259,7 +259,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __swp_entry_to_pte(swp) ((pte_t) { (swp).val }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 71bfb8c8c482..5bd6463bd514 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -411,7 +411,7 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 80f5e2a28413..1a86a4370b29 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -425,7 +425,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 42c3af90d1f0..92d21c6faf1e 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -365,7 +365,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) >> 3 }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val << 3 }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 6ed93e290c2f..a2ddcbb3fcb9 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -693,7 +693,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte) return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SWP_EXCLUSIVE)); } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 8d1f0b7062eb..7d6b9e5b286e 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -286,7 +286,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a11816bbf9e7..438ce7df24c3 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -1028,7 +1028,7 @@ static inline pud_t pud_modify(pud_t pud, pgprot_t newprot) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1c661ac62ce8..6d8bc27a366e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -915,7 +915,7 @@ static inline int pmd_protnone(pmd_t pmd) } #endif -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index 71b18741cc11..5f51af18997b 100644 --- a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -470,7 +470,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) /* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. */ #define _PAGE_SWP_EXCLUSIVE _PAGE_USER -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte.pte_low & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 1454ebe91539..7c199c003ffe 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -348,7 +348,7 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & SRMMU_SWP_EXCLUSIVE; } diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 4af03e3c161b..669cd02469a1 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1023,7 +1023,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index ca2a519d53ab..24fdea6f88c3 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -314,7 +314,7 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); ((swp_entry_t) { pte_val(pte_mkuptodate(pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 774430c3abff..97954c936c54 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1561,7 +1561,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte) return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); } -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; } diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index cb1725c40e36..d62aa1c316fc 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -349,7 +349,7 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -static inline int pte_swp_exclusive(pte_t pte) +static inline bool pte_swp_exclusive(pte_t pte) { return pte_val(pte) & _PAGE_SWP_EXCLUSIVE; } -- cgit v1.2.3 From aa2024c01a9afba6728b362626f868811ca872ee Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 Jun 2025 20:10:18 -0400 Subject: KVM: x86/mmu: Embed direct bits into gpa for KVM_PRE_FAULT_MEMORY Bug[*] reported for TDX case when enabling KVM_PRE_FAULT_MEMORY in QEMU. It turns out that @gpa passed to kvm_mmu_do_page_fault() doesn't have shared bit set when the memory attribute of it is shared, and it leads to wrong root in tdp_mmu_get_root_for_fault(). Fix it by embedding the direct bits in the gpa that is passed to kvm_tdp_map_page(), when the memory of the gpa is not private. [*] https://lore.kernel.org/qemu-devel/4a757796-11c2-47f1-ae0d-335626e818fd@intel.com/ Reported-by: Xiaoyao Li Closes: https://lore.kernel.org/qemu-devel/4a757796-11c2-47f1-ae0d-335626e818fd@intel.com/ Signed-off-by: Paolo Bonzini Signed-off-by: Xiaoyao Li Message-ID: <20250611001018.2179964-1-xiaoyao.li@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index cbc84c6abc2e..a4040578b537 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4896,6 +4896,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, { u64 error_code = PFERR_GUEST_FINAL_MASK; u8 level = PG_LEVEL_4K; + u64 direct_bits; u64 end; int r; @@ -4910,15 +4911,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, if (r) return r; + direct_bits = 0; if (kvm_arch_has_private_mem(vcpu->kvm) && kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa))) error_code |= PFERR_PRIVATE_ACCESS; + else + direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm)); /* * Shadow paging uses GVA for kvm page fault, so restrict to * two-dimensional paging. */ - r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level); + r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level); if (r < 0) return r; -- cgit v1.2.3 From 8046d29dde17002523f94d3e6e0ebe486ce52166 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 12 Jun 2025 00:47:44 -0400 Subject: KVM: x86/mmu: Reject direct bits in gpa passed to KVM_PRE_FAULT_MEMORY Only let userspace pass the same addresses that were used in KVM_SET_USER_MEMORY_REGION (or KVM_SET_USER_MEMORY_REGION2); gpas in the the upper half of the address space are an implementation detail of TDX and KVM. Extracted from a patch by Sean Christopherson . Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a4040578b537..4e06e2e89a8f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4903,6 +4903,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, if (!vcpu->kvm->arch.pre_fault_allowed) return -EOPNOTSUPP; + if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa))) + return -EINVAL; + /* * reload is efficient when called repeatedly, so we can do it on * every iteration. -- cgit v1.2.3 From 650768c512faba8070bf4cfbb28c95eb5cd203f3 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 27 May 2025 13:56:33 +0530 Subject: arm64: Restrict pagetable teardown to avoid false warning Commit 9c006972c3fe ("arm64: mmu: drop pXd_present() checks from pXd_free_pYd_table()") removes the pxd_present() checks because the caller checks pxd_present(). But, in case of vmap_try_huge_pud(), the caller only checks pud_present(); pud_free_pmd_page() recurses on each pmd through pmd_free_pte_page(), wherein the pmd may be none. Thus it is possible to hit a warning in the latter, since pmd_none => !pmd_table(). Thus, add a pmd_present() check in pud_free_pmd_page(). This problem was found by code inspection. Fixes: 9c006972c3fe ("arm64: mmu: drop pXd_present() checks from pXd_free_pYd_table()") Cc: stable@vger.kernel.org Reported-by: Ryan Roberts Acked-by: David Hildenbrand Signed-off-by: Dev Jain Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Reviewed-by: Ryan Roberts Link: https://lore.kernel.org/r/20250527082633.61073-1-dev.jain@arm.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8fcf59ba39db..00ab1d648db6 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1305,7 +1305,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr) next = addr; end = addr + PUD_SIZE; do { - pmd_free_pte_page(pmdp, next); + if (pmd_present(pmdp_get(pmdp))) + pmd_free_pte_page(pmdp, next); } while (pmdp++, next += PMD_SIZE, next != end); pud_clear(pudp); -- cgit v1.2.3 From d2be3270f40b1330f8c1c9512c59dd085a758ac0 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 11 Jun 2025 17:28:13 +0100 Subject: arm64/gcs: Don't call gcs_free() during flush_gcs() Currently we call gcs_free() during flush_gcs() to reset the thread state for GCS. This includes unmapping any kernel allocated GCS, but this is redundant when doing a flush_thread() since we are reinitialising the thread memory too. Inline the reinitialisation of the thread struct. Signed-off-by: Mark Brown Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20250611-arm64-gcs-flush-thread-v1-1-cc26feeddabd@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a5ca15daeb8a..5954cec19660 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -288,7 +288,9 @@ static void flush_gcs(void) if (!system_supports_gcs()) return; - gcs_free(current); + current->thread.gcspr_el0 = 0; + current->thread.gcs_base = 0; + current->thread.gcs_size = 0; current->thread.gcs_el0_mode = 0; write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1); write_sysreg_s(0, SYS_GCSPR_EL0); -- cgit v1.2.3 From 39dfc971e42d886e7df01371cd1bef505076d84c Mon Sep 17 00:00:00 2001 From: Tengda Wu Date: Wed, 4 Jun 2025 00:55:33 +0000 Subject: arm64/ptrace: Fix stack-out-of-bounds read in regs_get_kernel_stack_nth() KASAN reports a stack-out-of-bounds read in regs_get_kernel_stack_nth(). Call Trace: [ 97.283505] BUG: KASAN: stack-out-of-bounds in regs_get_kernel_stack_nth+0xa8/0xc8 [ 97.284677] Read of size 8 at addr ffff800089277c10 by task 1.sh/2550 [ 97.285732] [ 97.286067] CPU: 7 PID: 2550 Comm: 1.sh Not tainted 6.6.0+ #11 [ 97.287032] Hardware name: linux,dummy-virt (DT) [ 97.287815] Call trace: [ 97.288279] dump_backtrace+0xa0/0x128 [ 97.288946] show_stack+0x20/0x38 [ 97.289551] dump_stack_lvl+0x78/0xc8 [ 97.290203] print_address_description.constprop.0+0x84/0x3c8 [ 97.291159] print_report+0xb0/0x280 [ 97.291792] kasan_report+0x84/0xd0 [ 97.292421] __asan_load8+0x9c/0xc0 [ 97.293042] regs_get_kernel_stack_nth+0xa8/0xc8 [ 97.293835] process_fetch_insn+0x770/0xa30 [ 97.294562] kprobe_trace_func+0x254/0x3b0 [ 97.295271] kprobe_dispatcher+0x98/0xe0 [ 97.295955] kprobe_breakpoint_handler+0x1b0/0x210 [ 97.296774] call_break_hook+0xc4/0x100 [ 97.297451] brk_handler+0x24/0x78 [ 97.298073] do_debug_exception+0xac/0x178 [ 97.298785] el1_dbg+0x70/0x90 [ 97.299344] el1h_64_sync_handler+0xcc/0xe8 [ 97.300066] el1h_64_sync+0x78/0x80 [ 97.300699] kernel_clone+0x0/0x500 [ 97.301331] __arm64_sys_clone+0x70/0x90 [ 97.302084] invoke_syscall+0x68/0x198 [ 97.302746] el0_svc_common.constprop.0+0x11c/0x150 [ 97.303569] do_el0_svc+0x38/0x50 [ 97.304164] el0_svc+0x44/0x1d8 [ 97.304749] el0t_64_sync_handler+0x100/0x130 [ 97.305500] el0t_64_sync+0x188/0x190 [ 97.306151] [ 97.306475] The buggy address belongs to stack of task 1.sh/2550 [ 97.307461] and is located at offset 0 in frame: [ 97.308257] __se_sys_clone+0x0/0x138 [ 97.308910] [ 97.309241] This frame has 1 object: [ 97.309873] [48, 184) 'args' [ 97.309876] [ 97.310749] The buggy address belongs to the virtual mapping at [ 97.310749] [ffff800089270000, ffff800089279000) created by: [ 97.310749] dup_task_struct+0xc0/0x2e8 [ 97.313347] [ 97.313674] The buggy address belongs to the physical page: [ 97.314604] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14f69a [ 97.315885] flags: 0x15ffffe00000000(node=1|zone=2|lastcpupid=0xfffff) [ 97.316957] raw: 015ffffe00000000 0000000000000000 dead000000000122 0000000000000000 [ 97.318207] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 [ 97.319445] page dumped because: kasan: bad access detected [ 97.320371] [ 97.320694] Memory state around the buggy address: [ 97.321511] ffff800089277b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 97.322681] ffff800089277b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 [ 97.323846] >ffff800089277c00: 00 00 f1 f1 f1 f1 f1 f1 00 00 00 00 00 00 00 00 [ 97.325023] ^ [ 97.325683] ffff800089277c80: 00 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 f3 f3 [ 97.326856] ffff800089277d00: f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 This issue seems to be related to the behavior of some gcc compilers and was also fixed on the s390 architecture before: commit d93a855c31b7 ("s390/ptrace: Avoid KASAN false positives in regs_get_kernel_stack_nth()") As described in that commit, regs_get_kernel_stack_nth() has confirmed that `addr` is on the stack, so reading the value at `*addr` should be allowed. Use READ_ONCE_NOCHECK() helper to silence the KASAN check for this case. Fixes: 0a8ea52c3eb1 ("arm64: Add HAVE_REGS_AND_STACK_ACCESS_API feature") Signed-off-by: Tengda Wu Link: https://lore.kernel.org/r/20250604005533.1278992-1-wutengda@huaweicloud.com [will: Use '*addr' as the argument to READ_ONCE_NOCHECK()] Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index a360e52db02f..ee94b72bf8fb 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -141,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) addr += n; if (regs_within_kernel_stack(regs, (unsigned long)addr)) - return *addr; + return READ_ONCE_NOCHECK(*addr); else return 0; } -- cgit v1.2.3 From b93755f408325170edb2156c6a894ed1cae5f4f6 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 12 May 2025 20:14:55 +0200 Subject: powerpc/vdso: Fix build of VDSO32 with pcrel Building vdso32 on power10 with pcrel leads to following errors: VDSO32A arch/powerpc/kernel/vdso/gettimeofday-32.o arch/powerpc/kernel/vdso/gettimeofday.S: Assembler messages: arch/powerpc/kernel/vdso/gettimeofday.S:40: Error: syntax error; found `@', expected `,' arch/powerpc/kernel/vdso/gettimeofday.S:71: Info: macro invoked from here arch/powerpc/kernel/vdso/gettimeofday.S:40: Error: junk at end of line: `@notoc' arch/powerpc/kernel/vdso/gettimeofday.S:71: Info: macro invoked from here ... make[2]: *** [arch/powerpc/kernel/vdso/Makefile:85: arch/powerpc/kernel/vdso/gettimeofday-32.o] Error 1 make[1]: *** [arch/powerpc/Makefile:388: vdso_prepare] Error 2 Once the above is fixed, the following happens: VDSO32C arch/powerpc/kernel/vdso/vgettimeofday-32.o cc1: error: '-mpcrel' requires '-mcmodel=medium' make[2]: *** [arch/powerpc/kernel/vdso/Makefile:89: arch/powerpc/kernel/vdso/vgettimeofday-32.o] Error 1 make[1]: *** [arch/powerpc/Makefile:388: vdso_prepare] Error 2 make: *** [Makefile:251: __sub-make] Error 2 Make sure pcrel version of CFUNC() macro is used only for powerpc64 builds and remove -mpcrel for powerpc32 builds. Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL addresing") Signed-off-by: Christophe Leroy Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/1fa3453f07d42a50a70114da9905bf7b73304fca.1747073669.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 2 +- arch/powerpc/kernel/vdso/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 02897f4b0dbf..b891910fce8a 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -183,7 +183,7 @@ /* * Used to name C functions called from asm */ -#ifdef CONFIG_PPC_KERNEL_PCREL +#if defined(__powerpc64__) && defined(CONFIG_PPC_KERNEL_PCREL) #define CFUNC(name) name@notoc #else #define CFUNC(name) name diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile index e8824f933326..8834dfe9d727 100644 --- a/arch/powerpc/kernel/vdso/Makefile +++ b/arch/powerpc/kernel/vdso/Makefile @@ -53,7 +53,7 @@ ldflags-$(CONFIG_LD_ORPHAN_WARN) += -Wl,--orphan-handling=$(CONFIG_LD_ORPHAN_WAR ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS)) CC32FLAGS := -m32 -CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc +CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc -mpcrel ifdef CONFIG_CC_IS_CLANG # This flag is supported by clang for 64-bit but not 32-bit so it will cause # an unused command line flag warning for this file. -- cgit v1.2.3 From 33bc69cf6655cf60829a803a45275f11a74899e5 Mon Sep 17 00:00:00 2001 From: Narayana Murty N Date: Thu, 8 May 2025 02:29:28 -0400 Subject: powerpc/eeh: Fix missing PE bridge reconfiguration during VFIO EEH recovery VFIO EEH recovery for PCI passthrough devices fails on PowerNV and pseries platforms due to missing host-side PE bridge reconfiguration. In the current implementation, eeh_pe_configure() only performs RTAS or OPAL-based bridge reconfiguration for native host devices, but skips it entirely for PEs managed through VFIO in guest passthrough scenarios. This leads to incomplete EEH recovery when a PCI error affects a passthrough device assigned to a QEMU/KVM guest. Although VFIO triggers the EEH recovery flow through VFIO_EEH_PE_ENABLE ioctl, the platform-specific bridge reconfiguration step is silently bypassed. As a result, the PE's config space is not fully restored, causing subsequent config space access failures or EEH freeze-on-access errors inside the guest. This patch fixes the issue by ensuring that eeh_pe_configure() always invokes the platform's configure_bridge() callback (e.g., pseries_eeh_phb_configure_bridge) even for VFIO-managed PEs. This ensures that RTAS or OPAL calls to reconfigure the PE bridge are correctly issued on the host side, restoring the PE's configuration space after an EEH event. This fix is essential for reliable EEH recovery in QEMU/KVM guests using VFIO PCI passthrough on PowerNV and pseries systems. Tested with: - QEMU/KVM guest using VFIO passthrough (IBM Power9,(lpar)Power11 host) - Injected EEH errors with pseries EEH errinjct tool on host, recovery verified on qemu guest. - Verified successful config space access and CAP_EXP DevCtl restoration after recovery Fixes: 212d16cdca2d ("powerpc/eeh: EEH support for VFIO PCI device") Signed-off-by: Narayana Murty N Reviewed-by: Vaibhav Jain Reviewed-by: Ganesh Goudar Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250508062928.146043-1-nnmlinux@linux.ibm.com --- arch/powerpc/kernel/eeh.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 83fe99861eb1..ca7f7bb2b478 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -1509,6 +1509,8 @@ int eeh_pe_configure(struct eeh_pe *pe) /* Invalid PE ? */ if (!pe) return -ENODEV; + else + ret = eeh_ops->configure_bridge(pe); return ret; } -- cgit v1.2.3 From 4e6d080acfda5344ccbc63afe778830e22be4be9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Wed, 11 Jun 2025 23:07:49 +0200 Subject: powerpc/microwatt: Fix model property in device tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The standard property for the model name is called "model". Signed-off-by: J. Neuschäfer Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250611-microwatt-v2-1-80847bbc5f9c@posteo.net --- arch/powerpc/boot/dts/microwatt.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts index c4e4d2a9b460..b7eac4e56019 100644 --- a/arch/powerpc/boot/dts/microwatt.dts +++ b/arch/powerpc/boot/dts/microwatt.dts @@ -4,7 +4,7 @@ / { #size-cells = <0x02>; #address-cells = <0x02>; - model-name = "microwatt"; + model = "microwatt"; compatible = "microwatt-soc"; aliases { -- cgit v1.2.3 From e75cb6010838f61b9d63e921d1763a8ab177e38e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= Date: Wed, 11 Jun 2025 21:01:01 +0200 Subject: powerpc: dts: mpc8315erdb: Add GPIO controller node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MPC8315E SoC and variants have a GPIO controller at IMMR + 0xc00. This node was previously missing from the device tree. Signed-off-by: J. Neuschäfer Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250611-mpc-gpio-v1-1-02d1f75336e2@posteo.net --- arch/powerpc/boot/dts/mpc8315erdb.dts | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/boot/dts/mpc8315erdb.dts b/arch/powerpc/boot/dts/mpc8315erdb.dts index e09b37d7489d..a89cb3139ca8 100644 --- a/arch/powerpc/boot/dts/mpc8315erdb.dts +++ b/arch/powerpc/boot/dts/mpc8315erdb.dts @@ -6,6 +6,7 @@ */ /dts-v1/; +#include / { compatible = "fsl,mpc8315erdb"; @@ -358,6 +359,15 @@ interrupt-parent = <&ipic>; fsl,mpc8313-wakeup-timer = <>m1>; }; + + gpio: gpio-controller@c00 { + compatible = "fsl,mpc8314-gpio"; + reg = <0xc00 0x100>; + interrupts = <74 IRQ_TYPE_LEVEL_LOW>; + interrupt-parent = <&ipic>; + gpio-controller; + #gpio-cells = <2>; + }; }; pci0: pci@e0008500 { -- cgit v1.2.3 From b0823d5fbacb1c551d793cbfe7af24e0d1fa45ed Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 12 Jun 2025 07:38:18 -0700 Subject: perf/x86/intel: Fix crash in icl_update_topdown_event() The perf_fuzzer found a hard-lockup crash on a RaptorLake machine: Oops: general protection fault, maybe for address 0xffff89aeceab400: 0000 CPU: 23 UID: 0 PID: 0 Comm: swapper/23 Tainted: [W]=WARN Hardware name: Dell Inc. Precision 9660/0VJ762 RIP: 0010:native_read_pmc+0x7/0x40 Code: cc e8 8d a9 01 00 48 89 03 5b cd cc cc cc cc 0f 1f ... RSP: 000:fffb03100273de8 EFLAGS: 00010046 .... Call Trace: icl_update_topdown_event+0x165/0x190 ? ktime_get+0x38/0xd0 intel_pmu_read_event+0xf9/0x210 __perf_event_read+0xf9/0x210 CPUs 16-23 are E-core CPUs that don't support the perf metrics feature. The icl_update_topdown_event() should not be invoked on these CPUs. It's a regression of commit: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read") The bug introduced by that commit is that the is_topdown_event() function is mistakenly used to replace the is_topdown_count() call to check if the topdown functions for the perf metrics feature should be invoked. Fix it. Fixes: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read") Closes: https://lore.kernel.org/lkml/352f0709-f026-cd45-e60c-60dfd97f73f3@maine.edu/ Reported-by: Vince Weaver Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Vince Weaver Cc: stable@vger.kernel.org # v6.15+ Link: https://lore.kernel.org/r/20250612143818.2889040-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 741b229f0718..c2fb729c270e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event) * If the PEBS counters snapshotting is enabled, * the topdown event is available in PEBS records. */ - if (is_topdown_event(event) && !is_pebs_counter_event_group(event)) + if (is_topdown_count(event) && !is_pebs_counter_event_group(event)) static_call(intel_pmu_update_topdown_event)(event, NULL); else intel_pmu_drain_pebs_buffer(); -- cgit v1.2.3 From ab107276607af90b13a5994997e19b7b9731e251 Mon Sep 17 00:00:00 2001 From: Madhavan Srinivasan Date: Sat, 17 May 2025 19:52:37 +0530 Subject: powerpc: Fix struct termio related ioctl macros Since termio interface is now obsolete, include/uapi/asm/ioctls.h has some constant macros referring to "struct termio", this caused build failure at userspace. In file included from /usr/include/asm/ioctl.h:12, from /usr/include/asm/ioctls.h:5, from tst-ioctls.c:3: tst-ioctls.c: In function 'get_TCGETA': tst-ioctls.c:12:10: error: invalid application of 'sizeof' to incomplete type 'struct termio' 12 | return TCGETA; | ^~~~~~ Even though termios.h provides "struct termio", trying to juggle definitions around to make it compile could introduce regressions. So better to open code it. Reported-by: Tulio Magno Suggested-by: Nicholas Piggin Tested-by: Justin M. Forbes Reviewed-by: Michael Ellerman Closes: https://lore.kernel.org/linuxppc-dev/8734dji5wl.fsf@ascii.art.br/ Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250517142237.156665-1-maddy@linux.ibm.com --- arch/powerpc/include/uapi/asm/ioctls.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/uapi/asm/ioctls.h b/arch/powerpc/include/uapi/asm/ioctls.h index 2c145da3b774..b5211e413829 100644 --- a/arch/powerpc/include/uapi/asm/ioctls.h +++ b/arch/powerpc/include/uapi/asm/ioctls.h @@ -23,10 +23,10 @@ #define TCSETSW _IOW('t', 21, struct termios) #define TCSETSF _IOW('t', 22, struct termios) -#define TCGETA _IOR('t', 23, struct termio) -#define TCSETA _IOW('t', 24, struct termio) -#define TCSETAW _IOW('t', 25, struct termio) -#define TCSETAF _IOW('t', 28, struct termio) +#define TCGETA 0x40147417 /* _IOR('t', 23, struct termio) */ +#define TCSETA 0x80147418 /* _IOW('t', 24, struct termio) */ +#define TCSETAW 0x80147419 /* _IOW('t', 25, struct termio) */ +#define TCSETAF 0x8014741c /* _IOW('t', 28, struct termio) */ #define TCSBRK _IO('t', 29) #define TCXONC _IO('t', 30) -- cgit v1.2.3 From 594902c986e269660302f09df9ec4bf1cf017b77 Mon Sep 17 00:00:00 2001 From: Qinyun Tan Date: Sat, 31 May 2025 02:20:53 +0800 Subject: x86,fs/resctrl: Remove inappropriate references to cacheinfo in the resctrl subsystem In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain structure representing a NUMA node relies on the cacheinfo interface (rdt_mon_domain::ci) to store L3 cache information (e.g., shared_cpu_map) for monitoring. The L3 cache information of a SNC NUMA node determines which domains are summed for the "top level" L3-scoped events. rdt_mon_domain::ci is initialized using the first online CPU of a NUMA node. When this CPU goes offline, its shared_cpu_map is cleared to contain only the offline CPU itself. Subsequently, attempting to read counters via smp_call_on_cpu(offline_cpu) fails (and error ignored), returning zero values for "top-level events" without any error indication. Replace the cacheinfo references in struct rdt_mon_domain and struct rmid_read with the cacheinfo ID (a unique identifier for the L3 cache). rdt_domain_hdr::cpu_mask contains the online CPUs associated with that domain. When reading "top-level events", select a CPU from rdt_domain_hdr::cpu_mask and utilize its L3 shared_cpu_map to determine valid CPUs for reading RMID counter via the MSR interface. Considering all CPUs associated with the L3 cache improves the chances of picking a housekeeping CPU on which the counter reading work can be queued, avoiding an unnecessary IPI. Fixes: 328ea68874642 ("x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files") Signed-off-by: Qinyun Tan Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Tested-by: Tony Luck Link: https://lore.kernel.org/20250530182053.37502-2-qinyuntan@linux.alibaba.com --- arch/x86/kernel/cpu/resctrl/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 7109cbfcad4f..187d527ef73b 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -498,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) struct rdt_hw_mon_domain *hw_dom; struct rdt_domain_hdr *hdr; struct rdt_mon_domain *d; + struct cacheinfo *ci; int err; lockdep_assert_held(&domain_list_lock); @@ -525,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r) d = &hw_dom->d_resctrl; d->hdr.id = id; d->hdr.type = RESCTRL_MON_DOMAIN; - d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); - if (!d->ci) { + ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE); + if (!ci) { pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name); mon_domain_free(hw_dom); return; } + d->ci_id = ci->id; cpumask_set_cpu(cpu, &d->hdr.cpu_mask); arch_mon_domain_online(r, d); -- cgit v1.2.3 From 9d4204a8106fe7dc80e3f2e440c8f2ba1ba47319 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 15 Jun 2025 18:06:54 -0700 Subject: lib/crypto/poly1305: Fix arm64's poly1305_blocks_arch() For some reason arm64's Poly1305 code got changed to ignore the padbit argument. As a result, the output is incorrect when the message length is not a multiple of 16 (which is not reached with the standard ChaCha20Poly1305, but bcachefs could reach this). Fix this. Fixes: a59e5468a921 ("crypto: arm64/poly1305 - Add block-only interface") Reported-by: Kent Overstreet Tested-by: Kent Overstreet Link: https://lore.kernel.org/r/20250616010654.367302-1-ebiggers@kernel.org Signed-off-by: Eric Biggers --- arch/arm64/lib/crypto/poly1305-glue.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c index 6a661cf04821..c9a74766785b 100644 --- a/arch/arm64/lib/crypto/poly1305-glue.c +++ b/arch/arm64/lib/crypto/poly1305-glue.c @@ -38,14 +38,14 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src, unsigned int todo = min_t(unsigned int, len, SZ_4K); kernel_neon_begin(); - poly1305_blocks_neon(state, src, todo, 1); + poly1305_blocks_neon(state, src, todo, padbit); kernel_neon_end(); len -= todo; src += todo; } while (len); } else - poly1305_blocks(state, src, len, 1); + poly1305_blocks(state, src, len, padbit); } EXPORT_SYMBOL_GPL(poly1305_blocks_arch); -- cgit v1.2.3 From 6aba0cb5bba6141158d5449f2cf53187b7f755f9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 5 Jun 2025 11:44:46 +0530 Subject: RISC-V: KVM: Fix the size parameter check in SBI SFENCE calls As-per the SBI specification, an SBI remote fence operation applies to the entire address space if either: 1) start_addr and size are both 0 2) size is equal to 2^XLEN-1 >From the above, only #1 is checked by SBI SFENCE calls so fix the size parameter check in SBI SFENCE calls to cover #2 as well. Fixes: 13acfec2dbcc ("RISC-V: KVM: Add remote HFENCE functions based on VCPU requests") Reviewed-by: Atish Patra Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250605061458.196003-2-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 5fbf3f94f1e8..9752d2ffff68 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -103,7 +103,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - if (cp->a2 == 0 && cp->a3 == 0) + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); else kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, @@ -111,7 +111,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - if (cp->a2 == 0 && cp->a3 == 0) + if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask, cp->a4); else -- cgit v1.2.3 From 2e7be162996640bbe3b6da694cc064c511b8a5d9 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 5 Jun 2025 11:44:47 +0530 Subject: RISC-V: KVM: Don't treat SBI HFENCE calls as NOPs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SBI specification clearly states that SBI HFENCE calls should return SBI_ERR_NOT_SUPPORTED when one of the target hart doesn’t support hypervisor extension (aka nested virtualization in-case of KVM RISC-V). Fixes: c7fa3c48de86 ("RISC-V: KVM: Treat SBI HFENCE calls as NOPs") Reviewed-by: Atish Patra Signed-off-by: Anup Patel Link: https://lore.kernel.org/r/20250605061458.196003-3-apatel@ventanamicro.com Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 9752d2ffff68..b17fad091bab 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -127,9 +127,9 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: /* * Until nested virtualization is implemented, the - * SBI HFENCE calls should be treated as NOPs + * SBI HFENCE calls should return not supported + * hence fallthrough. */ - break; default: retdata->err_val = SBI_ERR_NOT_SUPPORTED; } -- cgit v1.2.3 From 94a17f2dc90bc7eae36c0f478515d4bd1c23e877 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Jun 2025 15:24:20 -0700 Subject: x86/mm: Disable INVLPGB when PTI is enabled PTI uses separate ASIDs (aka. PCIDs) for kernel and user address spaces. When the kernel needs to flush the user address space, it just sets a bit in a bitmap and then flushes the entire PCID on the next switch to userspace. This bitmap is a single 'unsigned long' which is plenty for all 6 dynamic ASIDs. But, unfortunately, the INVLPGB support brings along a bunch more user ASIDs, as many as ~2k more. The bitmap can't address that many. Fortunately, the bitmap is only needed for PTI and all the CPUs with INVLPGB are AMD CPUs that aren't vulnerable to Meltdown and don't need PTI. The only way someone can run into an issue in practice is by booting with pti=on on a newer AMD CPU. Disable INVLPGB if PTI is enabled. Avoid overrunning the small bitmap. Note: this will be fixed up properly by making the bitmap bigger. For now, just avoid the mostly theoretical bug. Fixes: 4afeb0ed1753 ("x86/mm: Enable broadcast TLB invalidation for multi-threaded processes") Signed-off-by: Dave Hansen Acked-by: Rik van Riel Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250610222420.E8CBF472%40davehans-spike.ostc.intel.com --- arch/x86/mm/pti.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 190299834011..c0c40b67524e 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -98,6 +98,11 @@ void __init pti_check_boottime_disable(void) return; setup_force_cpu_cap(X86_FEATURE_PTI); + + if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { + pr_debug("PTI enabled, disabling INVLPGB\n"); + setup_clear_cpu_cap(X86_FEATURE_INVLPGB); + } } static int __init pti_parse_cmdline(char *arg) -- cgit v1.2.3 From 3c902383f2da91cba3821b73aa6edd49f4db6023 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 16 Jun 2025 12:04:32 +0200 Subject: x86/its: Fix an ifdef typo in its_alloc() Commit a82b26451de1 ("x86/its: explicitly manage permissions for ITS pages") reworks its_alloc() and introduces a typo in an ifdef conditional, referring to CONFIG_MODULE instead of CONFIG_MODULES. Fix this typo in its_alloc(). Fixes: a82b26451de1 ("x86/its: explicitly manage permissions for ITS pages") Signed-off-by: Lukas Bulwahn Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250616100432.22941-1-lukas.bulwahn%40redhat.com --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 6455f7f751b3..9ae80fa904a2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -228,7 +228,7 @@ static void *its_alloc(void) struct its_array *pages = &its_pages; void *page; -#ifdef CONFIG_MODULE +#ifdef CONFIG_MODULES if (its_mod) pages = &its_mod->arch.its_pages; #endif -- cgit v1.2.3 From cb6075bc62dc6a9cd7ab3572758685fdf78e3e20 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 6 Jun 2025 13:10:34 -0400 Subject: x86/mm: Fix early boot use of INVPLGB The INVLPGB instruction has limits on how many pages it can invalidate at once. That limit is enumerated in CPUID, read by the kernel, and stored in 'invpgb_count_max'. Ranged invalidation, like invlpgb_kernel_range_flush() break up their invalidations so that they do not exceed the limit. However, early boot code currently attempts to do ranged invalidation before populating 'invlpgb_count_max'. There is a for loop which is basically: for (...; addr < end; addr += invlpgb_count_max*PAGE_SIZE) If invlpgb_kernel_range_flush is called before the kernel has read the value of invlpgb_count_max from the hardware, the normally bounded loop can become an infinite loop if invlpgb_count_max is initialized to zero. Fix that issue by initializing invlpgb_count_max to 1. This way INVPLGB at early boot time will be a little bit slower than normal (with initialized invplgb_count_max), and not an instant hang at bootup time. Fixes: b7aa05cbdc52 ("x86/mm: Add INVLPGB support code") Signed-off-by: Rik van Riel Signed-off-by: Dave Hansen Link: https://lore.kernel.org/all/20250606171112.4013261-3-riel%40surriel.com --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 93da466dfe2c..b2ad8d13211a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,7 +31,7 @@ #include "cpu.h" -u16 invlpgb_count_max __ro_after_init; +u16 invlpgb_count_max __ro_after_init = 1; static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { -- cgit v1.2.3 From 2aebf5ee43bf0ed225a09a30cf515d9f2813b759 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Wed, 18 Jun 2025 09:05:23 +0900 Subject: x86/alternatives: Fix int3 handling failure from broken text_poke array Since smp_text_poke_single() does not expect there is another text_poke request is queued, it can make text_poke_array not sorted or cause a buffer overflow on the text_poke_array.vec[]. This will cause an Oops in int3 because of bsearch failing; CPU 0 CPU 1 CPU 2 ----- ----- ----- smp_text_poke_batch_add() smp_text_poke_single() <<-- Adds out of order [Fails o find address in text_poke_array ] OOPS! Or unhandled page fault because of a buffer overflow; CPU 0 CPU 1 ----- ----- smp_text_poke_batch_add() <<+ ... | smp_text_poke_batch_add() <<-- Adds TEXT_POKE_ARRAY_MAX times. smp_text_poke_single() { __smp_text_poke_batch_add() <<-- Adds entry at TEXT_POKE_ARRAY_MAX + 1 smp_text_poke_batch_finish() [Unhandled page fault because text_poke_array.nr_entries is overwritten] BUG! } Use smp_text_poke_batch_add() instead of __smp_text_poke_batch_add() so that it correctly flush the queue if needed. Closes: https://lore.kernel.org/all/CA+G9fYsLu0roY3DV=tKyqP7FEKbOEETRvTDhnpPxJGbA=Cg+4w@mail.gmail.com/ Fixes: c8976ade0c1b ("x86/alternatives: Simplify smp_text_poke_single() by using tp_vec and existing APIs") Reported-by: Linux Kernel Functional Testing Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Tested-by: Linux Kernel Functional Testing Link: https://lkml.kernel.org/r/\ 175020512308.3582717.13631440385506146631.stgit@mhiramat.tok.corp.google.com --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9ae80fa904a2..ea1d984166cd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -3138,6 +3138,6 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c */ void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_add(addr, opcode, len, emulate); smp_text_poke_batch_finish(); } -- cgit v1.2.3 From 8a8ff069c7ad9a359c54683329883e2432cff191 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 15 Jun 2025 16:11:38 +0100 Subject: KVM: arm64: nv: Fix tracking of shadow list registers Wei-Lin reports that the tracking of shadow list registers is majorly broken when resync'ing the L2 state after a run, as we confuse the guest's LR index with the host's, potentially losing the interrupt state. While this could be fixed by adding yet another side index to track it (Wei-Lin's fix), it may be better to refactor this code to avoid having a side index altogether, limiting the risk to introduce this class of bugs. A key observation is that the shadow index is always the number of bits in the lr_map bitmap. With that, the parallel indexing scheme can be completely dropped. While doing this, introduce a couple of helpers that abstract the index conversion and some of the LR repainting, making the whole exercise much simpler. Reported-by: Wei-Lin Chang Reviewed-by: Wei-Lin Chang Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20250614145721.2504524-1-r09922117@csie.ntu.edu.tw Link: https://lore.kernel.org/r/86qzzkc5xa.wl-maz@kernel.org --- arch/arm64/kvm/vgic/vgic-v3-nested.c | 81 +++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 39 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c index d22a8ad7bcc5..a50fb7e6841f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3-nested.c +++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c @@ -36,6 +36,11 @@ struct shadow_if { static DEFINE_PER_CPU(struct shadow_if, shadow_if); +static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx) +{ + return hweight16(shadow_if->lr_map & (BIT(idx) - 1)); +} + /* * Nesting GICv3 support * @@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) return reg; } +static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr) +{ + struct vgic_irq *irq; + + if (!(lr & ICH_LR_HW)) + return lr; + + /* We have the HW bit set, check for validity of pINTID */ + irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); + /* If there was no real mapping, nuke the HW bit */ + if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI) + lr &= ~ICH_LR_HW; + + /* Translate the virtual mapping to the real one, even if invalid */ + if (irq) { + lr &= ~ICH_LR_PHYS_ID_MASK; + lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + vgic_put_irq(vcpu->kvm, irq); + } + + return lr; +} + /* * For LRs which have HW bit set such as timer interrupts, we modify them to * have the host hardware interrupt number instead of the virtual one programmed @@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu) static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu, struct vgic_v3_cpu_if *s_cpu_if) { - unsigned long lr_map = 0; - int index = 0; + struct shadow_if *shadow_if; + + shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif); + shadow_if->lr_map = 0; for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); - struct vgic_irq *irq; if (!(lr & ICH_LR_STATE)) - lr = 0; - - if (!(lr & ICH_LR_HW)) - goto next; - - /* We have the HW bit set, check for validity of pINTID */ - irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); - if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) { - /* There was no real mapping, so nuke the HW bit */ - lr &= ~ICH_LR_HW; - if (irq) - vgic_put_irq(vcpu->kvm, irq); - goto next; - } - - /* Translate the virtual mapping to the real one */ - lr &= ~ICH_LR_PHYS_ID_MASK; - lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid); + continue; - vgic_put_irq(vcpu->kvm, irq); + lr = translate_lr_pintid(vcpu, lr); -next: - s_cpu_if->vgic_lr[index] = lr; - if (lr) { - lr_map |= BIT(i); - index++; - } + s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr; + shadow_if->lr_map |= BIT(i); } - container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map; - s_cpu_if->used_lrs = index; + s_cpu_if->used_lrs = hweight16(shadow_if->lr_map); } void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) { struct shadow_if *shadow_if = get_shadow_if(); - int i, index = 0; + int i; for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) { u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i)); struct vgic_irq *irq; if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE)) - goto next; + continue; /* * If we had a HW lr programmed by the guest hypervisor, we @@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu) */ irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr)); if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */ - goto next; + continue; - lr = __gic_v3_get_lr(index); + lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i)); if (!(lr & ICH_LR_STATE)) irq->active = false; vgic_put_irq(vcpu->kvm, irq); - next: - index++; } } @@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu) val = __vcpu_sys_reg(vcpu, ICH_LRN(i)); val &= ~ICH_LR_STATE; - val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE; + val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE; __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val); - s_cpu_if->vgic_lr[i] = 0; } - shadow_if->lr_map = 0; vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0; } -- cgit v1.2.3 From 1fbe6861a6d9a942fb8ab8677ddf1ecb86b1af60 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 11 Jun 2025 15:45:04 -0700 Subject: KVM: arm64: Explicitly treat routing entry type changes as changes Explicitly treat type differences as GSI routing changes, as comparing MSI data between two entries could get a false negative, e.g. if userspace changed the type but left the type-specific data as- Note, the same bug was fixed in x86 by commit bcda70c56f3e ("KVM: x86: Explicitly treat routing entry type changes as changes"). Fixes: 4bf3693d36af ("KVM: arm64: Unmap vLPIs affected by changes to GSI routing information") Signed-off-by: Sean Christopherson Reviewed-by: Oliver Upton Link: https://lore.kernel.org/r/20250611224604.313496-3-seanjc@google.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/arm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index de2b4e9c9f9f..38a91bb5d4c7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2764,7 +2764,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return memcmp(&old->msi, &new->msi, sizeof(new->msi)); -- cgit v1.2.3 From cade3d57e456e69f67aa9894bf89dc8678796bb7 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:12 +0100 Subject: KVM: arm64: VHE: Synchronize restore of host debug registers When KVM runs in non-protected VHE mode, there's no context synchronization event between __debug_switch_to_host() restoring the host debug registers and __kvm_vcpu_run() unmasking debug exceptions. Due to this, it's theoretically possible for the host to take an unexpected debug exception due to the stale guest configuration. This cannot happen in NVHE/HVHE mode as debug exceptions are masked in the hyp code, and the exception return to the host will provide the necessary context synchronization before debug exceptions can be taken. For now, avoid the problem by adding an ISB after VHE hyp code restores the host debug registers. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250617133718.4014181-2-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 502a5b73ee70..73881e1dc267 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -167,6 +167,9 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); + + if (has_vhe()) + isb(); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ -- cgit v1.2.3 From 257d0aa8e2502754bc758faceceb6ff59318af60 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:13 +0100 Subject: KVM: arm64: VHE: Synchronize CPTR trap deactivation Currently there is no ISB between __deactivate_cptr_traps() disabling traps that affect EL2 and fpsimd_lazy_switch_to_host() manipulating registers potentially affected by CPTR traps. When NV is not in use, this is safe because the relevant registers are only accessed when guest_owns_fp_regs() && vcpu_has_sve(vcpu), and this also implies that SVE traps affecting EL2 have been deactivated prior to __guest_entry(). When NV is in use, a guest hypervisor may have configured SVE traps for a nested context, and so it is necessary to have an ISB between __deactivate_cptr_traps() and fpsimd_lazy_switch_to_host(). Due to the current lack of an ISB, when a guest hypervisor enables SVE traps in CPTR, the host can take an unexpected SVE trap from within fpsimd_lazy_switch_to_host(), e.g. | Unhandled 64-bit el1h sync exception on CPU1, ESR 0x0000000066000000 -- SVE | CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT | Hardware name: FVP Base RevC (DT) | pstate: 604023c9 (nZCv DAIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) | pc : __kvm_vcpu_run+0x6f4/0x844 | lr : __kvm_vcpu_run+0x150/0x844 | sp : ffff800083903a60 | x29: ffff800083903a90 x28: ffff000801f4a300 x27: 0000000000000000 | x26: 0000000000000000 x25: ffff000801f90000 x24: ffff000801f900f0 | x23: ffff800081ff7720 x22: 0002433c807d623f x21: ffff000801f90000 | x20: ffff00087f730730 x19: 0000000000000000 x18: 0000000000000000 | x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 | x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 | x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 | x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff000801f90d70 | x5 : 0000000000001000 x4 : ffff8007fd739000 x3 : ffff000801f90000 | x2 : 0000000000000000 x1 : 00000000000003cc x0 : ffff800082f9d000 | Kernel panic - not syncing: Unhandled exception | CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT | Hardware name: FVP Base RevC (DT) | Call trace: | show_stack+0x18/0x24 (C) | dump_stack_lvl+0x60/0x80 | dump_stack+0x18/0x24 | panic+0x168/0x360 | __panic_unhandled+0x68/0x74 | el1h_64_irq_handler+0x0/0x24 | el1h_64_sync+0x6c/0x70 | __kvm_vcpu_run+0x6f4/0x844 (P) | kvm_arm_vcpu_enter_exit+0x64/0xa0 | kvm_arch_vcpu_ioctl_run+0x21c/0x870 | kvm_vcpu_ioctl+0x1a8/0x9d0 | __arm64_sys_ioctl+0xb4/0xf4 | invoke_syscall+0x48/0x104 | el0_svc_common.constprop.0+0x40/0xe0 | do_el0_svc+0x1c/0x28 | el0_svc+0x30/0xcc | el0t_64_sync_handler+0x10c/0x138 | el0t_64_sync+0x198/0x19c | SMP: stopping secondary CPUs | Kernel Offset: disabled | CPU features: 0x0000,000002c0,02df4fb9,97ee773f | Memory Limit: none | ---[ end Kernel panic - not syncing: Unhandled exception ]--- Fix this by adding an ISB between __deactivate_traps() and fpsimd_lazy_switch_to_host(). Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-3-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vhe/switch.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 09df2b42bc1b..20df44b49ceb 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -667,6 +667,9 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); + /* Ensure CPTR trap deactivation has taken effect */ + isb(); + fpsimd_lazy_switch_to_host(vcpu); sysreg_restore_host_state_vhe(host_ctxt); -- cgit v1.2.3 From e62dd507844fa47f0fdc29f3be5a90a83f297820 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:14 +0100 Subject: KVM: arm64: Reorganise CPTR trap manipulation The NVHE/HVHE and VHE modes have separate implementations of __activate_cptr_traps() and __deactivate_cptr_traps() in their respective switch.c files. There's some duplication of logic, and it's not currently possible to reuse this logic elsewhere. Move the logic into the common switch.h header so that it can be reused, and de-duplicate the common logic. This rework changes the way SVE traps are deactivated in VHE mode, aligning it with NVHE/HVHE modes: * Before this patch, VHE's __deactivate_cptr_traps() would unconditionally enable SVE for host EL2 (but not EL0), regardless of whether the ARM64_SVE cpucap was set. * After this patch, VHE's __deactivate_cptr_traps() will take the ARM64_SVE cpucap into account. When ARM64_SVE is not set, SVE will be trapped from EL2 and below. The old and new behaviour are both benign: * When ARM64_SVE is not set, the host will not touch SVE state, and will not reconfigure SVE traps. Host EL0 access to SVE will be trapped as expected. * When ARM64_SVE is set, the host will configure EL0 SVE traps before returning to EL0 as part of reloading the EL0 FPSIMD/SVE/SME state. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-4-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 130 ++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/switch.c | 59 --------------- arch/arm64/kvm/hyp/vhe/switch.c | 81 -------------------- 3 files changed, 130 insertions(+), 140 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 76dfda116e56..8a77fcccbcf6 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA; + + /* + * Always trap SME since it's not supported in KVM. + * TSM is RES1 if SME isn't implemented. + */ + val |= CPTR_EL2_TSM; + + if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) + val |= CPTR_EL2_TZ; + + if (!guest_owns_fp_regs()) + val |= CPTR_EL2_TFP; + + write_sysreg(val, cptr_el2); +} + +static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + /* + * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to + * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, + * except for some missing controls, such as TAM. + * In this case, CPTR_EL2.TAM has the same position with or without + * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM + * shift value for trapping the AMU accesses. + */ + u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA; + u64 cptr; + + if (guest_owns_fp_regs()) { + val |= CPACR_EL1_FPEN; + if (vcpu_has_sve(vcpu)) + val |= CPACR_EL1_ZEN; + } + + if (!vcpu_has_nv(vcpu)) + goto write; + + /* + * The architecture is a bit crap (what a surprise): an EL2 guest + * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, + * as they are RES0 in the guest's view. To work around it, trap the + * sucker using the very same bit it can't set... + */ + if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) + val |= CPTR_EL2_TCPAC; + + /* + * Layer the guest hypervisor's trap configuration on top of our own if + * we're in a nested context. + */ + if (is_hyp_ctxt(vcpu)) + goto write; + + cptr = vcpu_sanitised_cptr_el2(vcpu); + + /* + * Pay attention, there's some interesting detail here. + * + * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two + * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): + * + * - CPTR_EL2.xEN = x0, traps are enabled + * - CPTR_EL2.xEN = x1, traps are disabled + * + * In other words, bit[0] determines if guest accesses trap or not. In + * the interest of simplicity, clear the entire field if the guest + * hypervisor has traps enabled to dispel any illusion of something more + * complicated taking place. + */ + if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_FPEN; + if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) + val &= ~CPACR_EL1_ZEN; + + if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) + val |= cptr & CPACR_EL1_E0POE; + + val |= cptr & CPTR_EL2_TCPAC; + +write: + write_sysreg(val, cpacr_el1); +} + +static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (!guest_owns_fp_regs()) + __activate_traps_fpsimd32(vcpu); + + if (has_vhe() || has_hvhe()) + __activate_cptr_traps_vhe(vcpu); + else + __activate_cptr_traps_nvhe(vcpu); +} + +static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPTR_NVHE_EL2_RES1; + + if (!cpus_have_final_cap(ARM64_SVE)) + val |= CPTR_EL2_TZ; + if (!cpus_have_final_cap(ARM64_SME)) + val |= CPTR_EL2_TSM; + + write_sysreg(val, cptr_el2); +} + +static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu) +{ + u64 val = CPACR_EL1_FPEN; + + if (cpus_have_final_cap(ARM64_SVE)) + val |= CPACR_EL1_ZEN; + if (cpus_have_final_cap(ARM64_SME)) + val |= CPACR_EL1_SMEN; + + write_sysreg(val, cpacr_el1); +} + +static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) +{ + if (has_vhe() || has_hvhe()) + __deactivate_cptr_traps_vhe(vcpu); + else + __deactivate_cptr_traps_nvhe(vcpu); +} + #define reg_to_fgt_masks(reg) \ ({ \ struct fgt_masks *m; \ diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 73affe1333a4..0e752b515d0f 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks; extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPTR_EL2_TAM; /* Same bit irrespective of E2H */ - - if (!guest_owns_fp_regs()) - __activate_traps_fpsimd32(vcpu); - - if (has_hvhe()) { - val |= CPACR_EL1_TTA; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } - - write_sysreg(val, cpacr_el1); - } else { - val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1; - - /* - * Always trap SME since it's not supported in KVM. - * TSM is RES1 if SME isn't implemented. - */ - val |= CPTR_EL2_TSM; - - if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs()) - val |= CPTR_EL2_TZ; - - if (!guest_owns_fp_regs()) - val |= CPTR_EL2_TFP; - - write_sysreg(val, cptr_el2); - } -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - if (has_hvhe()) { - u64 val = CPACR_EL1_FPEN; - - if (cpus_have_final_cap(ARM64_SVE)) - val |= CPACR_EL1_ZEN; - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN; - - write_sysreg(val, cpacr_el1); - } else { - u64 val = CPTR_NVHE_EL2_RES1; - - if (!cpus_have_final_cap(ARM64_SVE)) - val |= CPTR_EL2_TZ; - if (!cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; - - write_sysreg(val, cptr_el2); - } -} - static void __activate_traps(struct kvm_vcpu *vcpu) { ___activate_traps(vcpu, vcpu->arch.hcr_el2); diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 20df44b49ceb..32b4814eb2b7 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu) return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE); } -static void __activate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 cptr; - - /* - * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to - * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2, - * except for some missing controls, such as TAM. - * In this case, CPTR_EL2.TAM has the same position with or without - * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM - * shift value for trapping the AMU accesses. - */ - u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM; - - if (guest_owns_fp_regs()) { - val |= CPACR_EL1_FPEN; - if (vcpu_has_sve(vcpu)) - val |= CPACR_EL1_ZEN; - } else { - __activate_traps_fpsimd32(vcpu); - } - - if (!vcpu_has_nv(vcpu)) - goto write; - - /* - * The architecture is a bit crap (what a surprise): an EL2 guest - * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA, - * as they are RES0 in the guest's view. To work around it, trap the - * sucker using the very same bit it can't set... - */ - if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu)) - val |= CPTR_EL2_TCPAC; - - /* - * Layer the guest hypervisor's trap configuration on top of our own if - * we're in a nested context. - */ - if (is_hyp_ctxt(vcpu)) - goto write; - - cptr = vcpu_sanitised_cptr_el2(vcpu); - - /* - * Pay attention, there's some interesting detail here. - * - * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two - * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest): - * - * - CPTR_EL2.xEN = x0, traps are enabled - * - CPTR_EL2.xEN = x1, traps are disabled - * - * In other words, bit[0] determines if guest accesses trap or not. In - * the interest of simplicity, clear the entire field if the guest - * hypervisor has traps enabled to dispel any illusion of something more - * complicated taking place. - */ - if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_FPEN; - if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0))) - val &= ~CPACR_EL1_ZEN; - - if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP)) - val |= cptr & CPACR_EL1_E0POE; - - val |= cptr & CPTR_EL2_TCPAC; - -write: - write_sysreg(val, cpacr_el1); -} - -static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu) -{ - u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN; - - if (cpus_have_final_cap(ARM64_SME)) - val |= CPACR_EL1_SMEN_EL1EN; - - write_sysreg(val, cpacr_el1); -} - static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; -- cgit v1.2.3 From 59e6e101a6fa542a365dd5858affd18ba3e84cb8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:15 +0100 Subject: KVM: arm64: Remove ad-hoc CPTR manipulation from fpsimd_sve_sync() There's no need for fpsimd_sve_sync() to write to CPTR/CPACR. All relevant traps are always disabled earlier within __kvm_vcpu_run(), when __deactivate_cptr_traps() configures CPTR/CPACR. With irrelevant details elided, the flow is: handle___kvm_vcpu_run(...) { flush_hyp_vcpu(...) { fpsimd_sve_flush(...); } __kvm_vcpu_run(...) { __activate_traps(...) { __activate_cptr_traps(...); } do { __guest_enter(...); } while (...); __deactivate_traps(....) { __deactivate_cptr_traps(...); } } sync_hyp_vcpu(...) { fpsimd_sve_sync(...); } } Remove the unnecessary write to CPTR/CPACR. An ISB is still necessary, so a comment is added to describe this requirement. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-5-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index e9198e56e784..3206b2c07f82 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu) if (!guest_owns_fp_regs()) return; - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); + /* + * Traps have been disabled by __deactivate_cptr_traps(), but there + * hasn't necessarily been a context synchronization event yet. + */ isb(); if (vcpu_has_sve(vcpu)) -- cgit v1.2.3 From 186b58bacd74d9b7892869f7c7d20cf865a3c237 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:16 +0100 Subject: KVM: arm64: Remove ad-hoc CPTR manipulation from kvm_hyp_handle_fpsimd() The hyp code FPSIMD/SVE/SME trap handling logic has some rather messy open-coded manipulation of CPTR/CPACR. This is benign for non-nested guests, but broken for nested guests, as the guest hypervisor's CPTR configuration is not taken into account. Consider the case where L0 provides FPSIMD+SVE to an L1 guest hypervisor, and the L1 guest hypervisor only provides FPSIMD to an L2 guest (with L1 configuring CPTR/CPACR to trap SVE usage from L2). If the L2 guest triggers an FPSIMD trap to the L0 hypervisor, kvm_hyp_handle_fpsimd() will see that the vCPU supports FPSIMD+SVE, and will configure CPTR/CPACR to NOT trap FPSIMD+SVE before returning to the L2 guest. Consequently the L2 guest would be able to manipulate SVE state even though the L1 hypervisor had configured CPTR/CPACR to forbid this. Clean this up, and fix the nested virt issue by always using __deactivate_cptr_traps() and __activate_cptr_traps() to manage the CPTR traps. This removes the need for the ad-hoc fixup in kvm_hyp_save_fpsimd_host(), and ensures that any guest hypervisor configuration of CPTR/CPACR is taken into account. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-6-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/include/hyp/switch.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 8a77fcccbcf6..2ad57b117385 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -616,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu) */ if (system_supports_sve()) { __hyp_sve_save_host(); - - /* Re-enable SVE traps if not supported for the guest vcpu. */ - if (!vcpu_has_sve(vcpu)) - cpacr_clear_set(CPACR_EL1_ZEN, 0); - } else { __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs)); } @@ -671,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve())) - cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN); - else - cpacr_clear_set(0, CPACR_EL1_FPEN); + __deactivate_cptr_traps(vcpu); isb(); /* Write out the host state if it's in the registers */ @@ -696,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED; + /* + * Re-enable traps necessary for the current state of the guest, e.g. + * those enabled by a guest hypervisor. The ERET to the guest will + * provide the necessary context synchronization. + */ + __activate_cptr_traps(vcpu); + return true; } -- cgit v1.2.3 From 3a300a33e4063fb44c7887bec3aecd2fd6966df8 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:17 +0100 Subject: KVM: arm64: Remove cpacr_clear_set() We no longer use cpacr_clear_set(). Remove cpacr_clear_set() and its helper functions. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-7-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_emulate.h | 62 ------------------------------------ 1 file changed, 62 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index bd020fc28aa9..0720898f563e 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu) vcpu_set_flag((v), e); \ } while (0) -#define __build_check_all_or_none(r, bits) \ - BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits)) - -#define __cpacr_to_cptr_clr(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((set) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((set) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((set) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((clr) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((clr) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((clr) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define __cpacr_to_cptr_set(clr, set) \ - ({ \ - u64 cptr = 0; \ - \ - if ((clr) & CPACR_EL1_FPEN) \ - cptr |= CPTR_EL2_TFP; \ - if ((clr) & CPACR_EL1_ZEN) \ - cptr |= CPTR_EL2_TZ; \ - if ((clr) & CPACR_EL1_SMEN) \ - cptr |= CPTR_EL2_TSM; \ - if ((set) & CPACR_EL1_TTA) \ - cptr |= CPTR_EL2_TTA; \ - if ((set) & CPTR_EL2_TAM) \ - cptr |= CPTR_EL2_TAM; \ - if ((set) & CPTR_EL2_TCPAC) \ - cptr |= CPTR_EL2_TCPAC; \ - \ - cptr; \ - }) - -#define cpacr_clear_set(clr, set) \ - do { \ - BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0); \ - BUILD_BUG_ON((clr) & CPACR_EL1_E0POE); \ - __build_check_all_or_none((clr), CPACR_EL1_FPEN); \ - __build_check_all_or_none((set), CPACR_EL1_FPEN); \ - __build_check_all_or_none((clr), CPACR_EL1_ZEN); \ - __build_check_all_or_none((set), CPACR_EL1_ZEN); \ - __build_check_all_or_none((clr), CPACR_EL1_SMEN); \ - __build_check_all_or_none((set), CPACR_EL1_SMEN); \ - \ - if (has_vhe() || has_hvhe()) \ - sysreg_clear_set(cpacr_el1, clr, set); \ - else \ - sysreg_clear_set(cptr_el2, \ - __cpacr_to_cptr_clr(clr, set), \ - __cpacr_to_cptr_set(clr, set));\ - } while (0) - /* * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE * format if E2H isn't set. -- cgit v1.2.3 From 04c5355b2a94ff3191ce63ab035fb7f04d036869 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 17 Jun 2025 14:37:18 +0100 Subject: KVM: arm64: VHE: Centralize ISBs when returning to host The VHE hyp code has recently gained a few ISBs. Simplify this to one unconditional ISB in __kvm_vcpu_run_vhe(), and remove the unnecessary ISB from the kvm_call_hyp_ret() macro. While kvm_call_hyp_ret() is also used to invoke __vgic_v3_get_gic_config(), but no ISB is necessary in that case either. For the moment, an ISB is left in kvm_call_hyp(), as there are many more users, and removing the ISB would require a more thorough audit. Suggested-by: Marc Zyngier Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Fuad Tabba Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Will Deacon Link: https://lore.kernel.org/r/20250617133718.4014181-8-mark.rutland@arm.com Signed-off-by: Marc Zyngier --- arch/arm64/include/asm/kvm_host.h | 6 ++---- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 --- arch/arm64/kvm/hyp/vhe/switch.c | 25 ++++++++++++------------- 3 files changed, 14 insertions(+), 20 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 5ccca509dff1..d27079968341 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm); }) /* - * The couple of isb() below are there to guarantee the same behaviour - * on VHE as on !VHE, where the eret to EL1 acts as a context - * synchronization event. + * The isb() below is there to guarantee the same behaviour on VHE as on !VHE, + * where the eret to EL1 acts as a context synchronization event. */ #define kvm_call_hyp(f, ...) \ do { \ @@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm); \ if (has_vhe()) { \ ret = f(__VA_ARGS__); \ - isb(); \ } else { \ ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__); \ } \ diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 73881e1dc267..502a5b73ee70 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -167,9 +167,6 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu) __debug_save_state(guest_dbg, guest_ctxt); __debug_restore_state(host_dbg, host_ctxt); - - if (has_vhe()) - isb(); } #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */ diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 32b4814eb2b7..477f1580ffea 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -558,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) host_ctxt = host_data_ptr(host_ctxt); guest_ctxt = &vcpu->arch.ctxt; - sysreg_save_host_state_vhe(host_ctxt); - fpsimd_lazy_switch_to_guest(vcpu); + sysreg_save_host_state_vhe(host_ctxt); + /* * Note that ARM erratum 1165522 requires us to configure both stage 1 * and stage 2 translation for the guest context before we clear @@ -586,18 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) __deactivate_traps(vcpu); - /* Ensure CPTR trap deactivation has taken effect */ + sysreg_restore_host_state_vhe(host_ctxt); + + __debug_switch_to_host(vcpu); + + /* + * Ensure that all system register writes above have taken effect + * before returning to the host. In VHE mode, CPTR traps for + * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be + * manipulated after the ISB. + */ isb(); fpsimd_lazy_switch_to_host(vcpu); - sysreg_restore_host_state_vhe(host_ctxt); - if (guest_owns_fp_regs()) __fpsimd_save_fpexc32(vcpu); - __debug_switch_to_host(vcpu); - return exit_code; } NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe); @@ -627,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) */ local_daif_restore(DAIF_PROCCTX_NOIRQ); - /* - * When we exit from the guest we change a number of CPU configuration - * parameters, such as traps. We rely on the isb() in kvm_call_hyp*() - * to make sure these changes take effect before running the host or - * additional guests. - */ return ret; } -- cgit v1.2.3 From b5aafcb4efd2bdacbc37753cf807d69faa6a7304 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:19 +0800 Subject: KVM: TDX: Add new TDVMCALL status code for unsupported subfuncs Add the new TDVMCALL status code TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED and return it for unimplemented TDVMCALL subfunctions. Returning TDVMCALL_STATUS_INVALID_OPERAND when a subfunction is not implemented is vague because TDX guests can't tell the error is due to the subfunction is not supported or an invalid input of the subfunction. New GHCI spec adds TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED to avoid the ambiguity. Use it instead of TDVMCALL_STATUS_INVALID_OPERAND. Before the change, for common guest implementations, when a TDX guest receives TDVMCALL_STATUS_INVALID_OPERAND, it has two cases: 1. Some operand is invalid. It could change the operand to another value retry. 2. The subfunction is not supported. For case 1, an invalid operand usually means the guest implementation bug. Since the TDX guest can't tell which case is, the best practice for handling TDVMCALL_STATUS_INVALID_OPERAND is stopping calling such leaf, treating the failure as fatal if the TDVMCALL is essential or ignoring it if the TDVMCALL is optional. With this change, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED could be sent to old TDX guest that do not know about it, but it is expected that the guest will make the same action as TDVMCALL_STATUS_INVALID_OPERAND. Currently, no known TDX guest checks TDVMCALL_STATUS_INVALID_OPERAND specifically; for example Linux just checks for success. Signed-off-by: Binbin Wu [Return it for untrapped KVM_HC_MAP_GPA_RANGE. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/shared/tdx.h | 1 + arch/x86/kvm/vmx/tdx.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 2f3820342598..d8525e6ef50a 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -80,6 +80,7 @@ #define TDVMCALL_STATUS_RETRY 0x0000000000000001ULL #define TDVMCALL_STATUS_INVALID_OPERAND 0x8000000000000000ULL #define TDVMCALL_STATUS_ALIGN_ERROR 0x8000000000000002ULL +#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED 0x8000000000000003ULL /* * Bitmasks of exposed registers (with VMM). diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b952bc673271..5d100c240ab3 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1212,11 +1212,13 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu) /* * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE - * bit set. If not, the error code is not defined in GHCI for TDX, use - * TDVMCALL_STATUS_INVALID_OPERAND for this case. + * bit set. This is a base call so it should always be supported, but + * KVM has no way to ensure that userspace implements the GHCI correctly. + * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error + * to the guest. */ if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) { - ret = TDVMCALL_STATUS_INVALID_OPERAND; + ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; goto error; } @@ -1476,7 +1478,7 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) break; } - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED); return 1; } -- cgit v1.2.3 From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:20 +0800 Subject: KVM: TDX: Handle TDG.VP.VMCALL Handle TDVMCALL for GetQuote to generate a TD-Quote. GetQuote is a doorbell-like interface used by TDX guests to request VMM to generate a TD-Quote signed by a service hosting TD-Quoting Enclave operating on the host. A TDX guest passes a TD Report (TDREPORT_STRUCT) in a shared-memory area as parameter. Host VMM can access it and queue the operation for a service hosting TD-Quoting enclave. When completed, the Quote is returned via the same shared-memory area. KVM only checks the GPA from the TDX guest has the shared-bit set and drops the shared-bit before exiting to userspace to avoid bleeding the shared-bit into KVM's exit ABI. KVM forwards the request to userspace VMM (e.g. QEMU) and userspace VMM queues the operation asynchronously. KVM sets the return code according to the 'ret' field set by userspace to notify the TDX guest whether the request has been queued successfully or not. When the request has been queued successfully, the TDX guest can poll the status field in the shared-memory area to check whether the Quote generation is completed or not. When completed, the generated Quote is returned via the same buffer. Add KVM_EXIT_TDX as a new exit reason to userspace. Userspace is required to handle the KVM exit reason as the initial support for TDX, by reentering KVM to ensure that the TDVMCALL is complete. While at it, add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN. Signed-off-by: Binbin Wu Tested-by: Mikko Ylinen Acked-by: Kai Huang [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 5d100c240ab3..b619a3478983 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) return 1; } +static int tdx_complete_simple(struct kvm_vcpu *vcpu) +{ + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret); + return 1; +} + +static int tdx_get_quote(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + u64 gpa = tdx->vp_enter_args.r12; + u64 size = tdx->vp_enter_args.r13; + + /* The gpa of buffer must have shared bit set. */ + if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) { + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE; + vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED; + vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm)); + vcpu->run->tdx.get_quote.size = size; + + vcpu->arch.complete_userspace_io = tdx_complete_simple; + + return 0; +} + static int handle_tdvmcall(struct kvm_vcpu *vcpu) { switch (tdvmcall_leaf(vcpu)) { @@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu) return tdx_report_fatal_error(vcpu); case TDVMCALL_GET_TD_VM_CALL_INFO: return tdx_get_td_vm_call_info(vcpu); + case TDVMCALL_GET_QUOTE: + return tdx_get_quote(vcpu); default: break; } -- cgit v1.2.3 From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Tue, 10 Jun 2025 10:14:21 +0800 Subject: KVM: TDX: Exit to userspace for GetTdVmCallInfo Exit to userspace for TDG.VP.VMCALL via KVM_EXIT_TDX, to allow userspace to provide information about the support of TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API. GHCI spec defines the GHCI base TDVMCALLs: , , , , <#VE.RequestMMIO>, , , and . They must be supported by VMM to support TDX guests. For GetTdVmCallInfo - When leaf (r12) to enumerate TDVMCALL functionality is set to 0, successful execution indicates all GHCI base TDVMCALLs listed above are supported. Update the KVM TDX document with the set of the GHCI base APIs. - When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it indicates the TDX guest is querying the supported TDVMCALLs beyond the GHCI base TDVMCALLs. Exit to userspace to let userspace set the TDVMCALL sub-function bit(s) accordingly to the leaf outputs. KVM could set the TDVMCALL bit(s) supported by itself when the TDVMCALLs don't need support from userspace after returning from userspace and before entering guest. Currently, no such TDVMCALLs implemented, KVM just sets the values returned from userspace. Suggested-by: Paolo Bonzini Signed-off-by: Binbin Wu [Adjust userspace API. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/tdx.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index b619a3478983..1ad20c273f3b 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1451,18 +1451,53 @@ error: return 1; } +static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_tdx *tdx = to_tdx(vcpu); + + tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret); + + /* + * For now, there is no TDVMCALL beyond GHCI base API supported by KVM + * directly without the support from userspace, just set the value + * returned from userspace. + */ + tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11; + tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12; + tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13; + tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14; + + return 1; +} + static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu) { struct vcpu_tdx *tdx = to_tdx(vcpu); - if (tdx->vp_enter_args.r12) - tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); - else { + switch (tdx->vp_enter_args.r12) { + case 0: tdx->vp_enter_args.r11 = 0; + tdx->vp_enter_args.r12 = 0; tdx->vp_enter_args.r13 = 0; tdx->vp_enter_args.r14 = 0; + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS); + return 1; + case 1: + vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12; + vcpu->run->exit_reason = KVM_EXIT_TDX; + vcpu->run->tdx.flags = 0; + vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO; + vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS; + vcpu->run->tdx.get_tdvmcall_info.r11 = 0; + vcpu->run->tdx.get_tdvmcall_info.r12 = 0; + vcpu->run->tdx.get_tdvmcall_info.r13 = 0; + vcpu->run->tdx.get_tdvmcall_info.r14 = 0; + vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info; + return 0; + default: + tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND); + return 1; } - return 1; } static int tdx_complete_simple(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From c55c7a85e02a7bfee20a3ffebdff7cbeb41613ef Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:25 +0800 Subject: um: ubd: Add missing error check in start_io_thread() The subsequent call to os_set_fd_block() overwrites the previous return value. OR the two return values together to fix it. Fixes: f88f0bdfc32f ("um: UBD Improvements") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index c5e6545f6fcf..8e8a8bf518b6 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -41,7 +41,7 @@ int start_io_thread(struct os_helper_thread **td_out, int *fd_out) *fd_out = fds[1]; err = os_set_fd_block(*fd_out, 0); - err = os_set_fd_block(kernel_fd, 0); + err |= os_set_fd_block(kernel_fd, 0); if (err) { printk("start_io_thread - failed to set nonblocking I/O.\n"); goto out_close; -- cgit v1.2.3 From bc4e2ae08183d9017f43bf88aa7cfdf84e76f573 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:27 +0800 Subject: um: vfio: Prevent duplicate device assignments Ensure devices are assigned only once. Reject subsequent requests for duplicate assignments. Fixes: a0e2cb6a9063 ("um: Add VFIO-based virtual PCI driver") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-4-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vfio_kern.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch') diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c index b51fc9888ae1..13b971a2bd43 100644 --- a/arch/um/drivers/vfio_kern.c +++ b/arch/um/drivers/vfio_kern.c @@ -570,6 +570,17 @@ static void uml_vfio_release_device(struct uml_vfio_device *dev) kfree(dev); } +static struct uml_vfio_device *uml_vfio_find_device(const char *device) +{ + struct uml_vfio_device *dev; + + list_for_each_entry(dev, ¨_vfio_devices, list) { + if (!strcmp(dev->name, device)) + return dev; + } + return NULL; +} + static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp) { struct uml_vfio_device *dev; @@ -582,6 +593,9 @@ static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *k uml_vfio_container.fd = fd; } + if (uml_vfio_find_device(device)) + return -EEXIST; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; -- cgit v1.2.3 From 8948941276024fcfb08fdb3d678867dc1f7b1c16 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 6 Jun 2025 20:44:28 +0800 Subject: um: Use correct data source in fpregs_legacy_set() Read from the buffer pointed to by 'from' instead of '&buf', as 'buf' contains no valid data when 'ubuf' is NULL. Fixes: b1e1bd2e6943 ("um: Add helper functions to get/set state for SECCOMP") Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250606124428.148164-5-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/x86/um/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c index 3275870330fe..fae8aabad10f 100644 --- a/arch/x86/um/ptrace.c +++ b/arch/x86/um/ptrace.c @@ -161,7 +161,7 @@ static int fpregs_legacy_set(struct task_struct *target, from = kbuf; } - return um_fxsr_from_i387(fxsave, &buf); + return um_fxsr_from_i387(fxsave, from); } #endif -- cgit v1.2.3 From 2d65fc13be85c336c56af7077f08ccd3a3a15a4a Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 23 Jun 2025 19:08:29 +0800 Subject: um: vector: Reduce stack usage in vector_eth_configure() When compiling with clang (19.1.7), initializing *vp using a compound literal may result in excessive stack usage. Fix it by initializing the required fields of *vp individually. Without this patch: $ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0 ... 0x0000000000000540 vector_eth_configure [vector_kern.o]:1472 ... With this patch: $ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0 ... 0x0000000000000540 vector_eth_configure [vector_kern.o]:208 ... Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506221017.WtB7Usua-lkp@intel.com/ Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250623110829.314864-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/vector_kern.c | 42 +++++++++++++----------------------------- 1 file changed, 13 insertions(+), 29 deletions(-) (limited to 'arch') diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index f292e0b4ff8b..9bbbddfe866b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1625,35 +1625,19 @@ static void vector_eth_configure( device->dev = dev; - *vp = ((struct vector_private) - { - .list = LIST_HEAD_INIT(vp->list), - .dev = dev, - .unit = n, - .options = get_transport_options(def), - .rx_irq = 0, - .tx_irq = 0, - .parsed = def, - .max_packet = get_mtu(def) + ETH_HEADER_OTHER, - /* TODO - we need to calculate headroom so that ip header - * is 16 byte aligned all the time - */ - .headroom = get_headroom(def), - .form_header = NULL, - .verify_header = NULL, - .header_rxbuffer = NULL, - .header_txbuffer = NULL, - .header_size = 0, - .rx_header_size = 0, - .rexmit_scheduled = false, - .opened = false, - .transport_data = NULL, - .in_write_poll = false, - .coalesce = 2, - .req_size = get_req_size(def), - .in_error = false, - .bpf = NULL - }); + INIT_LIST_HEAD(&vp->list); + vp->dev = dev; + vp->unit = n; + vp->options = get_transport_options(def); + vp->parsed = def; + vp->max_packet = get_mtu(def) + ETH_HEADER_OTHER; + /* + * TODO - we need to calculate headroom so that ip header + * is 16 byte aligned all the time + */ + vp->headroom = get_headroom(def); + vp->coalesce = 2; + vp->req_size = get_req_size(def); dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST); INIT_WORK(&vp->reset_tx, vector_reset_tx); -- cgit v1.2.3