From 89079520cef65d6da1e864eab4464effe5396e23 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Fri, 11 Apr 2025 10:46:00 +0800
Subject: RISC-V: vDSO: Wire up getrandom() vDSO implementation

Hook up the generic vDSO implementation to the generic vDSO getrandom
implementation by providing the required __arch_chacha20_blocks_nostack
and getrandom_syscall implementations. Also wire up the selftests.

The benchmark result:

	vdso: 25000000 times in 2.466341333 seconds
	libc: 25000000 times in 41.447720005 seconds
	syscall: 25000000 times in 41.043926672 seconds

	vdso: 25000000 x 256 times in 162.286219353 seconds
	libc: 25000000 x 256 times in 2953.855018685 seconds
	syscall: 25000000 x 256 times in 2796.268546000 seconds

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Link: https://lore.kernel.org/r/20250411024600.16045-1-xry111@xry111.site
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig                         |   1 +
 arch/riscv/include/asm/vdso/getrandom.h    |  30 ++++
 arch/riscv/kernel/vdso/Makefile            |  12 ++
 arch/riscv/kernel/vdso/getrandom.c         |  10 ++
 arch/riscv/kernel/vdso/vdso.lds.S          |   1 +
 arch/riscv/kernel/vdso/vgetrandom-chacha.S | 244 +++++++++++++++++++++++++++++
 6 files changed, 298 insertions(+)
 create mode 100644 arch/riscv/include/asm/vdso/getrandom.h
 create mode 100644 arch/riscv/kernel/vdso/getrandom.c
 create mode 100644 arch/riscv/kernel/vdso/vgetrandom-chacha.S

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bbec87b79309..fbca724302ab 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -218,6 +218,7 @@ config RISCV
 	select THREAD_INFO_IN_TASK
 	select TRACE_IRQFLAGS_SUPPORT
 	select UACCESS_MEMCPY if !MMU
+	select VDSO_GETRANDOM if HAVE_GENERIC_VDSO
 	select USER_STACKTRACE_SUPPORT
 	select ZONE_DMA32 if 64BIT
 
diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..8dc92441702a
--- /dev/null
+++ b/arch/riscv/include/asm/vdso/getrandom.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/unistd.h>
+
+static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
+{
+	register long ret asm("a0");
+	register long nr asm("a7") = __NR_getrandom;
+	register void *buffer asm("a0") = _buffer;
+	register size_t len asm("a1") = _len;
+	register unsigned int flags asm("a2") = _flags;
+
+	asm volatile ("ecall\n"
+		      : "+r" (ret)
+		      : "r" (nr), "r" (buffer), "r" (len), "r" (flags)
+		      : "memory");
+
+	return ret;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index ad73607abc28..7575ef088adc 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -13,9 +13,17 @@ vdso-syms += flush_icache
 vdso-syms += hwprobe
 vdso-syms += sys_hwprobe
 
+ifdef CONFIG_VDSO_GETRANDOM
+vdso-syms += getrandom
+endif
+
 # Files to link into the vdso
 obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o
 
+ifdef CONFIG_VDSO_GETRANDOM
+obj-vdso += vgetrandom-chacha.o
+endif
+
 ccflags-y := -fno-stack-protector
 ccflags-y += -DDISABLE_BRANCH_PROFILING
 ccflags-y += -fno-builtin
@@ -24,6 +32,10 @@ ifneq ($(c-gettimeofday-y),)
   CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y)
 endif
 
+ifneq ($(c-getrandom-y),)
+  CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y)
+endif
+
 CFLAGS_hwprobe.o += -fPIC
 
 # Build rules
diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c
new file mode 100644
index 000000000000..f21922e8cebd
--- /dev/null
+++ b/arch/riscv/kernel/vdso/getrandom.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+	return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}
diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S
index 8e86965a8aae..abc69cda0445 100644
--- a/arch/riscv/kernel/vdso/vdso.lds.S
+++ b/arch/riscv/kernel/vdso/vdso.lds.S
@@ -80,6 +80,7 @@ VERSION
 #ifndef COMPAT_VDSO
 		__vdso_riscv_hwprobe;
 #endif
+		__vdso_getrandom;
 	local: *;
 	};
 }
diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..d793cadc78a6
--- /dev/null
+++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S
@@ -0,0 +1,244 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ *
+ * Based on arch/loongarch/vdso/vgetrandom-chacha.S.
+ */
+
+#include <asm/asm.h>
+#include <linux/linkage.h>
+
+.text
+
+.macro	ROTRI	rd rs imm
+	slliw	t0, \rs, 32 - \imm
+	srliw	\rd, \rs, \imm
+	or	\rd, \rd, t0
+.endm
+
+.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
+	\op	\d0, \d0, \s0
+	\op	\d1, \d1, \s1
+	\op	\d2, \d2, \s2
+	\op	\d3, \d3, \s3
+.endm
+
+/*
+ *	a0: output bytes
+ * 	a1: 32-byte key input
+ *	a2: 8-byte counter input/output
+ *	a3: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+#define output		a0
+#define key		a1
+#define counter		a2
+#define nblocks		a3
+#define i		a4
+#define state0		s0
+#define state1		s1
+#define state2		s2
+#define state3		s3
+#define state4		s4
+#define state5		s5
+#define state6		s6
+#define state7		s7
+#define state8		s8
+#define state9		s9
+#define state10		s10
+#define state11		s11
+#define state12		a5
+#define state13		a6
+#define state14		a7
+#define state15		t1
+#define cnt		t2
+#define copy0		t3
+#define copy1		t4
+#define copy2		t5
+#define copy3		t6
+
+/* Packs to be used with OP_4REG */
+#define line0		state0, state1, state2, state3
+#define line1		state4, state5, state6, state7
+#define line2		state8, state9, state10, state11
+#define line3		state12, state13, state14, state15
+
+#define line1_perm	state5, state6, state7, state4
+#define line2_perm	state10, state11, state8, state9
+#define line3_perm	state15, state12, state13, state14
+
+#define copy		copy0, copy1, copy2, copy3
+
+#define _16		16, 16, 16, 16
+#define _20		20, 20, 20, 20
+#define _24		24, 24, 24, 24
+#define _25		25, 25, 25, 25
+
+	addi		sp, sp, -12*SZREG
+	REG_S		s0,         (sp)
+	REG_S		s1,    SZREG(sp)
+	REG_S		s2,  2*SZREG(sp)
+	REG_S		s3,  3*SZREG(sp)
+	REG_S		s4,  4*SZREG(sp)
+	REG_S		s5,  5*SZREG(sp)
+	REG_S		s6,  6*SZREG(sp)
+	REG_S		s7,  7*SZREG(sp)
+	REG_S		s8,  8*SZREG(sp)
+	REG_S		s9,  9*SZREG(sp)
+	REG_S		s10, 10*SZREG(sp)
+	REG_S		s11, 11*SZREG(sp)
+
+	ld		cnt, (counter)
+
+	li		copy0, 0x61707865
+	li		copy1, 0x3320646e
+	li		copy2, 0x79622d32
+	li		copy3, 0x6b206574
+
+.Lblock:
+	/* state[0,1,2,3] = "expand 32-byte k" */
+	mv		state0, copy0
+	mv		state1, copy1
+	mv		state2, copy2
+	mv		state3, copy3
+
+	/* state[4,5,..,11] = key */
+	lw		state4,   (key)
+	lw		state5,  4(key)
+	lw		state6,  8(key)
+	lw		state7,  12(key)
+	lw		state8,  16(key)
+	lw		state9,  20(key)
+	lw		state10, 24(key)
+	lw		state11, 28(key)
+
+	/* state[12,13] = counter */
+	mv		state12, cnt
+	srli		state13, cnt, 32
+
+	/* state[14,15] = 0 */
+	mv		state14, zero
+	mv		state15, zero
+
+	li		i, 10
+.Lpermute:
+	/* odd round */
+	OP_4REG	addw	line0, line1
+	OP_4REG	xor	line3, line0
+	OP_4REG	ROTRI	line3, _16
+
+	OP_4REG	addw	line2, line3
+	OP_4REG	xor	line1, line2
+	OP_4REG	ROTRI	line1, _20
+
+	OP_4REG	addw	line0, line1
+	OP_4REG	xor	line3, line0
+	OP_4REG	ROTRI	line3, _24
+
+	OP_4REG	addw	line2, line3
+	OP_4REG	xor	line1, line2
+	OP_4REG	ROTRI	line1, _25
+
+	/* even round */
+	OP_4REG	addw	line0, line1_perm
+	OP_4REG	xor	line3_perm, line0
+	OP_4REG	ROTRI	line3_perm, _16
+
+	OP_4REG	addw	line2_perm, line3_perm
+	OP_4REG	xor	line1_perm, line2_perm
+	OP_4REG	ROTRI	line1_perm, _20
+
+	OP_4REG	addw	line0, line1_perm
+	OP_4REG	xor	line3_perm, line0
+	OP_4REG	ROTRI	line3_perm, _24
+
+	OP_4REG	addw	line2_perm, line3_perm
+	OP_4REG	xor	line1_perm, line2_perm
+	OP_4REG	ROTRI	line1_perm, _25
+
+	addi		i, i, -1
+	bnez		i, .Lpermute
+
+	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
+	OP_4REG	addw	line0, copy
+	sw		state0,   (output)
+	sw		state1,  4(output)
+	sw		state2,  8(output)
+	sw		state3, 12(output)
+
+	/* from now on state[0,1,2,3] are scratch registers  */
+
+	/* state[0,1,2,3] = lo(key) */
+	lw		state0,   (key)
+	lw		state1,  4(key)
+	lw		state2,  8(key)
+	lw		state3, 12(key)
+
+	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
+	OP_4REG	addw	line1, line0
+	sw		state4, 16(output)
+	sw		state5, 20(output)
+	sw		state6, 24(output)
+	sw		state7, 28(output)
+
+	/* state[0,1,2,3] = hi(key) */
+	lw		state0, 16(key)
+	lw		state1, 20(key)
+	lw		state2, 24(key)
+	lw		state3, 28(key)
+
+	/* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */
+	OP_4REG	addw	line2, line0
+	sw		state8,  32(output)
+	sw		state9,  36(output)
+	sw		state10, 40(output)
+	sw		state11, 44(output)
+
+	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
+	addw		state12, state12, cnt
+	srli		state0, cnt, 32
+	addw		state13, state13, state0
+	sw		state12, 48(output)
+	sw		state13, 52(output)
+	sw		state14, 56(output)
+	sw		state15, 60(output)
+
+	/* ++counter */
+	addi		cnt, cnt, 1
+
+	/* output += 64 */
+	addi		output, output, 64
+	/* --nblocks */
+	addi		nblocks, nblocks, -1
+	bnez		nblocks, .Lblock
+
+	/* counter = [cnt_lo, cnt_hi] */
+	sd		cnt, (counter)
+
+	/* Zero out the potentially sensitive regs, in case nothing uses these
+	 * again.  As at now copy[0,1,2,3] just contains "expand 32-byte k" and
+	 * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we
+	 * only need to zero state[12,...,15].
+	 */
+	mv		state12, zero
+	mv		state13, zero
+	mv		state14, zero
+	mv		state15, zero
+
+	REG_L		s0,         (sp)
+	REG_L		s1,    SZREG(sp)
+	REG_L		s2,  2*SZREG(sp)
+	REG_L		s3,  3*SZREG(sp)
+	REG_L		s4,  4*SZREG(sp)
+	REG_L		s5,  5*SZREG(sp)
+	REG_L		s6,  6*SZREG(sp)
+	REG_L		s7,  7*SZREG(sp)
+	REG_L		s8,  8*SZREG(sp)
+	REG_L		s9,  9*SZREG(sp)
+	REG_L		s10, 10*SZREG(sp)
+	REG_L		s11, 11*SZREG(sp)
+	addi		sp, sp, 12*SZREG
+
+	ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
-- 
cgit v1.2.3


From bafa451a96d0f1404aa1a5a267f78767a55fac71 Mon Sep 17 00:00:00 2001
From: Joel Stanley <joel@jms.id.au>
Date: Tue, 15 Apr 2025 21:58:31 +0930
Subject: riscv: defconfig: Remove EXPERT

The setting of EXPERT is a leftover from when the riscv defconfig was
first added. As mentioned in the EXPERT Kconfig help text it is not
intended to be set in the usual case.

Upon removal a bunch of intrusive debug-related kernel options are no
longer set, which is good. A few may want to come back in the future but
let those be advocated for on a case by case basis.

NAMESPACES, SYSFS_SYSCALL and MEDIA_SUPPORT_FILTER default on and thus
fall out of the defconfig.

Set VIDEO_CADENCE_CSI2RX=y to ensure VIDEO_CADENCE_CSI2RX stays enabled.

Set DEBUG_KERNEL=y in line with other arch defconfigs. This turns on
tracing.

Signed-off-by: Joel Stanley <joel@jms.id.au>
Link: https://lore.kernel.org/r/20250415122832.982610-1-joel@jms.id.au
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/configs/defconfig | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig
index 3c8e16d71e17..286f490ead37 100644
--- a/arch/riscv/configs/defconfig
+++ b/arch/riscv/configs/defconfig
@@ -18,12 +18,9 @@ CONFIG_CGROUP_DEVICE=y
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
 CONFIG_CGROUP_BPF=y
-CONFIG_NAMESPACES=y
 CONFIG_USER_NS=y
 CONFIG_CHECKPOINT_RESTORE=y
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_EXPERT=y
-# CONFIG_SYSFS_SYSCALL is not set
 CONFIG_PROFILING=y
 CONFIG_ARCH_MICROCHIP=y
 CONFIG_ARCH_SIFIVE=y
@@ -181,6 +178,7 @@ CONFIG_REGULATOR_FIXED_VOLTAGE=y
 CONFIG_REGULATOR_AXP20X=y
 CONFIG_REGULATOR_GPIO=y
 CONFIG_MEDIA_SUPPORT=m
+CONFIG_MEDIA_PLATFORM_SUPPORT=y
 CONFIG_VIDEO_CADENCE_CSI2RX=m
 CONFIG_DRM=m
 CONFIG_DRM_RADEON=m
@@ -294,25 +292,7 @@ CONFIG_DEFAULT_SECURITY_DAC=y
 CONFIG_CRYPTO_USER_API_HASH=y
 CONFIG_CRYPTO_DEV_VIRTIO=y
 CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_FS=y
-CONFIG_DEBUG_PAGEALLOC=y
-CONFIG_SCHED_STACK_END_CHECK=y
-CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_VM_PGFLAGS=y
-CONFIG_DEBUG_MEMORY_INIT=y
-CONFIG_DEBUG_PER_CPU_MAPS=y
-CONFIG_SOFTLOCKUP_DETECTOR=y
-CONFIG_WQ_WATCHDOG=y
-CONFIG_DEBUG_RT_MUTEXES=y
-CONFIG_DEBUG_SPINLOCK=y
-CONFIG_DEBUG_MUTEXES=y
-CONFIG_DEBUG_RWSEMS=y
-CONFIG_DEBUG_ATOMIC_SLEEP=y
-CONFIG_DEBUG_LIST=y
-CONFIG_DEBUG_PLIST=y
-CONFIG_DEBUG_SG=y
-# CONFIG_RCU_TRACE is not set
-CONFIG_RCU_EQS_DEBUG=y
-# CONFIG_FTRACE is not set
 # CONFIG_RUNTIME_TESTING_MENU is not set
 CONFIG_MEMTEST=y
-- 
cgit v1.2.3


From 96e0b355883006554a0bee3697da475971d6bba8 Mon Sep 17 00:00:00 2001
From: Ross Stutterheim <ross.stutterheim@garmin.com>
Date: Wed, 16 Apr 2025 14:50:06 +0100
Subject: ARM: 9447/1: arm/memremap: fix arch_memremap_can_ram_remap()

arm/memremap: fix arch_memremap_can_ram_remap()

commit 260364d112bc ("arm[64]/memremap: don't abuse pfn_valid() to ensure
presence of linear map") added the definition of
arch_memremap_can_ram_remap() for arm[64] specific filtering of what pages
can be used from the linear mapping. memblock_is_map_memory() was called
with the pfn of the address given to arch_memremap_can_ram_remap();
however, memblock_is_map_memory() expects to be given an address for arm,
not a pfn.

This results in calls to memremap() returning a newly mapped area when
it should return an address in the existing linear mapping.

Fix this by removing the address to pfn translation and pass the
address directly.

Fixes: 260364d112bc ("arm[64]/memremap: don't abuse pfn_valid() to ensure presence of linear map")
Signed-off-by: Ross Stutterheim <ross.stutterheim@garmin.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: stable@vger.kernel.org
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 arch/arm/mm/ioremap.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c
index 748698e91a4b..27e64f782cb3 100644
--- a/arch/arm/mm/ioremap.c
+++ b/arch/arm/mm/ioremap.c
@@ -515,7 +515,5 @@ void __init early_ioremap_init(void)
 bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
 				 unsigned long flags)
 {
-	unsigned long pfn = PHYS_PFN(offset);
-
-	return memblock_is_map_memory(pfn);
+	return memblock_is_map_memory(offset);
 }
-- 
cgit v1.2.3


From a2c6c1c23bed3506fd87e00c88540ac47832c867 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Mon, 7 Apr 2025 10:03:21 +0300
Subject: x86/PCI: Drop 'pci' suffix from intel_mid_pci.c

CE4100 PCI specific code has no 'pci' suffix in the filename,
intel_mid_pci.c is the only one that duplicates the folder name in its
filename, drop that redundancy.

While at it, group the respective modules in the Makefile.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Link: https://patch.msgid.link/20250407070321.3761063-1-andriy.shevchenko@linux.intel.com
---
 arch/x86/pci/Makefile        |   6 +-
 arch/x86/pci/intel_mid.c     | 406 +++++++++++++++++++++++++++++++++++++++++++
 arch/x86/pci/intel_mid_pci.c | 406 -------------------------------------------
 3 files changed, 409 insertions(+), 409 deletions(-)
 create mode 100644 arch/x86/pci/intel_mid.c
 delete mode 100644 arch/x86/pci/intel_mid_pci.c

(limited to 'arch')

diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 4933fb337983..c1efd5b0d198 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -8,13 +8,13 @@ obj-$(CONFIG_PCI_OLPC)		+= olpc.o
 obj-$(CONFIG_PCI_XEN)		+= xen.o
 
 obj-y				+= fixup.o
-obj-$(CONFIG_X86_INTEL_CE)      += ce4100.o
 obj-$(CONFIG_ACPI)		+= acpi.o
 obj-y				+= legacy.o irq.o
 
-obj-$(CONFIG_X86_NUMACHIP)	+= numachip.o
+obj-$(CONFIG_X86_INTEL_CE)	+= ce4100.o
+obj-$(CONFIG_X86_INTEL_MID)	+= intel_mid.o
 
-obj-$(CONFIG_X86_INTEL_MID)	+= intel_mid_pci.o
+obj-$(CONFIG_X86_NUMACHIP)	+= numachip.o
 
 obj-y				+= common.o early.o
 obj-y				+= bus_numa.o
diff --git a/arch/x86/pci/intel_mid.c b/arch/x86/pci/intel_mid.c
new file mode 100644
index 000000000000..b433b1753016
--- /dev/null
+++ b/arch/x86/pci/intel_mid.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel MID PCI support
+ *   Copyright (c) 2008 Intel Corporation
+ *     Jesse Barnes <jesse.barnes@intel.com>
+ *
+ * Moorestown has an interesting PCI implementation:
+ *   - configuration space is memory mapped (as defined by MCFG)
+ *   - Lincroft devices also have a real, type 1 configuration space
+ *   - Early Lincroft silicon has a type 1 access bug that will cause
+ *     a hang if non-existent devices are accessed
+ *   - some devices have the "fixed BAR" capability, which means
+ *     they can't be relocated or modified; check for that during
+ *     BAR sizing
+ *
+ * So, we use the MCFG space for all reads and writes, but also send
+ * Lincroft writes to type 1 space.  But only read/write if the device
+ * actually exists, otherwise return all 1s for reads and bit bucket
+ * the writes.
+ */
+
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/dmi.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/smp.h>
+
+#include <asm/cpu_device_id.h>
+#include <asm/segment.h>
+#include <asm/pci_x86.h>
+#include <asm/hw_irq.h>
+#include <asm/io_apic.h>
+#include <asm/intel-family.h>
+#include <asm/intel-mid.h>
+#include <asm/acpi.h>
+
+#define PCIE_CAP_OFFSET	0x100
+
+/* Quirks for the listed devices */
+#define PCI_DEVICE_ID_INTEL_MRFLD_MMC	0x1190
+#define PCI_DEVICE_ID_INTEL_MRFLD_HSU	0x1191
+
+/* Fixed BAR fields */
+#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00	/* Fixed BAR (TBD) */
+#define PCI_FIXED_BAR_0_SIZE	0x04
+#define PCI_FIXED_BAR_1_SIZE	0x08
+#define PCI_FIXED_BAR_2_SIZE	0x0c
+#define PCI_FIXED_BAR_3_SIZE	0x10
+#define PCI_FIXED_BAR_4_SIZE	0x14
+#define PCI_FIXED_BAR_5_SIZE	0x1c
+
+static int pci_soc_mode;
+
+/**
+ * fixed_bar_cap - return the offset of the fixed BAR cap if found
+ * @bus: PCI bus
+ * @devfn: device in question
+ *
+ * Look for the fixed BAR cap on @bus and @devfn, returning its offset
+ * if found or 0 otherwise.
+ */
+static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
+{
+	int pos;
+	u32 pcie_cap = 0, cap_data;
+
+	pos = PCIE_CAP_OFFSET;
+
+	if (!raw_pci_ext_ops)
+		return 0;
+
+	while (pos) {
+		if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+					  devfn, pos, 4, &pcie_cap))
+			return 0;
+
+		if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 ||
+			PCI_EXT_CAP_ID(pcie_cap) == 0xffff)
+			break;
+
+		if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
+			raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+					      devfn, pos + 4, 4, &cap_data);
+			if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
+				return pos;
+		}
+
+		pos = PCI_EXT_CAP_NEXT(pcie_cap);
+	}
+
+	return 0;
+}
+
+static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
+				   int reg, int len, u32 val, int offset)
+{
+	u32 size;
+	unsigned int domain, busnum;
+	int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
+
+	domain = pci_domain_nr(bus);
+	busnum = bus->number;
+
+	if (val == ~0 && len == 4) {
+		unsigned long decode;
+
+		raw_pci_ext_ops->read(domain, busnum, devfn,
+			       offset + 8 + (bar * 4), 4, &size);
+
+		/* Turn the size into a decode pattern for the sizing code */
+		if (size) {
+			decode = size - 1;
+			decode |= decode >> 1;
+			decode |= decode >> 2;
+			decode |= decode >> 4;
+			decode |= decode >> 8;
+			decode |= decode >> 16;
+			decode++;
+			decode = ~(decode - 1);
+		} else {
+			decode = 0;
+		}
+
+		/*
+		 * If val is all ones, the core code is trying to size the reg,
+		 * so update the mmconfig space with the real size.
+		 *
+		 * Note: this assumes the fixed size we got is a power of two.
+		 */
+		return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
+				       decode);
+	}
+
+	/* This is some other kind of BAR write, so just do it. */
+	return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
+}
+
+/**
+ * type1_access_ok - check whether to use type 1
+ * @bus: bus number
+ * @devfn: device & function in question
+ * @reg: configuration register offset
+ *
+ * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
+ * all, the we can go ahead with any reads & writes.  If it's on a Lincroft,
+ * but doesn't exist, avoid the access altogether to keep the chip from
+ * hanging.
+ */
+static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
+{
+	/*
+	 * This is a workaround for A0 LNC bug where PCI status register does
+	 * not have new CAP bit set. can not be written by SW either.
+	 *
+	 * PCI header type in real LNC indicates a single function device, this
+	 * will prevent probing other devices under the same function in PCI
+	 * shim. Therefore, use the header type in shim instead.
+	 */
+	if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
+		return false;
+	if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
+				|| devfn == PCI_DEVFN(0, 0)
+				|| devfn == PCI_DEVFN(3, 0)))
+		return true;
+	return false; /* Langwell on others */
+}
+
+static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
+		    int size, u32 *value)
+{
+	if (type1_access_ok(bus->number, devfn, where))
+		return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
+					devfn, where, size, value);
+	return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
+			      devfn, where, size, value);
+}
+
+static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
+		     int size, u32 value)
+{
+	int offset;
+
+	/*
+	 * On MRST, there is no PCI ROM BAR, this will cause a subsequent read
+	 * to ROM BAR return 0 then being ignored.
+	 */
+	if (where == PCI_ROM_ADDRESS)
+		return 0;
+
+	/*
+	 * Devices with fixed BARs need special handling:
+	 *   - BAR sizing code will save, write ~0, read size, restore
+	 *   - so writes to fixed BARs need special handling
+	 *   - other writes to fixed BAR devices should go through mmconfig
+	 */
+	offset = fixed_bar_cap(bus, devfn);
+	if (offset &&
+	    (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
+		return pci_device_update_fixed(bus, devfn, where, size, value,
+					       offset);
+	}
+
+	/*
+	 * On Moorestown update both real & mmconfig space
+	 * Note: early Lincroft silicon can't handle type 1 accesses to
+	 *       non-existent devices, so just eat the write in that case.
+	 */
+	if (type1_access_ok(bus->number, devfn, where))
+		return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
+					      devfn, where, size, value);
+	return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
+			       where, size, value);
+}
+
+static const struct x86_cpu_id intel_mid_cpu_ids[] = {
+	X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, NULL),
+	{}
+};
+
+static int intel_mid_pci_irq_enable(struct pci_dev *dev)
+{
+	const struct x86_cpu_id *id;
+	struct irq_alloc_info info;
+	bool polarity_low;
+	u16 model = 0;
+	int ret;
+	u8 gsi;
+
+	if (dev->irq_managed && dev->irq > 0)
+		return 0;
+
+	ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+	if (ret) {
+		dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret);
+		return pcibios_err_to_errno(ret);
+	}
+
+	id = x86_match_cpu(intel_mid_cpu_ids);
+	if (id)
+		model = id->model;
+
+	switch (model) {
+	case VFM_MODEL(INTEL_ATOM_SILVERMONT_MID):
+		polarity_low = false;
+
+		/* Special treatment for IRQ0 */
+		if (gsi == 0) {
+			/*
+			 * Skip HS UART common registers device since it has
+			 * IRQ0 assigned and not used by the kernel.
+			 */
+			if (dev->device == PCI_DEVICE_ID_INTEL_MRFLD_HSU)
+				return -EBUSY;
+			/*
+			 * TNG has IRQ0 assigned to eMMC controller. But there
+			 * are also other devices with bogus PCI configuration
+			 * that have IRQ0 assigned. This check ensures that
+			 * eMMC gets it. The rest of devices still could be
+			 * enabled without interrupt line being allocated.
+			 */
+			if (dev->device != PCI_DEVICE_ID_INTEL_MRFLD_MMC)
+				return 0;
+		}
+		break;
+	default:
+		polarity_low = true;
+		break;
+	}
+
+	ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low);
+
+	/*
+	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
+	 * IOAPIC RTE entries, so we just enable RTE for the device.
+	 */
+	ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+	if (ret < 0)
+		return ret;
+
+	dev->irq = ret;
+	dev->irq_managed = 1;
+
+	return 0;
+}
+
+static void intel_mid_pci_irq_disable(struct pci_dev *dev)
+{
+	if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+	    dev->irq > 0) {
+		mp_unmap_irq(dev->irq);
+		dev->irq_managed = 0;
+	}
+}
+
+static const struct pci_ops intel_mid_pci_ops __initconst = {
+	.read = pci_read,
+	.write = pci_write,
+};
+
+/**
+ * intel_mid_pci_init - installs intel_mid_pci_ops
+ *
+ * Moorestown has an interesting PCI implementation (see above).
+ * Called when the early platform detection installs it.
+ */
+int __init intel_mid_pci_init(void)
+{
+	pr_info("Intel MID platform detected, using MID PCI ops\n");
+	pci_mmcfg_late_init();
+	pcibios_enable_irq = intel_mid_pci_irq_enable;
+	pcibios_disable_irq = intel_mid_pci_irq_disable;
+	pci_root_ops = intel_mid_pci_ops;
+	pci_soc_mode = 1;
+	/* Continue with standard init */
+	acpi_noirq_set();
+	return 1;
+}
+
+/*
+ * Langwell devices are not true PCI devices; they are not subject to 10 ms
+ * d3 to d0 delay required by PCI spec.
+ */
+static void pci_d3delay_fixup(struct pci_dev *dev)
+{
+	/*
+	 * PCI fixups are effectively decided compile time. If we have a dual
+	 * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices.
+	 */
+	if (!pci_soc_mode)
+		return;
+	/*
+	 * True PCI devices in Lincroft should allow type 1 access, the rest
+	 * are Langwell fake PCI devices.
+	 */
+	if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
+		return;
+	dev->d3hot_delay = 0;
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
+
+static void mid_power_off_one_device(struct pci_dev *dev)
+{
+	u16 pmcsr;
+
+	/*
+	 * Update current state first, otherwise PCI core enforces PCI_D0 in
+	 * pci_set_power_state() for devices which status was PCI_UNKNOWN.
+	 */
+	pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
+	dev->current_state = (pci_power_t __force)(pmcsr & PCI_PM_CTRL_STATE_MASK);
+
+	pci_set_power_state(dev, PCI_D3hot);
+}
+
+static void mid_power_off_devices(struct pci_dev *dev)
+{
+	int id;
+
+	if (!pci_soc_mode)
+		return;
+
+	id = intel_mid_pwr_get_lss_id(dev);
+	if (id < 0)
+		return;
+
+	/*
+	 * This sets only PMCSR bits. The actual power off will happen in
+	 * arch/x86/platform/intel-mid/pwr.c.
+	 */
+	mid_power_off_one_device(dev);
+}
+
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, mid_power_off_devices);
+
+/*
+ * Langwell devices reside at fixed offsets, don't try to move them.
+ */
+static void pci_fixed_bar_fixup(struct pci_dev *dev)
+{
+	unsigned long offset;
+	u32 size;
+	int i;
+
+	if (!pci_soc_mode)
+		return;
+
+	/* Must have extended configuration space */
+	if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
+		return;
+
+	/* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
+	offset = fixed_bar_cap(dev->bus, dev->devfn);
+	if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
+	    PCI_DEVFN(2, 2) == dev->devfn)
+		return;
+
+	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
+		pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
+		dev->resource[i].end = dev->resource[i].start + size - 1;
+		dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
+	}
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
deleted file mode 100644
index b433b1753016..000000000000
--- a/arch/x86/pci/intel_mid_pci.c
+++ /dev/null
@@ -1,406 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Intel MID PCI support
- *   Copyright (c) 2008 Intel Corporation
- *     Jesse Barnes <jesse.barnes@intel.com>
- *
- * Moorestown has an interesting PCI implementation:
- *   - configuration space is memory mapped (as defined by MCFG)
- *   - Lincroft devices also have a real, type 1 configuration space
- *   - Early Lincroft silicon has a type 1 access bug that will cause
- *     a hang if non-existent devices are accessed
- *   - some devices have the "fixed BAR" capability, which means
- *     they can't be relocated or modified; check for that during
- *     BAR sizing
- *
- * So, we use the MCFG space for all reads and writes, but also send
- * Lincroft writes to type 1 space.  But only read/write if the device
- * actually exists, otherwise return all 1s for reads and bit bucket
- * the writes.
- */
-
-#include <linux/sched.h>
-#include <linux/pci.h>
-#include <linux/ioport.h>
-#include <linux/init.h>
-#include <linux/dmi.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/smp.h>
-
-#include <asm/cpu_device_id.h>
-#include <asm/segment.h>
-#include <asm/pci_x86.h>
-#include <asm/hw_irq.h>
-#include <asm/io_apic.h>
-#include <asm/intel-family.h>
-#include <asm/intel-mid.h>
-#include <asm/acpi.h>
-
-#define PCIE_CAP_OFFSET	0x100
-
-/* Quirks for the listed devices */
-#define PCI_DEVICE_ID_INTEL_MRFLD_MMC	0x1190
-#define PCI_DEVICE_ID_INTEL_MRFLD_HSU	0x1191
-
-/* Fixed BAR fields */
-#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00	/* Fixed BAR (TBD) */
-#define PCI_FIXED_BAR_0_SIZE	0x04
-#define PCI_FIXED_BAR_1_SIZE	0x08
-#define PCI_FIXED_BAR_2_SIZE	0x0c
-#define PCI_FIXED_BAR_3_SIZE	0x10
-#define PCI_FIXED_BAR_4_SIZE	0x14
-#define PCI_FIXED_BAR_5_SIZE	0x1c
-
-static int pci_soc_mode;
-
-/**
- * fixed_bar_cap - return the offset of the fixed BAR cap if found
- * @bus: PCI bus
- * @devfn: device in question
- *
- * Look for the fixed BAR cap on @bus and @devfn, returning its offset
- * if found or 0 otherwise.
- */
-static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
-{
-	int pos;
-	u32 pcie_cap = 0, cap_data;
-
-	pos = PCIE_CAP_OFFSET;
-
-	if (!raw_pci_ext_ops)
-		return 0;
-
-	while (pos) {
-		if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
-					  devfn, pos, 4, &pcie_cap))
-			return 0;
-
-		if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 ||
-			PCI_EXT_CAP_ID(pcie_cap) == 0xffff)
-			break;
-
-		if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
-			raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
-					      devfn, pos + 4, 4, &cap_data);
-			if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
-				return pos;
-		}
-
-		pos = PCI_EXT_CAP_NEXT(pcie_cap);
-	}
-
-	return 0;
-}
-
-static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
-				   int reg, int len, u32 val, int offset)
-{
-	u32 size;
-	unsigned int domain, busnum;
-	int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
-
-	domain = pci_domain_nr(bus);
-	busnum = bus->number;
-
-	if (val == ~0 && len == 4) {
-		unsigned long decode;
-
-		raw_pci_ext_ops->read(domain, busnum, devfn,
-			       offset + 8 + (bar * 4), 4, &size);
-
-		/* Turn the size into a decode pattern for the sizing code */
-		if (size) {
-			decode = size - 1;
-			decode |= decode >> 1;
-			decode |= decode >> 2;
-			decode |= decode >> 4;
-			decode |= decode >> 8;
-			decode |= decode >> 16;
-			decode++;
-			decode = ~(decode - 1);
-		} else {
-			decode = 0;
-		}
-
-		/*
-		 * If val is all ones, the core code is trying to size the reg,
-		 * so update the mmconfig space with the real size.
-		 *
-		 * Note: this assumes the fixed size we got is a power of two.
-		 */
-		return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
-				       decode);
-	}
-
-	/* This is some other kind of BAR write, so just do it. */
-	return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
-}
-
-/**
- * type1_access_ok - check whether to use type 1
- * @bus: bus number
- * @devfn: device & function in question
- * @reg: configuration register offset
- *
- * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
- * all, the we can go ahead with any reads & writes.  If it's on a Lincroft,
- * but doesn't exist, avoid the access altogether to keep the chip from
- * hanging.
- */
-static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
-{
-	/*
-	 * This is a workaround for A0 LNC bug where PCI status register does
-	 * not have new CAP bit set. can not be written by SW either.
-	 *
-	 * PCI header type in real LNC indicates a single function device, this
-	 * will prevent probing other devices under the same function in PCI
-	 * shim. Therefore, use the header type in shim instead.
-	 */
-	if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
-		return false;
-	if (bus == 0 && (devfn == PCI_DEVFN(2, 0)
-				|| devfn == PCI_DEVFN(0, 0)
-				|| devfn == PCI_DEVFN(3, 0)))
-		return true;
-	return false; /* Langwell on others */
-}
-
-static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
-		    int size, u32 *value)
-{
-	if (type1_access_ok(bus->number, devfn, where))
-		return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
-					devfn, where, size, value);
-	return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
-			      devfn, where, size, value);
-}
-
-static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
-		     int size, u32 value)
-{
-	int offset;
-
-	/*
-	 * On MRST, there is no PCI ROM BAR, this will cause a subsequent read
-	 * to ROM BAR return 0 then being ignored.
-	 */
-	if (where == PCI_ROM_ADDRESS)
-		return 0;
-
-	/*
-	 * Devices with fixed BARs need special handling:
-	 *   - BAR sizing code will save, write ~0, read size, restore
-	 *   - so writes to fixed BARs need special handling
-	 *   - other writes to fixed BAR devices should go through mmconfig
-	 */
-	offset = fixed_bar_cap(bus, devfn);
-	if (offset &&
-	    (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
-		return pci_device_update_fixed(bus, devfn, where, size, value,
-					       offset);
-	}
-
-	/*
-	 * On Moorestown update both real & mmconfig space
-	 * Note: early Lincroft silicon can't handle type 1 accesses to
-	 *       non-existent devices, so just eat the write in that case.
-	 */
-	if (type1_access_ok(bus->number, devfn, where))
-		return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
-					      devfn, where, size, value);
-	return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
-			       where, size, value);
-}
-
-static const struct x86_cpu_id intel_mid_cpu_ids[] = {
-	X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_MID, NULL),
-	{}
-};
-
-static int intel_mid_pci_irq_enable(struct pci_dev *dev)
-{
-	const struct x86_cpu_id *id;
-	struct irq_alloc_info info;
-	bool polarity_low;
-	u16 model = 0;
-	int ret;
-	u8 gsi;
-
-	if (dev->irq_managed && dev->irq > 0)
-		return 0;
-
-	ret = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
-	if (ret) {
-		dev_warn(&dev->dev, "Failed to read interrupt line: %d\n", ret);
-		return pcibios_err_to_errno(ret);
-	}
-
-	id = x86_match_cpu(intel_mid_cpu_ids);
-	if (id)
-		model = id->model;
-
-	switch (model) {
-	case VFM_MODEL(INTEL_ATOM_SILVERMONT_MID):
-		polarity_low = false;
-
-		/* Special treatment for IRQ0 */
-		if (gsi == 0) {
-			/*
-			 * Skip HS UART common registers device since it has
-			 * IRQ0 assigned and not used by the kernel.
-			 */
-			if (dev->device == PCI_DEVICE_ID_INTEL_MRFLD_HSU)
-				return -EBUSY;
-			/*
-			 * TNG has IRQ0 assigned to eMMC controller. But there
-			 * are also other devices with bogus PCI configuration
-			 * that have IRQ0 assigned. This check ensures that
-			 * eMMC gets it. The rest of devices still could be
-			 * enabled without interrupt line being allocated.
-			 */
-			if (dev->device != PCI_DEVICE_ID_INTEL_MRFLD_MMC)
-				return 0;
-		}
-		break;
-	default:
-		polarity_low = true;
-		break;
-	}
-
-	ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity_low);
-
-	/*
-	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
-	 * IOAPIC RTE entries, so we just enable RTE for the device.
-	 */
-	ret = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
-	if (ret < 0)
-		return ret;
-
-	dev->irq = ret;
-	dev->irq_managed = 1;
-
-	return 0;
-}
-
-static void intel_mid_pci_irq_disable(struct pci_dev *dev)
-{
-	if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
-	    dev->irq > 0) {
-		mp_unmap_irq(dev->irq);
-		dev->irq_managed = 0;
-	}
-}
-
-static const struct pci_ops intel_mid_pci_ops __initconst = {
-	.read = pci_read,
-	.write = pci_write,
-};
-
-/**
- * intel_mid_pci_init - installs intel_mid_pci_ops
- *
- * Moorestown has an interesting PCI implementation (see above).
- * Called when the early platform detection installs it.
- */
-int __init intel_mid_pci_init(void)
-{
-	pr_info("Intel MID platform detected, using MID PCI ops\n");
-	pci_mmcfg_late_init();
-	pcibios_enable_irq = intel_mid_pci_irq_enable;
-	pcibios_disable_irq = intel_mid_pci_irq_disable;
-	pci_root_ops = intel_mid_pci_ops;
-	pci_soc_mode = 1;
-	/* Continue with standard init */
-	acpi_noirq_set();
-	return 1;
-}
-
-/*
- * Langwell devices are not true PCI devices; they are not subject to 10 ms
- * d3 to d0 delay required by PCI spec.
- */
-static void pci_d3delay_fixup(struct pci_dev *dev)
-{
-	/*
-	 * PCI fixups are effectively decided compile time. If we have a dual
-	 * SoC/non-SoC kernel we don't want to mangle d3 on non-SoC devices.
-	 */
-	if (!pci_soc_mode)
-		return;
-	/*
-	 * True PCI devices in Lincroft should allow type 1 access, the rest
-	 * are Langwell fake PCI devices.
-	 */
-	if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID))
-		return;
-	dev->d3hot_delay = 0;
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup);
-
-static void mid_power_off_one_device(struct pci_dev *dev)
-{
-	u16 pmcsr;
-
-	/*
-	 * Update current state first, otherwise PCI core enforces PCI_D0 in
-	 * pci_set_power_state() for devices which status was PCI_UNKNOWN.
-	 */
-	pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-	dev->current_state = (pci_power_t __force)(pmcsr & PCI_PM_CTRL_STATE_MASK);
-
-	pci_set_power_state(dev, PCI_D3hot);
-}
-
-static void mid_power_off_devices(struct pci_dev *dev)
-{
-	int id;
-
-	if (!pci_soc_mode)
-		return;
-
-	id = intel_mid_pwr_get_lss_id(dev);
-	if (id < 0)
-		return;
-
-	/*
-	 * This sets only PMCSR bits. The actual power off will happen in
-	 * arch/x86/platform/intel-mid/pwr.c.
-	 */
-	mid_power_off_one_device(dev);
-}
-
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, mid_power_off_devices);
-
-/*
- * Langwell devices reside at fixed offsets, don't try to move them.
- */
-static void pci_fixed_bar_fixup(struct pci_dev *dev)
-{
-	unsigned long offset;
-	u32 size;
-	int i;
-
-	if (!pci_soc_mode)
-		return;
-
-	/* Must have extended configuration space */
-	if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
-		return;
-
-	/* Fixup the BAR sizes for fixed BAR devices and make them unmoveable */
-	offset = fixed_bar_cap(dev->bus, dev->devfn);
-	if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
-	    PCI_DEVFN(2, 2) == dev->devfn)
-		return;
-
-	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
-		pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
-		dev->resource[i].end = dev->resource[i].start + size - 1;
-		dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
-	}
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
-- 
cgit v1.2.3


From 61a74ad254628ccd9e88838c3c622885dfb6c588 Mon Sep 17 00:00:00 2001
From: Nylon Chen <nylon.chen@sifive.com>
Date: Fri, 11 Apr 2025 15:38:50 +0800
Subject: riscv: misaligned: fix sleeping function called during misaligned
 access handling

Use copy_from_user_nofault() and copy_to_user_nofault() instead of
copy_from/to_user functions in the misaligned access trap handlers.

The following bug report was found when executing misaligned memory
accesses:

BUG: sleeping function called from invalid context at ./include/linux/uaccess.h:162
in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 115, name: two
preempt_count: 0, expected: 0
CPU: 0 UID: 0 PID: 115 Comm: two Not tainted 6.14.0-rc5 #24
Hardware name: riscv-virtio,qemu (DT)
Call Trace:
 [<ffffffff800160ea>] dump_backtrace+0x1c/0x24
 [<ffffffff80002304>] show_stack+0x28/0x34
 [<ffffffff80010fae>] dump_stack_lvl+0x4a/0x68
 [<ffffffff80010fe0>] dump_stack+0x14/0x1c
 [<ffffffff8004e44e>] __might_resched+0xfa/0x104
 [<ffffffff8004e496>] __might_sleep+0x3e/0x62
 [<ffffffff801963c4>] __might_fault+0x1c/0x24
 [<ffffffff80425352>] _copy_from_user+0x28/0xaa
 [<ffffffff8000296c>] handle_misaligned_store+0x204/0x254
 [<ffffffff809eae82>] do_trap_store_misaligned+0x24/0xee
 [<ffffffff809f4f1a>] handle_exception+0x146/0x152

Fixes: b686ecdeacf6 ("riscv: misaligned: Restrict user access to kernel memory")
Fixes: 441381506ba7 ("riscv: misaligned: remove CONFIG_RISCV_M_MODE specific code")

Signed-off-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Nylon Chen <nylon.chen@sifive.com>
Link: https://lore.kernel.org/r/20250411073850.3699180-3-nylon.chen@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/traps_misaligned.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 4354c87c0376..0173ed80818c 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -441,7 +441,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs)
 
 	val.data_u64 = 0;
 	if (user_mode(regs)) {
-		if (copy_from_user(&val, (u8 __user *)addr, len))
+		if (copy_from_user_nofault(&val, (u8 __user *)addr, len))
 			return -1;
 	} else {
 		memcpy(&val, (u8 *)addr, len);
@@ -539,7 +539,7 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs)
 		return -EOPNOTSUPP;
 
 	if (user_mode(regs)) {
-		if (copy_to_user((u8 __user *)addr, &val, len))
+		if (copy_to_user_nofault((u8 __user *)addr, &val, len))
 			return -1;
 	} else {
 		memcpy((u8 *)addr, &val, len);
-- 
cgit v1.2.3


From 7b30b1b04e0d04e56e848c3b9c2952c30de05af9 Mon Sep 17 00:00:00 2001
From: Nylon Chen <nylon.chen@sifive.com>
Date: Fri, 11 Apr 2025 15:38:49 +0800
Subject: riscv: misaligned: Add handling for ZCB instructions

Add support for the Zcb extension's compressed half-word instructions
(C.LHU, C.LH, and C.SH) in the RISC-V misaligned access trap handler.

Signed-off-by: Zong Li <zong.li@sifive.com>
Signed-off-by: Nylon Chen <nylon.chen@sifive.com>
Link: https://lore.kernel.org/r/20250411073850.3699180-2-nylon.chen@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/traps_misaligned.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 0173ed80818c..7443632445d4 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -88,6 +88,13 @@
 #define INSN_MATCH_C_FSWSP		0xe002
 #define INSN_MASK_C_FSWSP		0xe003
 
+#define INSN_MATCH_C_LHU		0x8400
+#define INSN_MASK_C_LHU			0xfc43
+#define INSN_MATCH_C_LH			0x8440
+#define INSN_MASK_C_LH			0xfc43
+#define INSN_MATCH_C_SH			0x8c00
+#define INSN_MASK_C_SH			0xfc43
+
 #define INSN_LEN(insn)			((((insn) & 0x3) < 0x3) ? 2 : 4)
 
 #if defined(CONFIG_64BIT)
@@ -431,6 +438,13 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs)
 		fp = 1;
 		len = 4;
 #endif
+	} else if ((insn & INSN_MASK_C_LHU) == INSN_MATCH_C_LHU) {
+		len = 2;
+		insn = RVC_RS2S(insn) << SH_RD;
+	} else if ((insn & INSN_MASK_C_LH) == INSN_MATCH_C_LH) {
+		len = 2;
+		shift = 8 * (sizeof(ulong) - len);
+		insn = RVC_RS2S(insn) << SH_RD;
 	} else {
 		regs->epc = epc;
 		return -1;
@@ -530,6 +544,9 @@ static int handle_scalar_misaligned_store(struct pt_regs *regs)
 		len = 4;
 		val.data_ulong = GET_F32_RS2C(insn, regs);
 #endif
+	} else if ((insn & INSN_MASK_C_SH) == INSN_MATCH_C_SH) {
+		len = 2;
+		val.data_ulong = GET_RS2S(insn, regs);
 	} else {
 		regs->epc = epc;
 		return -1;
-- 
cgit v1.2.3


From 55ba5375ba1c45404a917aed4f772da9a82fe723 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 7 Apr 2025 09:25:07 +0200
Subject: MIPS: rb532: gpio: use new line value setter callbacks

struct gpio_chip now has callbacks for setting line values that return
an integer, allowing to indicate failures. Convert the driver to using
them.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/rb532/gpio.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/rb532/gpio.c b/arch/mips/rb532/gpio.c
index ea6ebfea4a67..0e47cd59b6cb 100644
--- a/arch/mips/rb532/gpio.c
+++ b/arch/mips/rb532/gpio.c
@@ -105,13 +105,15 @@ static int rb532_gpio_get(struct gpio_chip *chip, unsigned offset)
 /*
  * Set output GPIO level
  */
-static void rb532_gpio_set(struct gpio_chip *chip,
-				unsigned offset, int value)
+static int rb532_gpio_set(struct gpio_chip *chip, unsigned int offset,
+			  int value)
 {
 	struct rb532_gpio_chip	*gpch;
 
 	gpch = gpiochip_get_data(chip);
 	rb532_set_bit(value, offset, gpch->regbase + GPIOD);
+
+	return 0;
 }
 
 /*
@@ -162,7 +164,7 @@ static struct rb532_gpio_chip rb532_gpio_chip[] = {
 			.direction_input	= rb532_gpio_direction_input,
 			.direction_output	= rb532_gpio_direction_output,
 			.get			= rb532_gpio_get,
-			.set			= rb532_gpio_set,
+			.set_rv			= rb532_gpio_set,
 			.to_irq			= rb532_gpio_to_irq,
 			.base			= 0,
 			.ngpio			= 32,
-- 
cgit v1.2.3


From 64f3322bea9404d2dd1e8bcdc5fb025044e48d96 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 7 Apr 2025 09:25:08 +0200
Subject: MIPS: bcm63xx: gpio: use new line value setter callbacks

struct gpio_chip now has callbacks for setting line values that return
an integer, allowing to indicate failures. Convert the driver to using
them.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/bcm63xx/gpio.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/bcm63xx/gpio.c b/arch/mips/bcm63xx/gpio.c
index 5c4a233db55f..e7a53cd0dec5 100644
--- a/arch/mips/bcm63xx/gpio.c
+++ b/arch/mips/bcm63xx/gpio.c
@@ -35,8 +35,7 @@ static void bcm63xx_gpio_out_low_reg_init(void)
 static DEFINE_SPINLOCK(bcm63xx_gpio_lock);
 static u32 gpio_out_low, gpio_out_high;
 
-static void bcm63xx_gpio_set(struct gpio_chip *chip,
-			     unsigned gpio, int val)
+static int bcm63xx_gpio_set(struct gpio_chip *chip, unsigned int gpio, int val)
 {
 	u32 reg;
 	u32 mask;
@@ -62,6 +61,8 @@ static void bcm63xx_gpio_set(struct gpio_chip *chip,
 		*v &= ~mask;
 	bcm_gpio_writel(*v, reg);
 	spin_unlock_irqrestore(&bcm63xx_gpio_lock, flags);
+
+	return 0;
 }
 
 static int bcm63xx_gpio_get(struct gpio_chip *chip, unsigned gpio)
@@ -130,7 +131,7 @@ static struct gpio_chip bcm63xx_gpio_chip = {
 	.direction_input	= bcm63xx_gpio_direction_input,
 	.direction_output	= bcm63xx_gpio_direction_output,
 	.get			= bcm63xx_gpio_get,
-	.set			= bcm63xx_gpio_set,
+	.set_rv			= bcm63xx_gpio_set,
 	.base			= 0,
 };
 
-- 
cgit v1.2.3


From 68bdc4dc1130ec5a3d9fdc1b4d90cdc9e39a8792 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 7 Apr 2025 09:25:09 +0200
Subject: MIPS: alchemy: gpio: use new line value setter callbacks

struct gpio_chip now has callbacks for setting line values that return
an integer, allowing to indicate failures. Convert the driver to using
them.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/alchemy/common/gpiolib.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/alchemy/common/gpiolib.c b/arch/mips/alchemy/common/gpiolib.c
index 1b16daaa86ae..411f70ceb762 100644
--- a/arch/mips/alchemy/common/gpiolib.c
+++ b/arch/mips/alchemy/common/gpiolib.c
@@ -119,9 +119,11 @@ static int alchemy_gpic_get(struct gpio_chip *chip, unsigned int off)
 	return !!au1300_gpio_get_value(off + AU1300_GPIO_BASE);
 }
 
-static void alchemy_gpic_set(struct gpio_chip *chip, unsigned int off, int v)
+static int alchemy_gpic_set(struct gpio_chip *chip, unsigned int off, int v)
 {
 	au1300_gpio_set_value(off + AU1300_GPIO_BASE, v);
+
+	return 0;
 }
 
 static int alchemy_gpic_dir_input(struct gpio_chip *chip, unsigned int off)
@@ -145,7 +147,7 @@ static struct gpio_chip au1300_gpiochip = {
 	.direction_input	= alchemy_gpic_dir_input,
 	.direction_output	= alchemy_gpic_dir_output,
 	.get			= alchemy_gpic_get,
-	.set			= alchemy_gpic_set,
+	.set_rv			= alchemy_gpic_set,
 	.to_irq			= alchemy_gpic_gpio_to_irq,
 	.base			= AU1300_GPIO_BASE,
 	.ngpio			= AU1300_GPIO_NUM,
-- 
cgit v1.2.3


From 37022f745b58436e7d3c3c924c78d78a139b00e7 Mon Sep 17 00:00:00 2001
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Date: Mon, 7 Apr 2025 09:25:10 +0200
Subject: MIPS: txx9: gpio: use new line value setter callbacks

struct gpio_chip now has callbacks for setting line values that return
an integer, allowing to indicate failures. Convert the drivers to using
them.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/gpio_txx9.c   | 8 +++++---
 arch/mips/txx9/generic/setup.c | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kernel/gpio_txx9.c b/arch/mips/kernel/gpio_txx9.c
index 8c083612df9d..027fb57d0d79 100644
--- a/arch/mips/kernel/gpio_txx9.c
+++ b/arch/mips/kernel/gpio_txx9.c
@@ -32,14 +32,16 @@ static void txx9_gpio_set_raw(unsigned int offset, int value)
 	__raw_writel(val, &txx9_pioptr->dout);
 }
 
-static void txx9_gpio_set(struct gpio_chip *chip, unsigned int offset,
-			  int value)
+static int txx9_gpio_set(struct gpio_chip *chip, unsigned int offset,
+			 int value)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&txx9_gpio_lock, flags);
 	txx9_gpio_set_raw(offset, value);
 	mmiowb();
 	spin_unlock_irqrestore(&txx9_gpio_lock, flags);
+
+	return 0;
 }
 
 static int txx9_gpio_dir_in(struct gpio_chip *chip, unsigned int offset)
@@ -68,7 +70,7 @@ static int txx9_gpio_dir_out(struct gpio_chip *chip, unsigned int offset,
 
 static struct gpio_chip txx9_gpio_chip = {
 	.get = txx9_gpio_get,
-	.set = txx9_gpio_set,
+	.set_rv = txx9_gpio_set,
 	.direction_input = txx9_gpio_dir_in,
 	.direction_output = txx9_gpio_dir_out,
 	.label = "TXx9",
diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c
index 1e67fecd466e..0586ca7668b4 100644
--- a/arch/mips/txx9/generic/setup.c
+++ b/arch/mips/txx9/generic/setup.c
@@ -603,8 +603,8 @@ static int txx9_iocled_get(struct gpio_chip *chip, unsigned int offset)
 	return !!(data->cur_val & (1 << offset));
 }
 
-static void txx9_iocled_set(struct gpio_chip *chip, unsigned int offset,
-			    int value)
+static int txx9_iocled_set(struct gpio_chip *chip, unsigned int offset,
+			   int value)
 {
 	struct txx9_iocled_data *data = gpiochip_get_data(chip);
 	unsigned long flags;
@@ -616,6 +616,8 @@ static void txx9_iocled_set(struct gpio_chip *chip, unsigned int offset,
 	writeb(data->cur_val, data->mmioaddr);
 	mmiowb();
 	spin_unlock_irqrestore(&txx9_iocled_lock, flags);
+
+	return 0;
 }
 
 static int txx9_iocled_dir_in(struct gpio_chip *chip, unsigned int offset)
@@ -653,7 +655,7 @@ void __init txx9_iocled_init(unsigned long baseaddr,
 	if (!iocled->mmioaddr)
 		goto out_free;
 	iocled->chip.get = txx9_iocled_get;
-	iocled->chip.set = txx9_iocled_set;
+	iocled->chip.set_rv = txx9_iocled_set;
 	iocled->chip.direction_input = txx9_iocled_dir_in;
 	iocled->chip.direction_output = txx9_iocled_dir_out;
 	iocled->chip.label = "iocled";
-- 
cgit v1.2.3


From 3b61b6a369d9d81ffaa8a87045782352d08a91aa Mon Sep 17 00:00:00 2001
From: Chris Packham <chris.packham@alliedtelesis.co.nz>
Date: Wed, 16 Apr 2025 08:54:25 +1200
Subject: mips: dts: realtek: Add MDIO controller

Add a device tree node for the MDIO controller on the RTL9300 chips.

Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/boot/dts/realtek/rtl930x.dtsi | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'arch')

diff --git a/arch/mips/boot/dts/realtek/rtl930x.dtsi b/arch/mips/boot/dts/realtek/rtl930x.dtsi
index f2e57ea3a60c..101bab72a95f 100644
--- a/arch/mips/boot/dts/realtek/rtl930x.dtsi
+++ b/arch/mips/boot/dts/realtek/rtl930x.dtsi
@@ -69,6 +69,39 @@
 			#size-cells = <0>;
 			status = "disabled";
 		};
+
+		mdio_controller: mdio-controller@ca00 {
+			compatible = "realtek,rtl9301-mdio";
+			reg = <0xca00 0x200>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+			status = "disabled";
+
+			mdio0: mdio-bus@0 {
+				reg = <0>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+				status = "disabled";
+			};
+			mdio1: mdio-bus@1 {
+				reg = <1>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+				status = "disabled";
+			};
+			mdio2: mdio-bus@2 {
+				reg = <2>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+				status = "disabled";
+			};
+			mdio3: mdio-bus@3 {
+				reg = <3>;
+				#address-cells = <1>;
+				#size-cells = <0>;
+				status = "disabled";
+			};
+		};
 	};
 
 	soc: soc@18000000 {
-- 
cgit v1.2.3


From 6d223b8ffcd1593d032b71875def2daa71c53111 Mon Sep 17 00:00:00 2001
From: WangYuli <wangyuli@uniontech.com>
Date: Wed, 16 Apr 2025 11:45:48 +0800
Subject: MIPS: Loongson64: Add missing '#interrupt-cells' for loongson64c_ls7a
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similar to commit 98a9e2ac3755 ("MIPS: Loongson64: DTS: Fix msi node for ls7a").

Fix follow warnings:
  arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts:28.31-36.4: Warning (interrupt_provider): /bus@10000000/msi-controller@2ff00000: Missing '#interrupt-cells' in interrupt provider
  arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dtb: Warning (interrupt_map): Failed prerequisite 'interrupt_provider'

Fixes: 24af105962c8 ("MIPS: Loongson64: DeviceTree for LS7A PCH")
Tested-by: WangYuli <wangyuli@uniontech.com>
Signed-off-by: WangYuli <wangyuli@uniontech.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts b/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts
index c7ea4f1c0bb2..6c277ab83d4b 100644
--- a/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts
+++ b/arch/mips/boot/dts/loongson/loongson64c_4core_ls7a.dts
@@ -29,6 +29,7 @@
 		compatible = "loongson,pch-msi-1.0";
 		reg = <0 0x2ff00000 0 0x8>;
 		interrupt-controller;
+		#interrupt-cells = <1>;
 		msi-controller;
 		loongson,msi-base-vec = <64>;
 		loongson,msi-num-vecs = <64>;
-- 
cgit v1.2.3


From 0f4ae7c6ecb89bfda026d210dcf8216fb67d2333 Mon Sep 17 00:00:00 2001
From: Khem Raj <raj.khem@gmail.com>
Date: Sat, 29 Mar 2025 08:39:03 -0700
Subject: mips: Add -std= flag specified in KBUILD_CFLAGS to vdso CFLAGS

GCC 15 changed the default C standard dialect from gnu17 to gnu23,
which should not have impacted the kernel because it explicitly requests
the gnu11 standard in the main Makefile. However, mips/vdso code uses
its own CFLAGS without a '-std=' value, which break with this dialect
change because of the kernel's own definitions of bool, false, and true
conflicting with the C23 reserved keywords.

  include/linux/stddef.h:11:9: error: cannot use keyword 'false' as enumeration constant
     11 |         false   = 0,
        |         ^~~~~
  include/linux/stddef.h:11:9: note: 'false' is a keyword with '-std=c23' onwards
  include/linux/types.h:35:33: error: 'bool' cannot be defined via 'typedef'
     35 | typedef _Bool                   bool;
        |                                 ^~~~
  include/linux/types.h:35:33: note: 'bool' is a keyword with '-std=c23' onwards

Add -std as specified in KBUILD_CFLAGS to the decompressor and purgatory
CFLAGS to eliminate these errors and make the C standard version of these
areas match the rest of the kernel.

Signed-off-by: Khem Raj <raj.khem@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/vdso/Makefile | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile
index fb4c493aaffa..69d4593f64fe 100644
--- a/arch/mips/vdso/Makefile
+++ b/arch/mips/vdso/Makefile
@@ -27,6 +27,7 @@ endif
 # offsets.
 cflags-vdso := $(ccflags-vdso) \
 	$(filter -W%,$(filter-out -Wa$(comma)%,$(KBUILD_CFLAGS))) \
+	$(filter -std=%,$(KBUILD_CFLAGS)) \
 	-O3 -g -fPIC -fno-strict-aliasing -fno-common -fno-builtin -G 0 \
 	-mrelax-pic-calls $(call cc-option, -mexplicit-relocs) \
 	-fno-stack-protector -fno-jump-tables -DDISABLE_BRANCH_PROFILING \
-- 
cgit v1.2.3


From 76c43eb507bc1162850fdae6cc44790d1c9a83ea Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Sun, 13 Apr 2025 21:12:32 +0200
Subject: MIPS: SMP: Implement parallel CPU bring up for EyeQ

Added support for starting CPUs in parallel on EyeQ to speed up boot time.

On EyeQ5, booting 8 CPUs is now ~90ms faster.
On EyeQ6, booting 32 CPUs is now ~650ms faster.

Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/Kconfig                |  2 ++
 arch/mips/include/asm/topology.h |  3 +++
 arch/mips/kernel/smp-cps.c       |  2 ++
 arch/mips/kernel/smp.c           | 18 ++++++++++++++++++
 4 files changed, 25 insertions(+)

(limited to 'arch')

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index fc0772c1bad4..e0e6ce2592b4 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -617,6 +617,7 @@ config EYEQ
 	select USB_UHCI_BIG_ENDIAN_DESC if CPU_BIG_ENDIAN
 	select USB_UHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN
 	select USE_OF
+	select HOTPLUG_PARALLEL if SMP
 	help
 	  Select this to build a kernel supporting EyeQ SoC from Mobileye.
 
@@ -2287,6 +2288,7 @@ config MIPS_CPS
 	select MIPS_CM
 	select MIPS_CPS_PM if HOTPLUG_CPU
 	select SMP
+	select HOTPLUG_SMT if HOTPLUG_PARALLEL
 	select HOTPLUG_CORE_SYNC_DEAD if HOTPLUG_CPU
 	select SYNC_R4K if (CEVT_R4K || CSRC_R4K)
 	select SYS_SUPPORTS_HOTPLUG_CPU
diff --git a/arch/mips/include/asm/topology.h b/arch/mips/include/asm/topology.h
index 0673d2d0f2e6..5158c802eb65 100644
--- a/arch/mips/include/asm/topology.h
+++ b/arch/mips/include/asm/topology.h
@@ -16,6 +16,9 @@
 #define topology_core_id(cpu)			(cpu_core(&cpu_data[cpu]))
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
 #define topology_sibling_cpumask(cpu)		(&cpu_sibling_map[cpu])
+
+extern struct cpumask __cpu_primary_thread_mask;
+#define cpu_primary_thread_mask ((const struct cpumask *)&__cpu_primary_thread_mask)
 #endif
 
 #endif /* __ASM_TOPOLOGY_H */
diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c
index e85bd087467e..02bbd7ecd1b9 100644
--- a/arch/mips/kernel/smp-cps.c
+++ b/arch/mips/kernel/smp-cps.c
@@ -236,6 +236,7 @@ static void __init cps_smp_setup(void)
 			/* Use the number of VPEs in cluster 0 core 0 for smp_num_siblings */
 			if (!cl && !c)
 				smp_num_siblings = core_vpes;
+			cpumask_set_cpu(nvpes, &__cpu_primary_thread_mask);
 
 			for (v = 0; v < min_t(int, core_vpes, NR_CPUS - nvpes); v++) {
 				cpu_set_cluster(&cpu_data[nvpes + v], cl);
@@ -364,6 +365,7 @@ static void __init cps_prepare_cpus(unsigned int max_cpus)
 	cl = cpu_cluster(&current_cpu_data);
 	c = cpu_core(&current_cpu_data);
 	cluster_bootcfg = &mips_cps_cluster_bootcfg[cl];
+	cpu_smt_set_num_threads(core_vpes, core_vpes);
 	core_bootcfg = &cluster_bootcfg->core_config[c];
 	bitmap_set(cluster_bootcfg->core_power, cpu_core(&current_cpu_data), 1);
 	atomic_set(&core_bootcfg->vpe_mask, 1 << cpu_vpe_id(&current_cpu_data));
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 39e193cad2b9..1726744f2b2e 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -56,8 +56,10 @@ EXPORT_SYMBOL(cpu_sibling_map);
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
+#ifndef CONFIG_HOTPLUG_PARALLEL
 static DECLARE_COMPLETION(cpu_starting);
 static DECLARE_COMPLETION(cpu_running);
+#endif
 
 /*
  * A logical cpu mask containing only one VPE per core to
@@ -74,6 +76,8 @@ static cpumask_t cpu_core_setup_map;
 
 cpumask_t cpu_coherent_mask;
 
+struct cpumask __cpu_primary_thread_mask __read_mostly;
+
 unsigned int smp_max_threads __initdata = UINT_MAX;
 
 static int __init early_nosmt(char *s)
@@ -374,10 +378,15 @@ asmlinkage void start_secondary(void)
 	set_cpu_core_map(cpu);
 
 	cpumask_set_cpu(cpu, &cpu_coherent_mask);
+#ifdef CONFIG_HOTPLUG_PARALLEL
+	cpuhp_ap_sync_alive();
+#endif
 	notify_cpu_starting(cpu);
 
+#ifndef CONFIG_HOTPLUG_PARALLEL
 	/* Notify boot CPU that we're starting & ready to sync counters */
 	complete(&cpu_starting);
+#endif
 
 	synchronise_count_slave(cpu);
 
@@ -386,11 +395,13 @@ asmlinkage void start_secondary(void)
 
 	calculate_cpu_foreign_map();
 
+#ifndef CONFIG_HOTPLUG_PARALLEL
 	/*
 	 * Notify boot CPU that we're up & online and it can safely return
 	 * from __cpu_up
 	 */
 	complete(&cpu_running);
+#endif
 
 	/*
 	 * irq will be enabled in ->smp_finish(), enabling it too early
@@ -447,6 +458,12 @@ void __init smp_prepare_boot_cpu(void)
 	set_cpu_online(0, true);
 }
 
+#ifdef CONFIG_HOTPLUG_PARALLEL
+int arch_cpuhp_kick_ap_alive(unsigned int cpu, struct task_struct *tidle)
+{
+	return mp_ops->boot_secondary(cpu, tidle);
+}
+#else
 int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int err;
@@ -466,6 +483,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
 	wait_for_completion(&cpu_running);
 	return 0;
 }
+#endif
 
 #ifdef CONFIG_PROFILING
 /* Not really SMP stuff ... */
-- 
cgit v1.2.3


From cd956a5cb48aaccfa63291b4efd289d2f1e5b025 Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Sat, 19 Apr 2025 12:27:44 +0200
Subject: mips: ptrace: Improve code formatting and indentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use tabs instead of spaces in regs_query_register_offset() and
syscall_trace_leave(), and properly indent multiple getters.

Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/ptrace.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index f7107479c7fa..b890d64d352c 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -922,11 +922,13 @@ static const struct pt_regs_offset regoffset_table[] = {
  */
 int regs_query_register_offset(const char *name)
 {
-        const struct pt_regs_offset *roff;
-        for (roff = regoffset_table; roff->name != NULL; roff++)
-                if (!strcmp(roff->name, name))
-                        return roff->offset;
-        return -EINVAL;
+	const struct pt_regs_offset *roff;
+
+	for (roff = regoffset_table; roff->name != NULL; roff++)
+		if (!strcmp(roff->name, name))
+			return roff->offset;
+
+	return -EINVAL;
 }
 
 #if defined(CONFIG_32BIT) || defined(CONFIG_MIPS32_O32)
@@ -937,7 +939,7 @@ static const struct user_regset mips_regsets[] = {
 		.n		= ELF_NGREG,
 		.size		= sizeof(unsigned int),
 		.align		= sizeof(unsigned int),
-		.regset_get		= gpr32_get,
+		.regset_get	= gpr32_get,
 		.set		= gpr32_set,
 	},
 	[REGSET_DSP] = {
@@ -945,7 +947,7 @@ static const struct user_regset mips_regsets[] = {
 		.n		= NUM_DSP_REGS + 1,
 		.size		= sizeof(u32),
 		.align		= sizeof(u32),
-		.regset_get		= dsp32_get,
+		.regset_get	= dsp32_get,
 		.set		= dsp32_set,
 		.active		= dsp_active,
 	},
@@ -955,7 +957,7 @@ static const struct user_regset mips_regsets[] = {
 		.n		= ELF_NFPREG,
 		.size		= sizeof(elf_fpreg_t),
 		.align		= sizeof(elf_fpreg_t),
-		.regset_get		= fpr_get,
+		.regset_get	= fpr_get,
 		.set		= fpr_set,
 	},
 	[REGSET_FP_MODE] = {
@@ -963,7 +965,7 @@ static const struct user_regset mips_regsets[] = {
 		.n		= 1,
 		.size		= sizeof(int),
 		.align		= sizeof(int),
-		.regset_get		= fp_mode_get,
+		.regset_get	= fp_mode_get,
 		.set		= fp_mode_set,
 	},
 #endif
@@ -973,7 +975,7 @@ static const struct user_regset mips_regsets[] = {
 		.n		= NUM_FPU_REGS + 1,
 		.size		= 16,
 		.align		= 16,
-		.regset_get		= msa_get,
+		.regset_get	= msa_get,
 		.set		= msa_set,
 	},
 #endif
@@ -997,7 +999,7 @@ static const struct user_regset mips64_regsets[] = {
 		.n		= ELF_NGREG,
 		.size		= sizeof(unsigned long),
 		.align		= sizeof(unsigned long),
-		.regset_get		= gpr64_get,
+		.regset_get	= gpr64_get,
 		.set		= gpr64_set,
 	},
 	[REGSET_DSP] = {
@@ -1005,7 +1007,7 @@ static const struct user_regset mips64_regsets[] = {
 		.n		= NUM_DSP_REGS + 1,
 		.size		= sizeof(u64),
 		.align		= sizeof(u64),
-		.regset_get		= dsp64_get,
+		.regset_get	= dsp64_get,
 		.set		= dsp64_set,
 		.active		= dsp_active,
 	},
@@ -1015,7 +1017,7 @@ static const struct user_regset mips64_regsets[] = {
 		.n		= 1,
 		.size		= sizeof(int),
 		.align		= sizeof(int),
-		.regset_get		= fp_mode_get,
+		.regset_get	= fp_mode_get,
 		.set		= fp_mode_set,
 	},
 	[REGSET_FPR] = {
@@ -1023,7 +1025,7 @@ static const struct user_regset mips64_regsets[] = {
 		.n		= ELF_NFPREG,
 		.size		= sizeof(elf_fpreg_t),
 		.align		= sizeof(elf_fpreg_t),
-		.regset_get		= fpr_get,
+		.regset_get	= fpr_get,
 		.set		= fpr_set,
 	},
 #endif
@@ -1033,7 +1035,7 @@ static const struct user_regset mips64_regsets[] = {
 		.n		= NUM_FPU_REGS + 1,
 		.size		= 16,
 		.align		= 16,
-		.regset_get		= msa_get,
+		.regset_get	= msa_get,
 		.set		= msa_set,
 	},
 #endif
@@ -1351,7 +1353,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs)
  */
 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
 {
-        /*
+	/*
 	 * We may come here right after calling schedule_user()
 	 * or do_notify_resume(), in which case we can be in RCU
 	 * user mode.
-- 
cgit v1.2.3


From 9f6d908adabc11e5b407743696dbb333894b022e Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Tue, 22 Apr 2025 09:42:55 +0200
Subject: MIPS: BCM63XX: Replace strcpy() with strscpy() in board_prom_init()

strcpy() is deprecated; use strscpy() instead.

Link: https://github.com/KSPP/linux/issues/88
Cc: linux-hardening@vger.kernel.org
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/bcm63xx/boards/board_bcm963xx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/mips/bcm63xx/boards/board_bcm963xx.c b/arch/mips/bcm63xx/boards/board_bcm963xx.c
index 9cc8fbf218a5..c5617b889b1c 100644
--- a/arch/mips/bcm63xx/boards/board_bcm963xx.c
+++ b/arch/mips/bcm63xx/boards/board_bcm963xx.c
@@ -764,7 +764,7 @@ void __init board_prom_init(void)
 			snprintf(cfe_version, 12, "%s", (char *) &cfe[4]);
 		}
 	} else {
-		strcpy(cfe_version, "unknown");
+		strscpy(cfe_version, "unknown");
 	}
 	pr_info("CFE version: %s\n", cfe_version);
 
-- 
cgit v1.2.3


From 3b3704261e851e25983860e4c352f1f73786f4ab Mon Sep 17 00:00:00 2001
From: Thorsten Blum <thorsten.blum@linux.dev>
Date: Fri, 25 Apr 2025 09:46:48 +0200
Subject: MIPS: Replace strcpy() with strscpy() in vpe_elfload()

strcpy() is deprecated; use strscpy() instead.

Link: https://github.com/KSPP/linux/issues/88
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/vpe.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/mips/kernel/vpe.c b/arch/mips/kernel/vpe.c
index 737d0d4fdcd3..2b67c44adab9 100644
--- a/arch/mips/kernel/vpe.c
+++ b/arch/mips/kernel/vpe.c
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/elf.h>
 #include <linux/seq_file.h>
+#include <linux/string.h>
 #include <linux/syscalls.h>
 #include <linux/moduleloader.h>
 #include <linux/interrupt.h>
@@ -582,7 +583,7 @@ static int vpe_elfload(struct vpe *v)
 	struct module mod; /* so we can re-use the relocations code */
 
 	memset(&mod, 0, sizeof(struct module));
-	strcpy(mod.name, "VPE loader");
+	strscpy(mod.name, "VPE loader");
 
 	hdr = (Elf_Ehdr *) v->pbuffer;
 	len = v->plen;
-- 
cgit v1.2.3


From 9456e2c60171ecffc0ea26347418e6bedb64ee78 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Wed, 26 Mar 2025 15:01:12 +0800
Subject: um: xterm: Add Wayland support

Under Wayland, we should check WAYLAND_DISPLAY instead.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250326070113.401857-2-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/xterm.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c
index e4316c7981e8..f607af738eac 100644
--- a/arch/um/drivers/xterm.c
+++ b/arch/um/drivers/xterm.c
@@ -97,12 +97,9 @@ static int xterm_open(int input, int output, int primary, void *d,
 	if (access(argv[4], X_OK) < 0)
 		argv[4] = "port-helper";
 
-	/*
-	 * Check that DISPLAY is set, this doesn't guarantee the xterm
-	 * will work but w/o it we can be pretty sure it won't.
-	 */
-	if (getenv("DISPLAY") == NULL) {
-		printk(UM_KERN_ERR "xterm_open: $DISPLAY not set.\n");
+	/* Ensure we are running on Xorg or Wayland. */
+	if (!getenv("DISPLAY") && !getenv("WAYLAND_DISPLAY")) {
+		printk(UM_KERN_ERR "xterm_open : neither $DISPLAY nor $WAYLAND_DISPLAY is set.\n");
 		return -ENODEV;
 	}
 
-- 
cgit v1.2.3


From 22361369c2e0dc01ec7c3b42b582a37ca05a0973 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Wed, 26 Mar 2025 15:01:13 +0800
Subject: um: xterm: Update options for gnome-terminal

The -x option is deprecated and might be removed in a future release
of gnome-terminal. Let's recommend using -- instead.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250326070113.401857-3-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/xterm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/um/drivers/xterm.c b/arch/um/drivers/xterm.c
index f607af738eac..d05918e422f9 100644
--- a/arch/um/drivers/xterm.c
+++ b/arch/um/drivers/xterm.c
@@ -81,7 +81,7 @@ __uml_setup("xterm=", xterm_setup,
 "    '<switch> command arg1 arg2 ...'.\n"
 "    The default values are 'xterm=" CONFIG_XTERM_CHAN_DEFAULT_EMULATOR
      ",-T,-e'.\n"
-"    Values for gnome-terminal are 'xterm=gnome-terminal,-t,-x'.\n\n"
+"    Values for gnome-terminal are 'xterm=gnome-terminal,-t,--'.\n\n"
 );
 
 static int xterm_open(int input, int output, int primary, void *d,
-- 
cgit v1.2.3


From 674d03f6bd6b0f8327f1a4920ff5893557facfbd Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Wed, 26 Mar 2025 19:05:00 +0000
Subject: um: Add cmpxchg8b_emu and checksum functions to asm-prototypes.h

With CONFIG_GENDWARFKSYMS, um builds fail due to missing prototypes
in asm/asm-prototypes.h. Add declarations for cmpxchg8b_emu and the
exported checksum functions, including csum_partial_copy_generic as
it's also exported.

Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: linux-kbuild@vger.kernel.org
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202503251216.lE4t9Ikj-lkp@intel.com/
Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Link: https://patch.msgid.link/20250326190500.847236-2-samitolvanen@google.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/include/asm/asm-prototypes.h | 5 +++++
 arch/x86/um/asm/checksum.h           | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'arch')

diff --git a/arch/um/include/asm/asm-prototypes.h b/arch/um/include/asm/asm-prototypes.h
index 5898a26daa0d..408b31d59127 100644
--- a/arch/um/include/asm/asm-prototypes.h
+++ b/arch/um/include/asm/asm-prototypes.h
@@ -1 +1,6 @@
 #include <asm-generic/asm-prototypes.h>
+#include <asm/checksum.h>
+
+#ifdef CONFIG_UML_X86
+extern void cmpxchg8b_emu(void);
+#endif
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index b07824500363..ddc144657efa 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -20,6 +20,9 @@
  */
 extern __wsum csum_partial(const void *buff, int len, __wsum sum);
 
+/* Do not call this directly. Declared for export type visibility. */
+extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
+
 /**
  * csum_fold - Fold and invert a 32bit checksum.
  * sum: 32bit unfolded sum
-- 
cgit v1.2.3


From 82c8e1280c74f80fbacb9efbba6dd7b23446e536 Mon Sep 17 00:00:00 2001
From: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Date: Mon, 31 Mar 2025 16:31:50 +0800
Subject: um: Remove duplicate arch.h header

./arch/um/kernel/trap.c: arch.h is included more than once.

Reported-by: Abaci Robot <abaci@linux.alibaba.com>
Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=19877
Signed-off-by: Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
Link: https://patch.msgid.link/20250331083150.72598-1-jiapeng.chong@linux.alibaba.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/kernel/trap.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch')

diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index ce073150dc20..39820bde49e6 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -16,7 +16,6 @@
 #include <kern_util.h>
 #include <os.h>
 #include <skas.h>
-#include <arch.h>
 
 /*
  * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
-- 
cgit v1.2.3


From 49caacf1004d1e1fc40cfab165f104d051867c6e Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Tue, 8 Apr 2025 09:45:23 +0200
Subject: um: do not send SIGALRM to userspace in time-travel mode

We send a SIGALRM to userspace processes to interrupt them. Really,
doing so is only needed if they are actually executing at the time (to
ensure we return to kernelspace). Unfortunately, we do not have that
information readily available. We can however be sure that this is never
the case when we are in time-travel mode with infinite CPU.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250408074524.300153-1-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/kernel/time.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 1394568c0210..ae0fa2173778 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -856,11 +856,16 @@ static struct clock_event_device timer_clockevent = {
 
 static irqreturn_t um_timer(int irq, void *dev)
 {
-	if (get_current()->mm != NULL)
-	{
-        /* userspace - relay signal, results in correct userspace timers */
+	/*
+	 * Interrupt the (possibly) running userspace process, technically this
+	 * should only happen if userspace is currently executing.
+	 * With infinite CPU time-travel, we can only get here when userspace
+	 * is not executing. Do not notify there and avoid spurious scheduling.
+	 */
+	if (time_travel_mode != TT_MODE_INFCPU &&
+	    time_travel_mode != TT_MODE_EXTERNAL &&
+	    get_current()->mm)
 		os_alarm_process(get_current()->mm->context.id.pid);
-	}
 
 	(*timer_clockevent.event_handler)(&timer_clockevent);
 
-- 
cgit v1.2.3


From 6767e8784cd2e8b386a62330ea6864949d983a3e Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Tue, 8 Apr 2025 09:45:24 +0200
Subject: um: use proper care when taking mmap lock during segfault

Segfaults can occur at times where the mmap lock cannot be taken. If
that happens the segfault handler may not be able to take the mmap lock.

Fix the code to use the same approach as most other architectures.
Unfortunately, this requires copying code from mm/memory.c and modifying
it slightly as UML does not have exception tables.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250408074524.300153-2-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/kernel/trap.c | 129 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 117 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 39820bde49e6..8a18f4054793 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -17,6 +17,122 @@
 #include <os.h>
 #include <skas.h>
 
+/*
+ * NOTE: UML does not have exception tables. As such, this is almost a copy
+ * of the code in mm/memory.c, only adjusting the logic to simply check whether
+ * we are coming from the kernel instead of doing an additional lookup in the
+ * exception table.
+ * We can do this simplification because we never get here if the exception was
+ * fixable.
+ */
+static inline bool get_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
+{
+	if (likely(mmap_read_trylock(mm)))
+		return true;
+
+	if (!is_user)
+		return false;
+
+	return !mmap_read_lock_killable(mm);
+}
+
+static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
+{
+	/*
+	 * We don't have this operation yet.
+	 *
+	 * It should be easy enough to do: it's basically a
+	 *    atomic_long_try_cmpxchg_acquire()
+	 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
+	 * it also needs the proper lockdep magic etc.
+	 */
+	return false;
+}
+
+static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, bool is_user)
+{
+	mmap_read_unlock(mm);
+	if (!is_user)
+		return false;
+
+	return !mmap_write_lock_killable(mm);
+}
+
+/*
+ * Helper for page fault handling.
+ *
+ * This is kind of equivalend to "mmap_read_lock()" followed
+ * by "find_extend_vma()", except it's a lot more careful about
+ * the locking (and will drop the lock on failure).
+ *
+ * For example, if we have a kernel bug that causes a page
+ * fault, we don't want to just use mmap_read_lock() to get
+ * the mm lock, because that would deadlock if the bug were
+ * to happen while we're holding the mm lock for writing.
+ *
+ * So this checks the exception tables on kernel faults in
+ * order to only do this all for instructions that are actually
+ * expected to fault.
+ *
+ * We can also actually take the mm lock for writing if we
+ * need to extend the vma, which helps the VM layer a lot.
+ */
+static struct vm_area_struct *
+um_lock_mm_and_find_vma(struct mm_struct *mm,
+			unsigned long addr, bool is_user)
+{
+	struct vm_area_struct *vma;
+
+	if (!get_mmap_lock_carefully(mm, is_user))
+		return NULL;
+
+	vma = find_vma(mm, addr);
+	if (likely(vma && (vma->vm_start <= addr)))
+		return vma;
+
+	/*
+	 * Well, dang. We might still be successful, but only
+	 * if we can extend a vma to do so.
+	 */
+	if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+
+	/*
+	 * We can try to upgrade the mmap lock atomically,
+	 * in which case we can continue to use the vma
+	 * we already looked up.
+	 *
+	 * Otherwise we'll have to drop the mmap lock and
+	 * re-take it, and also look up the vma again,
+	 * re-checking it.
+	 */
+	if (!mmap_upgrade_trylock(mm)) {
+		if (!upgrade_mmap_lock_carefully(mm, is_user))
+			return NULL;
+
+		vma = find_vma(mm, addr);
+		if (!vma)
+			goto fail;
+		if (vma->vm_start <= addr)
+			goto success;
+		if (!(vma->vm_flags & VM_GROWSDOWN))
+			goto fail;
+	}
+
+	if (expand_stack_locked(vma, addr))
+		goto fail;
+
+success:
+	mmap_write_downgrade(mm);
+	return vma;
+
+fail:
+	mmap_write_unlock(mm);
+	return NULL;
+}
+
 /*
  * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
  * segv().
@@ -43,21 +159,10 @@ int handle_page_fault(unsigned long address, unsigned long ip,
 	if (is_user)
 		flags |= FAULT_FLAG_USER;
 retry:
-	mmap_read_lock(mm);
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto out;
-	if (vma->vm_start <= address)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto out;
-	if (is_user && !ARCH_IS_STACKGROW(address))
-		goto out;
-	vma = expand_stack(mm, address);
+	vma = um_lock_mm_and_find_vma(mm, address, is_user);
 	if (!vma)
 		goto out_nosemaphore;
 
-good_area:
 	*code_out = SEGV_ACCERR;
 	if (is_write) {
 		if (!(vma->vm_flags & VM_WRITE))
-- 
cgit v1.2.3


From a0e2cb6a90634f3dc80f16e882a683ee5761b0b0 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Sun, 13 Apr 2025 23:44:21 +0800
Subject: um: Add VFIO-based virtual PCI driver

Implement a new virtual PCI driver based on the VFIO framework.
This driver allows users to pass through PCI devices to UML via
VFIO. Currently, only MSI-X capable devices are supported, and
it is assumed that drivers will use MSI-X.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250413154421.517878-1-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/Kconfig     |   8 +
 arch/um/drivers/Makefile    |   2 +
 arch/um/drivers/vfio_kern.c | 642 ++++++++++++++++++++++++++++++++++++++++++++
 arch/um/drivers/vfio_user.c | 327 ++++++++++++++++++++++
 arch/um/drivers/vfio_user.h |  44 +++
 5 files changed, 1023 insertions(+)
 create mode 100644 arch/um/drivers/vfio_kern.c
 create mode 100644 arch/um/drivers/vfio_user.c
 create mode 100644 arch/um/drivers/vfio_user.h

(limited to 'arch')

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 9cb196070614..d7bb447ff958 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -367,3 +367,11 @@ config UML_PCI_OVER_VIRTIO_DEVICE_ID
 	  There's no official device ID assigned (yet), set the one you
 	  wish to use for experimentation here. The default of -1 is
 	  not valid and will cause the driver to fail at probe.
+
+config UML_PCI_OVER_VFIO
+	bool "Enable VFIO-based PCI passthrough"
+	select UML_PCI
+	help
+	  This driver provides support for VFIO-based PCI passthrough.
+	  Currently, only MSI-X capable devices are supported, and it
+	  is assumed that drivers will use MSI-X.
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 0a5820343ad3..336be56b8975 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -19,6 +19,7 @@ port-objs := port_kern.o port_user.o
 harddog-objs := harddog_kern.o
 harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o
 rtc-objs := rtc_kern.o rtc_user.o
+vfio_uml-objs := vfio_kern.o vfio_user.o
 
 LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a)
 
@@ -62,6 +63,7 @@ obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o
 obj-$(CONFIG_UML_RTC) += rtc.o
 obj-$(CONFIG_UML_PCI) += virt-pci.o
 obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o
+obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o
 
 # pcap_user.o must be added explicitly.
 USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o
diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c
new file mode 100644
index 000000000000..b51fc9888ae1
--- /dev/null
+++ b/arch/um/drivers/vfio_kern.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei.btw@antgroup.com>
+ */
+
+#define pr_fmt(fmt) "vfio-uml: " fmt
+
+#include <linux/module.h>
+#include <linux/logic_iomem.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <irq_kern.h>
+#include <init.h>
+#include <os.h>
+
+#include "virt-pci.h"
+#include "vfio_user.h"
+
+#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)
+
+struct uml_vfio_intr_ctx {
+	struct uml_vfio_device *dev;
+	int irq;
+};
+
+struct uml_vfio_device {
+	const char *name;
+	int group;
+
+	struct um_pci_device pdev;
+	struct uml_vfio_user_device udev;
+	struct uml_vfio_intr_ctx *intr_ctx;
+
+	int msix_cap;
+	int msix_bar;
+	int msix_offset;
+	int msix_size;
+	u32 *msix_data;
+
+	struct list_head list;
+};
+
+struct uml_vfio_group {
+	int id;
+	int fd;
+	int users;
+	struct list_head list;
+};
+
+static struct {
+	int fd;
+	int users;
+} uml_vfio_container = { .fd = -1 };
+static DEFINE_MUTEX(uml_vfio_container_mtx);
+
+static LIST_HEAD(uml_vfio_groups);
+static DEFINE_MUTEX(uml_vfio_groups_mtx);
+
+static LIST_HEAD(uml_vfio_devices);
+
+static int uml_vfio_set_container(int group_fd)
+{
+	int err;
+
+	guard(mutex)(&uml_vfio_container_mtx);
+
+	err = uml_vfio_user_set_container(uml_vfio_container.fd, group_fd);
+	if (err)
+		return err;
+
+	uml_vfio_container.users++;
+	if (uml_vfio_container.users > 1)
+		return 0;
+
+	err = uml_vfio_user_setup_iommu(uml_vfio_container.fd);
+	if (err) {
+		uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+		uml_vfio_container.users--;
+	}
+	return err;
+}
+
+static void uml_vfio_unset_container(int group_fd)
+{
+	guard(mutex)(&uml_vfio_container_mtx);
+
+	uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+	uml_vfio_container.users--;
+}
+
+static int uml_vfio_open_group(int group_id)
+{
+	struct uml_vfio_group *group;
+	int err;
+
+	guard(mutex)(&uml_vfio_groups_mtx);
+
+	list_for_each_entry(group, &uml_vfio_groups, list) {
+		if (group->id == group_id) {
+			group->users++;
+			return group->fd;
+		}
+	}
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	group->fd = uml_vfio_user_open_group(group_id);
+	if (group->fd < 0) {
+		err = group->fd;
+		goto free_group;
+	}
+
+	err = uml_vfio_set_container(group->fd);
+	if (err)
+		goto close_group;
+
+	group->id = group_id;
+	group->users = 1;
+
+	list_add(&group->list, &uml_vfio_groups);
+
+	return group->fd;
+
+close_group:
+	os_close_file(group->fd);
+free_group:
+	kfree(group);
+	return err;
+}
+
+static int uml_vfio_release_group(int group_fd)
+{
+	struct uml_vfio_group *group;
+
+	guard(mutex)(&uml_vfio_groups_mtx);
+
+	list_for_each_entry(group, &uml_vfio_groups, list) {
+		if (group->fd == group_fd) {
+			group->users--;
+			if (group->users == 0) {
+				uml_vfio_unset_container(group_fd);
+				os_close_file(group_fd);
+				list_del(&group->list);
+				kfree(group);
+			}
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
+{
+	struct uml_vfio_intr_ctx *ctx = opaque;
+	struct uml_vfio_device *dev = ctx->dev;
+	int index = ctx - dev->intr_ctx;
+	int irqfd = dev->udev.irqfd[index];
+	int irq = dev->msix_data[index];
+	uint64_t v;
+	int r;
+
+	do {
+		r = os_read_file(irqfd, &v, sizeof(v));
+		if (r == sizeof(v))
+			generic_handle_irq(irq);
+	} while (r == sizeof(v) || r == -EINTR);
+	WARN(r != -EAGAIN, "read returned %d\n", r);
+
+	return IRQ_HANDLED;
+}
+
+static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
+{
+	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
+	int err, irqfd;
+
+	if (ctx->irq >= 0)
+		return 0;
+
+	irqfd = uml_vfio_user_activate_irq(&dev->udev, index);
+	if (irqfd < 0)
+		return irqfd;
+
+	ctx->irq = um_request_irq(UM_IRQ_ALLOC, irqfd, IRQ_READ,
+				  uml_vfio_interrupt, 0,
+				  "vfio-uml", ctx);
+	if (ctx->irq < 0) {
+		err = ctx->irq;
+		goto deactivate;
+	}
+
+	err = add_sigio_fd(irqfd);
+	if (err)
+		goto free_irq;
+
+	return 0;
+
+free_irq:
+	um_free_irq(ctx->irq, ctx);
+	ctx->irq = -1;
+deactivate:
+	uml_vfio_user_deactivate_irq(&dev->udev, index);
+	return err;
+}
+
+static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
+{
+	struct uml_vfio_intr_ctx *ctx = &dev->intr_ctx[index];
+
+	if (ctx->irq >= 0) {
+		ignore_sigio_fd(dev->udev.irqfd[index]);
+		um_free_irq(ctx->irq, ctx);
+		uml_vfio_user_deactivate_irq(&dev->udev, index);
+		ctx->irq = -1;
+	}
+	return 0;
+}
+
+static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
+				    unsigned int offset, int size,
+				    unsigned long val)
+{
+	/*
+	 * Here, we handle only the operations we care about,
+	 * ignoring the rest.
+	 */
+	if (size == 2 && offset == dev->msix_cap + PCI_MSIX_FLAGS) {
+		switch (val & ~PCI_MSIX_FLAGS_QSIZE) {
+		case PCI_MSIX_FLAGS_ENABLE:
+		case 0:
+			return uml_vfio_user_update_irqs(&dev->udev);
+		}
+	}
+	return 0;
+}
+
+static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
+				      unsigned int offset, int size,
+				      unsigned long val)
+{
+	int index;
+
+	/*
+	 * Here, we handle only the operations we care about,
+	 * ignoring the rest.
+	 */
+	offset -= dev->msix_offset + PCI_MSIX_ENTRY_DATA;
+
+	if (size != 4 || offset % PCI_MSIX_ENTRY_SIZE != 0)
+		return 0;
+
+	index = offset / PCI_MSIX_ENTRY_SIZE;
+	if (index >= dev->udev.irq_count)
+		return -EINVAL;
+
+	dev->msix_data[index] = val;
+
+	return val ? uml_vfio_activate_irq(dev, index) :
+		uml_vfio_deactivate_irq(dev, index);
+}
+
+static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
+					      unsigned int offset, int size)
+{
+	u8 data[8];
+
+	memset(data, 0xff, sizeof(data));
+
+	if (uml_vfio_user_cfgspace_read(&dev->udev, offset, data, size))
+		return ULONG_MAX;
+
+	switch (size) {
+	case 1:
+		return data[0];
+	case 2:
+		return le16_to_cpup((void *)data);
+	case 4:
+		return le32_to_cpup((void *)data);
+#ifdef CONFIG_64BIT
+	case 8:
+		return le64_to_cpup((void *)data);
+#endif
+	default:
+		return ULONG_MAX;
+	}
+}
+
+static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
+					    unsigned int offset, int size)
+{
+	struct uml_vfio_device *dev = to_vdev(pdev);
+
+	return __uml_vfio_cfgspace_read(dev, offset, size);
+}
+
+static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
+				      unsigned int offset, int size,
+				      unsigned long val)
+{
+	u8 data[8];
+
+	switch (size) {
+	case 1:
+		data[0] = (u8)val;
+		break;
+	case 2:
+		put_unaligned_le16(val, (void *)data);
+		break;
+	case 4:
+		put_unaligned_le32(val, (void *)data);
+		break;
+#ifdef CONFIG_64BIT
+	case 8:
+		put_unaligned_le64(val, (void *)data);
+		break;
+#endif
+	}
+
+	WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
+}
+
+static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
+				    unsigned int offset, int size,
+				    unsigned long val)
+{
+	struct uml_vfio_device *dev = to_vdev(pdev);
+
+	if (offset < dev->msix_cap + PCI_CAP_MSIX_SIZEOF &&
+	    offset + size > dev->msix_cap)
+		WARN_ON(uml_vfio_update_msix_cap(dev, offset, size, val));
+
+	__uml_vfio_cfgspace_write(dev, offset, size, val);
+}
+
+static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
+				   void *buffer, unsigned int offset, int size)
+{
+	struct uml_vfio_device *dev = to_vdev(pdev);
+
+	memset(buffer, 0xff, size);
+	uml_vfio_user_bar_read(&dev->udev, bar, offset, buffer, size);
+}
+
+static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
+				       unsigned int offset, int size)
+{
+	u8 data[8];
+
+	uml_vfio_bar_copy_from(pdev, bar, data, offset, size);
+
+	switch (size) {
+	case 1:
+		return data[0];
+	case 2:
+		return le16_to_cpup((void *)data);
+	case 4:
+		return le32_to_cpup((void *)data);
+#ifdef CONFIG_64BIT
+	case 8:
+		return le64_to_cpup((void *)data);
+#endif
+	default:
+		return ULONG_MAX;
+	}
+}
+
+static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
+				 unsigned int offset, const void *buffer,
+				 int size)
+{
+	struct uml_vfio_device *dev = to_vdev(pdev);
+
+	uml_vfio_user_bar_write(&dev->udev, bar, offset, buffer, size);
+}
+
+static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
+			       unsigned int offset, int size,
+			       unsigned long val)
+{
+	struct uml_vfio_device *dev = to_vdev(pdev);
+	u8 data[8];
+
+	if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
+	    offset < dev->msix_offset + dev->msix_size)
+		WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));
+
+	switch (size) {
+	case 1:
+		data[0] = (u8)val;
+		break;
+	case 2:
+		put_unaligned_le16(val, (void *)data);
+		break;
+	case 4:
+		put_unaligned_le32(val, (void *)data);
+		break;
+#ifdef CONFIG_64BIT
+	case 8:
+		put_unaligned_le64(val, (void *)data);
+		break;
+#endif
+	}
+
+	uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
+}
+
+static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
+			     unsigned int offset, u8 value, int size)
+{
+	struct uml_vfio_device *dev = to_vdev(pdev);
+	int i;
+
+	for (i = 0; i < size; i++)
+		uml_vfio_user_bar_write(&dev->udev, bar, offset + i, &value, 1);
+}
+
+static const struct um_pci_ops uml_vfio_um_pci_ops = {
+	.cfgspace_read	= uml_vfio_cfgspace_read,
+	.cfgspace_write	= uml_vfio_cfgspace_write,
+	.bar_read	= uml_vfio_bar_read,
+	.bar_write	= uml_vfio_bar_write,
+	.bar_copy_from	= uml_vfio_bar_copy_from,
+	.bar_copy_to	= uml_vfio_bar_copy_to,
+	.bar_set	= uml_vfio_bar_set,
+};
+
+static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
+{
+	u8 id, pos;
+	u16 ent;
+	int ttl = 48; /* PCI_FIND_CAP_TTL */
+
+	pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));
+
+	while (pos && ttl--) {
+		ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));
+
+		id = ent & 0xff;
+		if (id == 0xff)
+			break;
+		if (id == cap)
+			return pos;
+
+		pos = ent >> 8;
+	}
+
+	return 0;
+}
+
+static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
+{
+	unsigned int off;
+	u16 flags;
+	u32 tbl;
+
+	off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
+	if (!off)
+		return -ENOTSUPP;
+
+	dev->msix_cap = off;
+
+	tbl = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_TABLE, sizeof(tbl));
+	flags = __uml_vfio_cfgspace_read(dev, off + PCI_MSIX_FLAGS, sizeof(flags));
+
+	dev->msix_bar = tbl & PCI_MSIX_TABLE_BIR;
+	dev->msix_offset = tbl & PCI_MSIX_TABLE_OFFSET;
+	dev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * PCI_MSIX_ENTRY_SIZE;
+
+	dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
+	if (!dev->msix_data)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void uml_vfio_open_device(struct uml_vfio_device *dev)
+{
+	struct uml_vfio_intr_ctx *ctx;
+	int err, group_id, i;
+
+	group_id = uml_vfio_user_get_group_id(dev->name);
+	if (group_id < 0) {
+		pr_err("Failed to get group id (%s), error %d\n",
+		       dev->name, group_id);
+		goto free_dev;
+	}
+
+	dev->group = uml_vfio_open_group(group_id);
+	if (dev->group < 0) {
+		pr_err("Failed to open group %d (%s), error %d\n",
+		       group_id, dev->name, dev->group);
+		goto free_dev;
+	}
+
+	err = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
+	if (err) {
+		pr_err("Failed to setup device (%s), error %d\n",
+		       dev->name, err);
+		goto release_group;
+	}
+
+	err = uml_vfio_read_msix_table(dev);
+	if (err) {
+		pr_err("Failed to read MSI-X table (%s), error %d\n",
+		       dev->name, err);
+		goto teardown_udev;
+	}
+
+	dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
+				      sizeof(struct uml_vfio_intr_ctx),
+				      GFP_KERNEL);
+	if (!dev->intr_ctx) {
+		pr_err("Failed to allocate interrupt context (%s)\n",
+		       dev->name);
+		goto free_msix;
+	}
+
+	for (i = 0; i < dev->udev.irq_count; i++) {
+		ctx = &dev->intr_ctx[i];
+		ctx->dev = dev;
+		ctx->irq = -1;
+	}
+
+	dev->pdev.ops = &uml_vfio_um_pci_ops;
+
+	err = um_pci_device_register(&dev->pdev);
+	if (err) {
+		pr_err("Failed to register UM PCI device (%s), error %d\n",
+		       dev->name, err);
+		goto free_intr_ctx;
+	}
+
+	return;
+
+free_intr_ctx:
+	kfree(dev->intr_ctx);
+free_msix:
+	kfree(dev->msix_data);
+teardown_udev:
+	uml_vfio_user_teardown_device(&dev->udev);
+release_group:
+	uml_vfio_release_group(dev->group);
+free_dev:
+	list_del(&dev->list);
+	kfree(dev->name);
+	kfree(dev);
+}
+
+static void uml_vfio_release_device(struct uml_vfio_device *dev)
+{
+	int i;
+
+	for (i = 0; i < dev->udev.irq_count; i++)
+		uml_vfio_deactivate_irq(dev, i);
+	uml_vfio_user_update_irqs(&dev->udev);
+
+	um_pci_device_unregister(&dev->pdev);
+	kfree(dev->intr_ctx);
+	kfree(dev->msix_data);
+	uml_vfio_user_teardown_device(&dev->udev);
+	uml_vfio_release_group(dev->group);
+	list_del(&dev->list);
+	kfree(dev->name);
+	kfree(dev);
+}
+
+static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
+{
+	struct uml_vfio_device *dev;
+	int fd;
+
+	if (uml_vfio_container.fd < 0) {
+		fd = uml_vfio_user_open_container();
+		if (fd < 0)
+			return fd;
+		uml_vfio_container.fd = fd;
+	}
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->name = kstrdup(device, GFP_KERNEL);
+	if (!dev->name) {
+		kfree(dev);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&dev->list, &uml_vfio_devices);
+	return 0;
+}
+
+static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
+{
+	return 0;
+}
+
+static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
+	.set = uml_vfio_cmdline_set,
+	.get = uml_vfio_cmdline_get,
+};
+
+device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
+__uml_help(uml_vfio_cmdline_param_ops,
+"vfio_uml.device=<domain:bus:slot.function>\n"
+"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
+"    capable devices are supported, and it is assumed that drivers will\n"
+"    use MSI-X. This parameter can be specified multiple times to pass\n"
+"    through multiple PCI devices to UML.\n\n"
+);
+
+static int __init uml_vfio_init(void)
+{
+	struct uml_vfio_device *dev, *n;
+
+	sigio_broken();
+
+	/* If the opening fails, the device will be released. */
+	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
+		uml_vfio_open_device(dev);
+
+	return 0;
+}
+late_initcall(uml_vfio_init);
+
+static void __exit uml_vfio_exit(void)
+{
+	struct uml_vfio_device *dev, *n;
+
+	list_for_each_entry_safe(dev, n, &uml_vfio_devices, list)
+		uml_vfio_release_device(dev);
+
+	if (uml_vfio_container.fd >= 0)
+		os_close_file(uml_vfio_container.fd);
+}
+module_exit(uml_vfio_exit);
diff --git a/arch/um/drivers/vfio_user.c b/arch/um/drivers/vfio_user.c
new file mode 100644
index 000000000000..6a45d8e14582
--- /dev/null
+++ b/arch/um/drivers/vfio_user.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei.btw@antgroup.com>
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <linux/limits.h>
+#include <linux/vfio.h>
+#include <linux/pci_regs.h>
+#include <as-layout.h>
+#include <um_malloc.h>
+
+#include "vfio_user.h"
+
+int uml_vfio_user_open_container(void)
+{
+	int r, fd;
+
+	fd = open("/dev/vfio/vfio", O_RDWR);
+	if (fd < 0)
+		return -errno;
+
+	r = ioctl(fd, VFIO_GET_API_VERSION);
+	if (r != VFIO_API_VERSION) {
+		r = r < 0 ? -errno : -EINVAL;
+		goto error;
+	}
+
+	r = ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+	if (r <= 0) {
+		r = r < 0 ? -errno : -EINVAL;
+		goto error;
+	}
+
+	return fd;
+
+error:
+	close(fd);
+	return r;
+}
+
+int uml_vfio_user_setup_iommu(int container)
+{
+	/*
+	 * This is a bit tricky. See the big comment in
+	 * vhost_user_set_mem_table() in virtio_uml.c.
+	 */
+	unsigned long reserved = uml_reserved - uml_physmem;
+	struct vfio_iommu_type1_dma_map dma_map = {
+		.argsz = sizeof(dma_map),
+		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+		.vaddr = uml_reserved,
+		.iova = reserved,
+		.size = physmem_size - reserved,
+	};
+
+	if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0)
+		return -errno;
+
+	if (ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map) < 0)
+		return -errno;
+
+	return 0;
+}
+
+int uml_vfio_user_get_group_id(const char *device)
+{
+	char *path, *buf, *end;
+	const char *name;
+	int r;
+
+	path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	sprintf(path, "/sys/bus/pci/devices/%s/iommu_group", device);
+
+	buf = uml_kmalloc(PATH_MAX + 1, UM_GFP_KERNEL);
+	if (!buf) {
+		r = -ENOMEM;
+		goto free_path;
+	}
+
+	r = readlink(path, buf, PATH_MAX);
+	if (r < 0) {
+		r = -errno;
+		goto free_buf;
+	}
+	buf[r] = '\0';
+
+	name = basename(buf);
+
+	r = strtoul(name, &end, 10);
+	if (*end != '\0' || end == name) {
+		r = -EINVAL;
+		goto free_buf;
+	}
+
+free_buf:
+	kfree(buf);
+free_path:
+	kfree(path);
+	return r;
+}
+
+int uml_vfio_user_open_group(int group_id)
+{
+	char *path;
+	int fd;
+
+	path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
+	if (!path)
+		return -ENOMEM;
+
+	sprintf(path, "/dev/vfio/%d", group_id);
+
+	fd = open(path, O_RDWR);
+	if (fd < 0) {
+		fd = -errno;
+		goto out;
+	}
+
+out:
+	kfree(path);
+	return fd;
+}
+
+int uml_vfio_user_set_container(int container, int group)
+{
+	if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) < 0)
+		return -errno;
+	return 0;
+}
+
+int uml_vfio_user_unset_container(int container, int group)
+{
+	if (ioctl(group, VFIO_GROUP_UNSET_CONTAINER, &container) < 0)
+		return -errno;
+	return 0;
+}
+
+static int vfio_set_irqs(int device, int start, int count, int *irqfd)
+{
+	struct vfio_irq_set *irq_set;
+	int argsz = sizeof(*irq_set) + sizeof(*irqfd) * count;
+	int err = 0;
+
+	irq_set = uml_kmalloc(argsz, UM_GFP_KERNEL);
+	if (!irq_set)
+		return -ENOMEM;
+
+	irq_set->argsz = argsz;
+	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+	irq_set->start = start;
+	irq_set->count = count;
+	memcpy(irq_set->data, irqfd, sizeof(*irqfd) * count);
+
+	if (ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set) < 0) {
+		err = -errno;
+		goto out;
+	}
+
+out:
+	kfree(irq_set);
+	return err;
+}
+
+int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev,
+			       int group, const char *device)
+{
+	struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+	struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+	int err, i;
+
+	dev->device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, device);
+	if (dev->device < 0)
+		return -errno;
+
+	if (ioctl(dev->device, VFIO_DEVICE_GET_INFO, &device_info) < 0) {
+		err = -errno;
+		goto close_device;
+	}
+
+	dev->num_regions = device_info.num_regions;
+	if (dev->num_regions > VFIO_PCI_CONFIG_REGION_INDEX + 1)
+		dev->num_regions = VFIO_PCI_CONFIG_REGION_INDEX + 1;
+
+	dev->region = uml_kmalloc(sizeof(*dev->region) * dev->num_regions,
+				  UM_GFP_KERNEL);
+	if (!dev->region) {
+		err = -ENOMEM;
+		goto close_device;
+	}
+
+	for (i = 0; i < dev->num_regions; i++) {
+		struct vfio_region_info region = {
+			.argsz = sizeof(region),
+			.index = i,
+		};
+		if (ioctl(dev->device, VFIO_DEVICE_GET_REGION_INFO, &region) < 0) {
+			err = -errno;
+			goto free_region;
+		}
+		dev->region[i].size = region.size;
+		dev->region[i].offset = region.offset;
+	}
+
+	/* Only MSI-X is supported currently. */
+	irq_info.index = VFIO_PCI_MSIX_IRQ_INDEX;
+	if (ioctl(dev->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info) < 0) {
+		err = -errno;
+		goto free_region;
+	}
+
+	dev->irq_count = irq_info.count;
+
+	dev->irqfd = uml_kmalloc(sizeof(int) * dev->irq_count, UM_GFP_KERNEL);
+	if (!dev->irqfd) {
+		err = -ENOMEM;
+		goto free_region;
+	}
+
+	memset(dev->irqfd, -1, sizeof(int) * dev->irq_count);
+
+	err = vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd);
+	if (err)
+		goto free_irqfd;
+
+	return 0;
+
+free_irqfd:
+	kfree(dev->irqfd);
+free_region:
+	kfree(dev->region);
+close_device:
+	close(dev->device);
+	return err;
+}
+
+void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev)
+{
+	kfree(dev->irqfd);
+	kfree(dev->region);
+	close(dev->device);
+}
+
+int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index)
+{
+	int irqfd;
+
+	irqfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+	if (irqfd < 0)
+		return -errno;
+
+	dev->irqfd[index] = irqfd;
+	return irqfd;
+}
+
+void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index)
+{
+	close(dev->irqfd[index]);
+	dev->irqfd[index] = -1;
+}
+
+int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev)
+{
+	return vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd);
+}
+
+static int vfio_region_read(struct uml_vfio_user_device *dev, unsigned int index,
+			    uint64_t offset, void *buf, uint64_t size)
+{
+	if (index >= dev->num_regions || offset + size > dev->region[index].size)
+		return -EINVAL;
+
+	if (pread(dev->device, buf, size, dev->region[index].offset + offset) < 0)
+		return -errno;
+
+	return 0;
+}
+
+static int vfio_region_write(struct uml_vfio_user_device *dev, unsigned int index,
+			     uint64_t offset, const void *buf, uint64_t size)
+{
+	if (index >= dev->num_regions || offset + size > dev->region[index].size)
+		return -EINVAL;
+
+	if (pwrite(dev->device, buf, size, dev->region[index].offset + offset) < 0)
+		return -errno;
+
+	return 0;
+}
+
+int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev,
+				unsigned int offset, void *buf, int size)
+{
+	return vfio_region_read(dev, VFIO_PCI_CONFIG_REGION_INDEX,
+				offset, buf, size);
+}
+
+int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev,
+				 unsigned int offset, const void *buf, int size)
+{
+	return vfio_region_write(dev, VFIO_PCI_CONFIG_REGION_INDEX,
+				 offset, buf, size);
+}
+
+int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar,
+			   unsigned int offset, void *buf, int size)
+{
+	return vfio_region_read(dev, bar, offset, buf, size);
+}
+
+int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar,
+			    unsigned int offset, const void *buf, int size)
+{
+	return vfio_region_write(dev, bar, offset, buf, size);
+}
diff --git a/arch/um/drivers/vfio_user.h b/arch/um/drivers/vfio_user.h
new file mode 100644
index 000000000000..75535e05059b
--- /dev/null
+++ b/arch/um/drivers/vfio_user.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_VFIO_USER_H
+#define __UM_VFIO_USER_H
+
+struct uml_vfio_user_device {
+	int device;
+
+	struct {
+		uint64_t size;
+		uint64_t offset;
+	} *region;
+	int num_regions;
+
+	int32_t *irqfd;
+	int irq_count;
+};
+
+int uml_vfio_user_open_container(void);
+int uml_vfio_user_setup_iommu(int container);
+
+int uml_vfio_user_get_group_id(const char *device);
+int uml_vfio_user_open_group(int group_id);
+int uml_vfio_user_set_container(int container, int group);
+int uml_vfio_user_unset_container(int container, int group);
+
+int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev,
+			       int group, const char *device);
+void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev);
+
+int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index);
+void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index);
+int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev);
+
+int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev,
+				unsigned int offset, void *buf, int size);
+int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev,
+				 unsigned int offset, const void *buf, int size);
+
+int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar,
+			   unsigned int offset, void *buf, int size);
+int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar,
+			    unsigned int offset, const void *buf, int size);
+
+#endif /* __UM_VFIO_USER_H */
-- 
cgit v1.2.3


From 7633b8b1e793e3441448c12c271fbecc53397fa9 Mon Sep 17 00:00:00 2001
From: "Jiri Slaby (SUSE)" <jirislaby@kernel.org>
Date: Tue, 15 Apr 2025 12:47:13 +0200
Subject: irqdomain: um: use irq_domain_create_linear() helper

um_pci_init() open-codes what the irq_domain_create_linear() helper
does already. Use the helper instead of open-coding it.

This needs retval checking modification.

Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: linux-um@lists.infradead.org
Link: https://patch.msgid.link/20250415104713.106819-1-jirislaby@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/virt-pci.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c
index b83b5a765d4e..0fe207ca4b72 100644
--- a/arch/um/drivers/virt-pci.c
+++ b/arch/um/drivers/virt-pci.c
@@ -538,11 +538,6 @@ void um_pci_platform_device_unregister(struct um_pci_device *dev)
 
 static int __init um_pci_init(void)
 {
-	struct irq_domain_info inner_domain_info = {
-		.size		= MAX_MSI_VECTORS,
-		.hwirq_max	= MAX_MSI_VECTORS,
-		.ops		= &um_pci_inner_domain_ops,
-	};
 	int err, i;
 
 	WARN_ON(logic_iomem_add_region(&virt_cfgspace_resource,
@@ -564,10 +559,10 @@ static int __init um_pci_init(void)
 		goto free;
 	}
 
-	inner_domain_info.fwnode = um_pci_fwnode;
-	um_pci_inner_domain = irq_domain_instantiate(&inner_domain_info);
-	if (IS_ERR(um_pci_inner_domain)) {
-		err = PTR_ERR(um_pci_inner_domain);
+	um_pci_inner_domain = irq_domain_create_linear(um_pci_fwnode, MAX_MSI_VECTORS,
+						       &um_pci_inner_domain_ops, NULL);
+	if (!um_pci_inner_domain) {
+		err = -ENOMEM;
 		goto free;
 	}
 
@@ -602,7 +597,7 @@ static int __init um_pci_init(void)
 	return 0;
 
 free:
-	if (!IS_ERR_OR_NULL(um_pci_inner_domain))
+	if (um_pci_inner_domain)
 		irq_domain_remove(um_pci_inner_domain);
 	if (um_pci_fwnode)
 		irq_domain_free_fwnode(um_pci_fwnode);
-- 
cgit v1.2.3


From 9c88156b2c81dc317297494e879f814e6574330a Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Fri, 18 Apr 2025 10:33:58 +0200
Subject: um/asm: Rename rep_nop() to native_pause()

Rename rep_nop() function to what it really does.

No functional change intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: David Laight <david.laight.linux@gmail.com>
Link: https://patch.msgid.link/20250418083436.133148-1-ubizjak@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/x86/um/asm/processor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 478710384b34..d50549e0089c 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -22,7 +22,7 @@
 #include <asm/user.h>
 
 /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static __always_inline void rep_nop(void)
+static __always_inline void native_pause(void)
 {
 	__asm__ __volatile__("rep;nop": : :"memory");
 }
@@ -33,7 +33,7 @@ static __always_inline void cpu_relax(void)
 	    time_travel_mode == TT_MODE_EXTERNAL)
 		time_travel_ndelay(1);
 	else
-		rep_nop();
+		native_pause();
 }
 
 #define task_pt_regs(t) (&(t)->thread.regs)
-- 
cgit v1.2.3


From 304c9f7f8f439083c56846c4433fbab7467eb01e Mon Sep 17 00:00:00 2001
From: Uros Bizjak <ubizjak@gmail.com>
Date: Fri, 18 Apr 2025 10:33:59 +0200
Subject: um/asm: Replace "REP; NOP" with PAUSE mnemonic

Current minimum required version of binutils is 2.25,
which supports PAUSE instruction mnemonic.

Replace "REP; NOP" with this proper mnemonic.

No functional change intended.

Signed-off-by: Uros Bizjak <ubizjak@gmail.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: David Laight <david.laight.linux@gmail.com>
Link: https://patch.msgid.link/20250418083436.133148-2-ubizjak@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/x86/um/asm/processor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index d50549e0089c..e222d2ae28fd 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -21,10 +21,10 @@
 
 #include <asm/user.h>
 
-/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
+/* PAUSE is a good thing to insert into busy-wait loops. */
 static __always_inline void native_pause(void)
 {
-	__asm__ __volatile__("rep;nop": : :"memory");
+	__asm__ __volatile__("pause": : :"memory");
 }
 
 static __always_inline void cpu_relax(void)
-- 
cgit v1.2.3


From 65eaac591b752042006d3a79c0cfba47e2a9aaac Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Sat, 3 May 2025 13:17:08 +0800
Subject: um: Remove obsolete legacy network transports

These legacy network transports were marked as obsolete in commit
40814b98a570 ("um: Mark non-vector net transports as obsolete").
More than five years have passed since then. Remove these network
transports to reduce the maintenance burden.

Suggested-by: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Acked-By: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Link: https://patch.msgid.link/20250503051710.3286595-2-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/configs/i386_defconfig           |   6 -
 arch/um/configs/x86_64_defconfig         |   6 -
 arch/um/drivers/Kconfig                  | 170 ---------------------
 arch/um/drivers/Makefile                 |  18 +--
 arch/um/drivers/daemon.h                 |  29 ----
 arch/um/drivers/daemon_kern.c            |  95 ------------
 arch/um/drivers/daemon_user.c            | 194 ------------------------
 arch/um/drivers/slip.h                   |  21 ---
 arch/um/drivers/slip_common.c            |  55 -------
 arch/um/drivers/slip_common.h            | 106 -------------
 arch/um/drivers/slip_kern.c              |  93 ------------
 arch/um/drivers/slip_user.c              | 252 -------------------------------
 arch/um/drivers/slirp.h                  |  34 -----
 arch/um/drivers/slirp_kern.c             | 120 ---------------
 arch/um/drivers/slirp_user.c             | 124 ---------------
 arch/um/drivers/umcast.h                 |  27 ----
 arch/um/drivers/umcast_kern.c            | 188 -----------------------
 arch/um/drivers/umcast_user.c            | 184 ----------------------
 arch/um/drivers/vde.h                    |  32 ----
 arch/um/drivers/vde_kern.c               | 129 ----------------
 arch/um/drivers/vde_user.c               | 125 ---------------
 arch/um/include/shared/os.h              |   1 -
 arch/um/os-Linux/Makefile                |   2 +-
 arch/um/os-Linux/drivers/Makefile        |  13 --
 arch/um/os-Linux/drivers/etap.h          |  21 ---
 arch/um/os-Linux/drivers/ethertap_kern.c | 100 ------------
 arch/um/os-Linux/drivers/ethertap_user.c | 248 ------------------------------
 arch/um/os-Linux/drivers/tuntap.h        |  21 ---
 arch/um/os-Linux/drivers/tuntap_kern.c   |  86 -----------
 arch/um/os-Linux/drivers/tuntap_user.c   | 215 --------------------------
 arch/um/os-Linux/file.c                  |  15 --
 31 files changed, 2 insertions(+), 2728 deletions(-)
 delete mode 100644 arch/um/drivers/daemon.h
 delete mode 100644 arch/um/drivers/daemon_kern.c
 delete mode 100644 arch/um/drivers/daemon_user.c
 delete mode 100644 arch/um/drivers/slip.h
 delete mode 100644 arch/um/drivers/slip_common.c
 delete mode 100644 arch/um/drivers/slip_common.h
 delete mode 100644 arch/um/drivers/slip_kern.c
 delete mode 100644 arch/um/drivers/slip_user.c
 delete mode 100644 arch/um/drivers/slirp.h
 delete mode 100644 arch/um/drivers/slirp_kern.c
 delete mode 100644 arch/um/drivers/slirp_user.c
 delete mode 100644 arch/um/drivers/umcast.h
 delete mode 100644 arch/um/drivers/umcast_kern.c
 delete mode 100644 arch/um/drivers/umcast_user.c
 delete mode 100644 arch/um/drivers/vde.h
 delete mode 100644 arch/um/drivers/vde_kern.c
 delete mode 100644 arch/um/drivers/vde_user.c
 delete mode 100644 arch/um/os-Linux/drivers/Makefile
 delete mode 100644 arch/um/os-Linux/drivers/etap.h
 delete mode 100644 arch/um/os-Linux/drivers/ethertap_kern.c
 delete mode 100644 arch/um/os-Linux/drivers/ethertap_user.c
 delete mode 100644 arch/um/os-Linux/drivers/tuntap.h
 delete mode 100644 arch/um/os-Linux/drivers/tuntap_kern.c
 delete mode 100644 arch/um/os-Linux/drivers/tuntap_user.c

(limited to 'arch')

diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index 1ffa088739f4..7c8999f18f2d 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -53,12 +53,6 @@ CONFIG_UNIX=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
 CONFIG_UML_NET=y
-CONFIG_UML_NET_ETHERTAP=y
-CONFIG_UML_NET_TUNTAP=y
-CONFIG_UML_NET_SLIP=y
-CONFIG_UML_NET_DAEMON=y
-CONFIG_UML_NET_MCAST=y
-CONFIG_UML_NET_SLIRP=y
 CONFIG_EXT4_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 03b10d3f6816..9858d989777e 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -52,12 +52,6 @@ CONFIG_UNIX=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
 CONFIG_UML_NET=y
-CONFIG_UML_NET_ETHERTAP=y
-CONFIG_UML_NET_TUNTAP=y
-CONFIG_UML_NET_SLIP=y
-CONFIG_UML_NET_DAEMON=y
-CONFIG_UML_NET_MCAST=y
-CONFIG_UML_NET_SLIRP=y
 CONFIG_EXT4_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index d7bb447ff958..bc68c3e341f5 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -143,99 +143,6 @@ config UML_NET
 	  enable at least one of the following transport options to actually
 	  make use of UML networking.
 
-config UML_NET_ETHERTAP
-	bool "Ethertap transport (obsolete)"
-	depends on UML_NET
-	help
-	  The Ethertap User-Mode Linux network transport allows a single
-	  running UML to exchange packets with its host over one of the
-	  host's Ethertap devices, such as /dev/tap0.  Additional running
-	  UMLs can use additional Ethertap devices, one per running UML.
-	  While the UML believes it's on a (multi-device, broadcast) virtual
-	  Ethernet network, it's in fact communicating over a point-to-point
-	  link with the host.
-
-	  To use this, your host kernel must have support for Ethertap
-	  devices.  Also, if your host kernel is 2.4.x, it must have
-	  CONFIG_NETLINK_DEV configured as Y or M.
-
-	  For more information, see
-	  <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
-	  has examples of the UML command line to use to enable Ethertap
-	  networking.
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-config UML_NET_TUNTAP
-	bool "TUN/TAP transport (obsolete)"
-	depends on UML_NET
-	help
-	  The UML TUN/TAP network transport allows a UML instance to exchange
-	  packets with the host over a TUN/TAP device.  This option will only
-	  work with a 2.4 host, unless you've applied the TUN/TAP patch to
-	  your 2.2 host kernel.
-
-	  To use this transport, your host kernel must have support for TUN/TAP
-	  devices, either built-in or as a module.
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-config UML_NET_SLIP
-	bool "SLIP transport (obsolete)"
-	depends on UML_NET
-	help
-	  The slip User-Mode Linux network transport allows a running UML to
-	  network with its host over a point-to-point link.  Unlike Ethertap,
-	  which can carry any Ethernet frame (and hence even non-IP packets),
-	  the slip transport can only carry IP packets.
-
-	  To use this, your host must support slip devices.
-
-	  For more information, see
-	  <http://user-mode-linux.sourceforge.net/old/networking.html>.
-	  has examples of the UML command line to use to enable slip
-	  networking, and details of a few quirks with it.
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-config UML_NET_DAEMON
-	bool "Daemon transport (obsolete)"
-	depends on UML_NET
-	help
-	  This User-Mode Linux network transport allows one or more running
-	  UMLs on a single host to communicate with each other, but not to
-	  the host.
-
-	  To use this form of networking, you'll need to run the UML
-	  networking daemon on the host.
-
-	  For more information, see
-	  <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
-	  has examples of the UML command line to use to enable Daemon
-	  networking.
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-config UML_NET_DAEMON_DEFAULT_SOCK
-	string "Default socket for daemon transport"
-	default "/tmp/uml.ctl"
-	depends on UML_NET_DAEMON
-	help
-	  This option allows setting the default socket for the daemon
-	  transport, normally it defaults to /tmp/uml.ctl.
-
 config UML_NET_VECTOR
 	bool "Vector I/O high performance network devices"
 	depends on UML_NET
@@ -248,83 +155,6 @@ config UML_NET_VECTOR
 	  with up to 4 times higher network throughput than the UML network
 	  drivers.
 
-config UML_NET_VDE
-	bool "VDE transport (obsolete)"
-	depends on UML_NET
-	depends on !MODVERSIONS
-	select MAY_HAVE_RUNTIME_DEPS
-	help
-	  This User-Mode Linux network transport allows one or more running
-	  UMLs on a single host to communicate with each other and also
-	  with the rest of the world using Virtual Distributed Ethernet,
-	  an improved fork of uml_switch.
-
-	  You must have libvdeplug installed in order to build the vde
-	  transport into UML.
-
-	  To use this form of networking, you will need to run vde_switch
-	  on the host.
-
-	  For more information, see <http://wiki.virtualsquare.org/>
-	  That site has a good overview of what VDE is and also examples
-	  of the UML command line to use to enable VDE networking.
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-config UML_NET_MCAST
-	bool "Multicast transport (obsolete)"
-	depends on UML_NET
-	help
-	  This Multicast User-Mode Linux network transport allows multiple
-	  UMLs (even ones running on different host machines!) to talk to
-	  each other over a virtual ethernet network.  However, it requires
-	  at least one UML with one of the other transports to act as a
-	  bridge if any of them need to be able to talk to their hosts or any
-	  other IP machines.
-
-	  To use this, your host kernel(s) must support IP Multicasting.
-
-	  For more information, see
-	  <http://user-mode-linux.sourceforge.net/old/networking.html>  That site
-	  has examples of the UML command line to use to enable Multicast
-	  networking, and notes about the security of this approach.
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-config UML_NET_SLIRP
-	bool "SLiRP transport (obsolete)"
-	depends on UML_NET
-	help
-	  The SLiRP User-Mode Linux network transport allows a running UML
-	  to network by invoking a program that can handle SLIP encapsulated
-	  packets.  This is commonly (but not limited to) the application
-	  known as SLiRP, a program that can re-socket IP packets back onto
-	  he host on which it is run.  Only IP packets are supported,
-	  unlike other network transports that can handle all Ethernet
-	  frames.  In general, slirp allows the UML the same IP connectivity
-	  to the outside world that the host user is permitted, and unlike
-	  other transports, SLiRP works without the need of root level
-	  privileges, setuid binaries, or SLIP devices on the host.  This
-	  also means not every type of connection is possible, but most
-	  situations can be accommodated with carefully crafted slirp
-	  commands that can be passed along as part of the network device's
-	  setup string.  The effect of this transport on the UML is similar
-	  that of a host behind a firewall that masquerades all network
-	  connections passing through it (but is less secure).
-
-	  NOTE: THIS TRANSPORT IS DEPRECATED AND WILL BE REMOVED SOON!!! Please
-	  migrate to UML_NET_VECTOR.
-
-	  If unsure, say N.
-
-	  Startup example: "eth0=slirp,FE:FD:01:02:03:04,/usr/local/bin/slirp"
-
 endmenu
 
 config VIRTIO_UML
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 336be56b8975..6dfc2e9c0ce8 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -6,11 +6,7 @@
 # pcap is broken in 2.5 because kbuild doesn't allow pcap.a to be linked
 # in to pcap.o
 
-slip-objs := slip_kern.o slip_user.o
-slirp-objs := slirp_kern.o slirp_user.o
-daemon-objs := daemon_kern.o daemon_user.o
 vector-objs := vector_kern.o vector_user.o vector_transports.o
-umcast-objs := umcast_kern.o umcast_user.o
 net-objs := net_kern.o net_user.o
 mconsole-objs := mconsole_kern.o mconsole_user.o
 hostaudio-objs := hostaudio_kern.o
@@ -21,13 +17,6 @@ harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o
 rtc-objs := rtc_kern.o rtc_user.o
 vfio_uml-objs := vfio_kern.o vfio_user.o
 
-LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a)
-
-targets := vde_kern.o vde_user.o
-
-$(obj)/vde.o: $(obj)/vde_kern.o $(obj)/vde_user.o
-	$(LD) -r -dp -o $@ $^ $(ld_flags)
-
 #XXX: The call below does not work because the flags are added before the
 # object name, so nothing from the library gets linked.
 #$(call if_changed,ld)
@@ -39,12 +28,7 @@ obj-y := stdio_console.o fd.o chan_kern.o chan_user.o line.o
 obj-$(CONFIG_SSL) += ssl.o
 obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o
 
-obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
-obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
-obj-$(CONFIG_UML_NET_DAEMON) += daemon.o 
 obj-$(CONFIG_UML_NET_VECTOR) += vector.o
-obj-$(CONFIG_UML_NET_VDE) += vde.o
-obj-$(CONFIG_UML_NET_MCAST) += umcast.o
 obj-$(CONFIG_UML_NET) += net.o 
 obj-$(CONFIG_MCONSOLE) += mconsole.o
 obj-$(CONFIG_MMAPPER) += mmapper_kern.o 
@@ -66,7 +50,7 @@ obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o
 obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o
 
 # pcap_user.o must be added explicitly.
-USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o
+USER_OBJS := fd.o null.o pty.o tty.o xterm.o vector_user.o
 CFLAGS_null.o = -DDEV_NULL=$(DEV_NULL_PATH)
 
 CFLAGS_xterm.o += '-DCONFIG_XTERM_CHAN_DEFAULT_EMULATOR="$(CONFIG_XTERM_CHAN_DEFAULT_EMULATOR)"'
diff --git a/arch/um/drivers/daemon.h b/arch/um/drivers/daemon.h
deleted file mode 100644
index 1509cc7eb907..000000000000
--- a/arch/um/drivers/daemon.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __DAEMON_H__
-#define __DAEMON_H__
-
-#include <net_user.h>
-
-#define SWITCH_VERSION 3
-
-struct daemon_data {
-	char *sock_type;
-	char *ctl_sock;
-	void *ctl_addr;
-	void *data_addr;
-	void *local_addr;
-	int fd;
-	int control;
-	void *dev;
-};
-
-extern const struct net_user_info daemon_user_info;
-
-extern int daemon_user_write(int fd, void *buf, int len,
-			     struct daemon_data *pri);
-
-#endif
diff --git a/arch/um/drivers/daemon_kern.c b/arch/um/drivers/daemon_kern.c
deleted file mode 100644
index afde1e82c056..000000000000
--- a/arch/um/drivers/daemon_kern.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 by various other people who didn't put their name here.
- */
-
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <net_kern.h>
-#include "daemon.h"
-
-struct daemon_init {
-	char *sock_type;
-	char *ctl_sock;
-};
-
-static void daemon_init(struct net_device *dev, void *data)
-{
-	struct uml_net_private *pri;
-	struct daemon_data *dpri;
-	struct daemon_init *init = data;
-
-	pri = netdev_priv(dev);
-	dpri = (struct daemon_data *) pri->user;
-	dpri->sock_type = init->sock_type;
-	dpri->ctl_sock = init->ctl_sock;
-	dpri->fd = -1;
-	dpri->control = -1;
-	dpri->dev = dev;
-	/* We will free this pointer. If it contains crap we're burned. */
-	dpri->ctl_addr = NULL;
-	dpri->data_addr = NULL;
-	dpri->local_addr = NULL;
-
-	printk("daemon backend (uml_switch version %d) - %s:%s",
-	       SWITCH_VERSION, dpri->sock_type, dpri->ctl_sock);
-	printk("\n");
-}
-
-static int daemon_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return net_recvfrom(fd, skb_mac_header(skb),
-			    skb->dev->mtu + ETH_HEADER_OTHER);
-}
-
-static int daemon_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return daemon_user_write(fd, skb->data, skb->len,
-				 (struct daemon_data *) &lp->user);
-}
-
-static const struct net_kern_info daemon_kern_info = {
-	.init			= daemon_init,
-	.protocol		= eth_protocol,
-	.read			= daemon_read,
-	.write			= daemon_write,
-};
-
-static int daemon_setup(char *str, char **mac_out, void *data)
-{
-	struct daemon_init *init = data;
-	char *remain;
-
-	*init = ((struct daemon_init)
-		{ .sock_type 		= "unix",
-		  .ctl_sock 		= CONFIG_UML_NET_DAEMON_DEFAULT_SOCK });
-
-	remain = split_if_spec(str, mac_out, &init->sock_type, &init->ctl_sock,
-			       NULL);
-	if (remain != NULL)
-		printk(KERN_WARNING "daemon_setup : Ignoring data socket "
-		       "specification\n");
-
-	return 1;
-}
-
-static struct transport daemon_transport = {
-	.list 		= LIST_HEAD_INIT(daemon_transport.list),
-	.name 		= "daemon",
-	.setup  	= daemon_setup,
-	.user 		= &daemon_user_info,
-	.kern 		= &daemon_kern_info,
-	.private_size 	= sizeof(struct daemon_data),
-	.setup_size 	= sizeof(struct daemon_init),
-};
-
-static int register_daemon(void)
-{
-	register_transport(&daemon_transport);
-	return 0;
-}
-
-late_initcall(register_daemon);
diff --git a/arch/um/drivers/daemon_user.c b/arch/um/drivers/daemon_user.c
deleted file mode 100644
index 785baedc3555..000000000000
--- a/arch/um/drivers/daemon_user.c
+++ /dev/null
@@ -1,194 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 by various other people who didn't put their name here.
- */
-
-#include <stdint.h>
-#include <string.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/time.h>
-#include <sys/un.h>
-#include "daemon.h"
-#include <net_user.h>
-#include <os.h>
-#include <um_malloc.h>
-
-enum request_type { REQ_NEW_CONTROL };
-
-#define SWITCH_MAGIC 0xfeedface
-
-struct request_v3 {
-	uint32_t magic;
-	uint32_t version;
-	enum request_type type;
-	struct sockaddr_un sock;
-};
-
-static struct sockaddr_un *new_addr(void *name, int len)
-{
-	struct sockaddr_un *sun;
-
-	sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL);
-	if (sun == NULL) {
-		printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un "
-		       "failed\n");
-		return NULL;
-	}
-	sun->sun_family = AF_UNIX;
-	memcpy(sun->sun_path, name, len);
-	return sun;
-}
-
-static int connect_to_switch(struct daemon_data *pri)
-{
-	struct sockaddr_un *ctl_addr = pri->ctl_addr;
-	struct sockaddr_un *local_addr = pri->local_addr;
-	struct sockaddr_un *sun;
-	struct request_v3 req;
-	int fd, n, err;
-
-	pri->control = socket(AF_UNIX, SOCK_STREAM, 0);
-	if (pri->control < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "daemon_open : control socket failed, "
-		       "errno = %d\n", -err);
-		return err;
-	}
-
-	if (connect(pri->control, (struct sockaddr *) ctl_addr,
-		   sizeof(*ctl_addr)) < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "daemon_open : control connect failed, "
-		       "errno = %d\n", -err);
-		goto out;
-	}
-
-	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
-	if (fd < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "daemon_open : data socket failed, "
-		       "errno = %d\n", -err);
-		goto out;
-	}
-	if (bind(fd, (struct sockaddr *) local_addr, sizeof(*local_addr)) < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "daemon_open : data bind failed, "
-		       "errno = %d\n", -err);
-		goto out_close;
-	}
-
-	sun = uml_kmalloc(sizeof(struct sockaddr_un), UM_GFP_KERNEL);
-	if (sun == NULL) {
-		printk(UM_KERN_ERR "new_addr: allocation of sockaddr_un "
-		       "failed\n");
-		err = -ENOMEM;
-		goto out_close;
-	}
-
-	req.magic = SWITCH_MAGIC;
-	req.version = SWITCH_VERSION;
-	req.type = REQ_NEW_CONTROL;
-	req.sock = *local_addr;
-	n = write(pri->control, &req, sizeof(req));
-	if (n != sizeof(req)) {
-		printk(UM_KERN_ERR "daemon_open : control setup request "
-		       "failed, err = %d\n", -errno);
-		err = -ENOTCONN;
-		goto out_free;
-	}
-
-	n = read(pri->control, sun, sizeof(*sun));
-	if (n != sizeof(*sun)) {
-		printk(UM_KERN_ERR "daemon_open : read of data socket failed, "
-		       "err = %d\n", -errno);
-		err = -ENOTCONN;
-		goto out_free;
-	}
-
-	pri->data_addr = sun;
-	return fd;
-
- out_free:
-	kfree(sun);
- out_close:
-	close(fd);
- out:
-	close(pri->control);
-	return err;
-}
-
-static int daemon_user_init(void *data, void *dev)
-{
-	struct daemon_data *pri = data;
-	struct timeval tv;
-	struct {
-		char zero;
-		int pid;
-		int usecs;
-	} name;
-
-	if (!strcmp(pri->sock_type, "unix"))
-		pri->ctl_addr = new_addr(pri->ctl_sock,
-					 strlen(pri->ctl_sock) + 1);
-	name.zero = 0;
-	name.pid = os_getpid();
-	gettimeofday(&tv, NULL);
-	name.usecs = tv.tv_usec;
-	pri->local_addr = new_addr(&name, sizeof(name));
-	pri->dev = dev;
-	pri->fd = connect_to_switch(pri);
-	if (pri->fd < 0) {
-		kfree(pri->local_addr);
-		pri->local_addr = NULL;
-		return pri->fd;
-	}
-
-	return 0;
-}
-
-static int daemon_open(void *data)
-{
-	struct daemon_data *pri = data;
-	return pri->fd;
-}
-
-static void daemon_remove(void *data)
-{
-	struct daemon_data *pri = data;
-
-	close(pri->fd);
-	pri->fd = -1;
-	close(pri->control);
-	pri->control = -1;
-
-	kfree(pri->data_addr);
-	pri->data_addr = NULL;
-	kfree(pri->ctl_addr);
-	pri->ctl_addr = NULL;
-	kfree(pri->local_addr);
-	pri->local_addr = NULL;
-}
-
-int daemon_user_write(int fd, void *buf, int len, struct daemon_data *pri)
-{
-	struct sockaddr_un *data_addr = pri->data_addr;
-
-	return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr));
-}
-
-const struct net_user_info daemon_user_info = {
-	.init		= daemon_user_init,
-	.open		= daemon_open,
-	.close	 	= NULL,
-	.remove	 	= daemon_remove,
-	.add_address	= NULL,
-	.delete_address = NULL,
-	.mtu		= ETH_MAX_PACKET,
-	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER,
-};
diff --git a/arch/um/drivers/slip.h b/arch/um/drivers/slip.h
deleted file mode 100644
index 0f3b7ca99465..000000000000
--- a/arch/um/drivers/slip.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __UM_SLIP_H
-#define __UM_SLIP_H
-
-#include "slip_common.h"
-
-struct slip_data {
-	void *dev;
-	char name[sizeof("slnnnnn\0")];
-	char *addr;
-	char *gate_addr;
-	int slave;
-	struct slip_proto slip;
-};
-
-extern const struct net_user_info slip_user_info;
-
-extern int slip_user_read(int fd, void *buf, int len, struct slip_data *pri);
-extern int slip_user_write(int fd, void *buf, int len, struct slip_data *pri);
-
-#endif
diff --git a/arch/um/drivers/slip_common.c b/arch/um/drivers/slip_common.c
deleted file mode 100644
index 20fe4f42743d..000000000000
--- a/arch/um/drivers/slip_common.c
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <string.h>
-#include "slip_common.h"
-#include <net_user.h>
-
-int slip_proto_read(int fd, void *buf, int len, struct slip_proto *slip)
-{
-	int i, n, size, start;
-
-	if(slip->more > 0){
-		i = 0;
-		while(i < slip->more){
-			size = slip_unesc(slip->ibuf[i++], slip->ibuf,
-					  &slip->pos, &slip->esc);
-			if(size){
-				memcpy(buf, slip->ibuf, size);
-				memmove(slip->ibuf, &slip->ibuf[i],
-					slip->more - i);
-				slip->more = slip->more - i;
-				return size;
-			}
-		}
-		slip->more = 0;
-	}
-
-	n = net_read(fd, &slip->ibuf[slip->pos],
-		     sizeof(slip->ibuf) - slip->pos);
-	if(n <= 0)
-		return n;
-
-	start = slip->pos;
-	for(i = 0; i < n; i++){
-		size = slip_unesc(slip->ibuf[start + i], slip->ibuf,&slip->pos,
-				  &slip->esc);
-		if(size){
-			memcpy(buf, slip->ibuf, size);
-			memmove(slip->ibuf, &slip->ibuf[start+i+1],
-				n - (i + 1));
-			slip->more = n - (i + 1);
-			return size;
-		}
-	}
-	return 0;
-}
-
-int slip_proto_write(int fd, void *buf, int len, struct slip_proto *slip)
-{
-	int actual, n;
-
-	actual = slip_esc(buf, slip->obuf, len);
-	n = net_write(fd, slip->obuf, actual);
-	if(n < 0)
-		return n;
-	else return len;
-}
diff --git a/arch/um/drivers/slip_common.h b/arch/um/drivers/slip_common.h
deleted file mode 100644
index d3798b5caf7f..000000000000
--- a/arch/um/drivers/slip_common.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __UM_SLIP_COMMON_H
-#define __UM_SLIP_COMMON_H
-
-#define BUF_SIZE 1500
- /* two bytes each for a (pathological) max packet of escaped chars +  *
-  * terminating END char + initial END char                            */
-#define ENC_BUF_SIZE (2 * BUF_SIZE + 2)
-
-/* SLIP protocol characters. */
-#define SLIP_END             0300	/* indicates end of frame	*/
-#define SLIP_ESC             0333	/* indicates byte stuffing	*/
-#define SLIP_ESC_END         0334	/* ESC ESC_END means END 'data'	*/
-#define SLIP_ESC_ESC         0335	/* ESC ESC_ESC means ESC 'data'	*/
-
-static inline int slip_unesc(unsigned char c, unsigned char *buf, int *pos,
-                             int *esc)
-{
-	int ret;
-
-	switch(c){
-	case SLIP_END:
-		*esc = 0;
-		ret=*pos;
-		*pos=0;
-		return(ret);
-	case SLIP_ESC:
-		*esc = 1;
-		return(0);
-	case SLIP_ESC_ESC:
-		if(*esc){
-			*esc = 0;
-			c = SLIP_ESC;
-		}
-		break;
-	case SLIP_ESC_END:
-		if(*esc){
-			*esc = 0;
-			c = SLIP_END;
-		}
-		break;
-	}
-	buf[(*pos)++] = c;
-	return(0);
-}
-
-static inline int slip_esc(unsigned char *s, unsigned char *d, int len)
-{
-	unsigned char *ptr = d;
-	unsigned char c;
-
-	/*
-	 * Send an initial END character to flush out any
-	 * data that may have accumulated in the receiver
-	 * due to line noise.
-	 */
-
-	*ptr++ = SLIP_END;
-
-	/*
-	 * For each byte in the packet, send the appropriate
-	 * character sequence, according to the SLIP protocol.
-	 */
-
-	while (len-- > 0) {
-		switch(c = *s++) {
-		case SLIP_END:
-			*ptr++ = SLIP_ESC;
-			*ptr++ = SLIP_ESC_END;
-			break;
-		case SLIP_ESC:
-			*ptr++ = SLIP_ESC;
-			*ptr++ = SLIP_ESC_ESC;
-			break;
-		default:
-			*ptr++ = c;
-			break;
-		}
-	}
-	*ptr++ = SLIP_END;
-	return (ptr - d);
-}
-
-struct slip_proto {
-	unsigned char ibuf[ENC_BUF_SIZE];
-	unsigned char obuf[ENC_BUF_SIZE];
-	int more; /* more data: do not read fd until ibuf has been drained */
-	int pos;
-	int esc;
-};
-
-static inline void slip_proto_init(struct slip_proto * slip)
-{
-	memset(slip->ibuf, 0, sizeof(slip->ibuf));
-	memset(slip->obuf, 0, sizeof(slip->obuf));
-	slip->more = 0;
-	slip->pos = 0;
-	slip->esc = 0;
-}
-
-extern int slip_proto_read(int fd, void *buf, int len,
-			   struct slip_proto *slip);
-extern int slip_proto_write(int fd, void *buf, int len,
-			    struct slip_proto *slip);
-
-#endif
diff --git a/arch/um/drivers/slip_kern.c b/arch/um/drivers/slip_kern.c
deleted file mode 100644
index c58ccdcc16d6..000000000000
--- a/arch/um/drivers/slip_kern.c
+++ /dev/null
@@ -1,93 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <linux/if_arp.h>
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <net_kern.h>
-#include "slip.h"
-
-struct slip_init {
-	char *gate_addr;
-};
-
-static void slip_init(struct net_device *dev, void *data)
-{
-	struct uml_net_private *private;
-	struct slip_data *spri;
-	struct slip_init *init = data;
-
-	private = netdev_priv(dev);
-	spri = (struct slip_data *) private->user;
-
-	memset(spri->name, 0, sizeof(spri->name));
-	spri->addr = NULL;
-	spri->gate_addr = init->gate_addr;
-	spri->slave = -1;
-	spri->dev = dev;
-
-	slip_proto_init(&spri->slip);
-
-	dev->hard_header_len = 0;
-	dev->header_ops = NULL;
-	dev->addr_len = 0;
-	dev->type = ARPHRD_SLIP;
-	dev->tx_queue_len = 256;
-	dev->flags = IFF_NOARP;
-	printk("SLIP backend - SLIP IP = %s\n", spri->gate_addr);
-}
-
-static unsigned short slip_protocol(struct sk_buff *skbuff)
-{
-	return htons(ETH_P_IP);
-}
-
-static int slip_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return slip_user_read(fd, skb_mac_header(skb), skb->dev->mtu,
-			      (struct slip_data *) &lp->user);
-}
-
-static int slip_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return slip_user_write(fd, skb->data, skb->len,
-			       (struct slip_data *) &lp->user);
-}
-
-static const struct net_kern_info slip_kern_info = {
-	.init			= slip_init,
-	.protocol		= slip_protocol,
-	.read			= slip_read,
-	.write			= slip_write,
-};
-
-static int slip_setup(char *str, char **mac_out, void *data)
-{
-	struct slip_init *init = data;
-
-	*init = ((struct slip_init) { .gate_addr = NULL });
-
-	if (str[0] != '\0')
-		init->gate_addr = str;
-	return 1;
-}
-
-static struct transport slip_transport = {
-	.list 		= LIST_HEAD_INIT(slip_transport.list),
-	.name 		= "slip",
-	.setup  	= slip_setup,
-	.user 		= &slip_user_info,
-	.kern 		= &slip_kern_info,
-	.private_size 	= sizeof(struct slip_data),
-	.setup_size 	= sizeof(struct slip_init),
-};
-
-static int register_slip(void)
-{
-	register_transport(&slip_transport);
-	return 0;
-}
-
-late_initcall(register_slip);
diff --git a/arch/um/drivers/slip_user.c b/arch/um/drivers/slip_user.c
deleted file mode 100644
index 7334019c9e60..000000000000
--- a/arch/um/drivers/slip_user.c
+++ /dev/null
@@ -1,252 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <termios.h>
-#include <sys/wait.h>
-#include <net_user.h>
-#include <os.h>
-#include "slip.h"
-#include <um_malloc.h>
-
-static int slip_user_init(void *data, void *dev)
-{
-	struct slip_data *pri = data;
-
-	pri->dev = dev;
-	return 0;
-}
-
-static int set_up_tty(int fd)
-{
-	int i;
-	struct termios tios;
-
-	if (tcgetattr(fd, &tios) < 0) {
-		printk(UM_KERN_ERR "could not get initial terminal "
-		       "attributes\n");
-		return -1;
-	}
-
-	tios.c_cflag = CS8 | CREAD | HUPCL | CLOCAL;
-	tios.c_iflag = IGNBRK | IGNPAR;
-	tios.c_oflag = 0;
-	tios.c_lflag = 0;
-	for (i = 0; i < NCCS; i++)
-		tios.c_cc[i] = 0;
-	tios.c_cc[VMIN] = 1;
-	tios.c_cc[VTIME] = 0;
-
-	cfsetospeed(&tios, B38400);
-	cfsetispeed(&tios, B38400);
-
-	if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) {
-		printk(UM_KERN_ERR "failed to set terminal attributes\n");
-		return -1;
-	}
-	return 0;
-}
-
-struct slip_pre_exec_data {
-	int stdin_fd;
-	int stdout_fd;
-	int close_me;
-};
-
-static void slip_pre_exec(void *arg)
-{
-	struct slip_pre_exec_data *data = arg;
-
-	if (data->stdin_fd >= 0)
-		dup2(data->stdin_fd, 0);
-	dup2(data->stdout_fd, 1);
-	if (data->close_me >= 0)
-		close(data->close_me);
-}
-
-static int slip_tramp(char **argv, int fd)
-{
-	struct slip_pre_exec_data pe_data;
-	char *output;
-	int pid, fds[2], err, output_len;
-
-	err = os_pipe(fds, 1, 0);
-	if (err < 0) {
-		printk(UM_KERN_ERR "slip_tramp : pipe failed, err = %d\n",
-		       -err);
-		goto out;
-	}
-
-	err = 0;
-	pe_data.stdin_fd = fd;
-	pe_data.stdout_fd = fds[1];
-	pe_data.close_me = fds[0];
-	err = run_helper(slip_pre_exec, &pe_data, argv);
-	if (err < 0)
-		goto out_close;
-	pid = err;
-
-	output_len = UM_KERN_PAGE_SIZE;
-	output = uml_kmalloc(output_len, UM_GFP_KERNEL);
-	if (output == NULL) {
-		printk(UM_KERN_ERR "slip_tramp : failed to allocate output "
-		       "buffer\n");
-		os_kill_process(pid, 1);
-		err = -ENOMEM;
-		goto out_close;
-	}
-
-	close(fds[1]);
-	read_output(fds[0], output, output_len);
-	printk("%s", output);
-
-	err = helper_wait(pid);
-	close(fds[0]);
-
-	kfree(output);
-	return err;
-
-out_close:
-	close(fds[0]);
-	close(fds[1]);
-out:
-	return err;
-}
-
-static int slip_open(void *data)
-{
-	struct slip_data *pri = data;
-	char version_buf[sizeof("nnnnn\0")];
-	char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")];
-	char *argv[] = { "uml_net", version_buf, "slip", "up", gate_buf,
-			 NULL };
-	int sfd, mfd, err;
-
-	err = get_pty();
-	if (err < 0) {
-		printk(UM_KERN_ERR "slip-open : Failed to open pty, err = %d\n",
-		       -err);
-		goto out;
-	}
-	mfd = err;
-
-	err = open(ptsname(mfd), O_RDWR, 0);
-	if (err < 0) {
-		printk(UM_KERN_ERR "Couldn't open tty for slip line, "
-		       "err = %d\n", -err);
-		goto out_close;
-	}
-	sfd = err;
-
-	err = set_up_tty(sfd);
-	if (err)
-		goto out_close2;
-
-	pri->slave = sfd;
-	pri->slip.pos = 0;
-	pri->slip.esc = 0;
-	if (pri->gate_addr != NULL) {
-		sprintf(version_buf, "%d", UML_NET_VERSION);
-		strcpy(gate_buf, pri->gate_addr);
-
-		err = slip_tramp(argv, sfd);
-
-		if (err < 0) {
-			printk(UM_KERN_ERR "slip_tramp failed - err = %d\n",
-			       -err);
-			goto out_close2;
-		}
-		err = os_get_ifname(pri->slave, pri->name);
-		if (err < 0) {
-			printk(UM_KERN_ERR "get_ifname failed, err = %d\n",
-			       -err);
-			goto out_close2;
-		}
-		iter_addresses(pri->dev, open_addr, pri->name);
-	}
-	else {
-		err = os_set_slip(sfd);
-		if (err < 0) {
-			printk(UM_KERN_ERR "Failed to set slip discipline "
-			       "encapsulation - err = %d\n", -err);
-			goto out_close2;
-		}
-	}
-	return mfd;
-out_close2:
-	close(sfd);
-out_close:
-	close(mfd);
-out:
-	return err;
-}
-
-static void slip_close(int fd, void *data)
-{
-	struct slip_data *pri = data;
-	char version_buf[sizeof("nnnnn\0")];
-	char *argv[] = { "uml_net", version_buf, "slip", "down", pri->name,
-			 NULL };
-	int err;
-
-	if (pri->gate_addr != NULL)
-		iter_addresses(pri->dev, close_addr, pri->name);
-
-	sprintf(version_buf, "%d", UML_NET_VERSION);
-
-	err = slip_tramp(argv, pri->slave);
-
-	if (err != 0)
-		printk(UM_KERN_ERR "slip_tramp failed - errno = %d\n", -err);
-	close(fd);
-	close(pri->slave);
-	pri->slave = -1;
-}
-
-int slip_user_read(int fd, void *buf, int len, struct slip_data *pri)
-{
-	return slip_proto_read(fd, buf, len, &pri->slip);
-}
-
-int slip_user_write(int fd, void *buf, int len, struct slip_data *pri)
-{
-	return slip_proto_write(fd, buf, len, &pri->slip);
-}
-
-static void slip_add_addr(unsigned char *addr, unsigned char *netmask,
-			  void *data)
-{
-	struct slip_data *pri = data;
-
-	if (pri->slave < 0)
-		return;
-	open_addr(addr, netmask, pri->name);
-}
-
-static void slip_del_addr(unsigned char *addr, unsigned char *netmask,
-			    void *data)
-{
-	struct slip_data *pri = data;
-
-	if (pri->slave < 0)
-		return;
-	close_addr(addr, netmask, pri->name);
-}
-
-const struct net_user_info slip_user_info = {
-	.init		= slip_user_init,
-	.open		= slip_open,
-	.close	 	= slip_close,
-	.remove	 	= NULL,
-	.add_address	= slip_add_addr,
-	.delete_address = slip_del_addr,
-	.mtu		= BUF_SIZE,
-	.max_packet	= BUF_SIZE,
-};
diff --git a/arch/um/drivers/slirp.h b/arch/um/drivers/slirp.h
deleted file mode 100644
index 4aef2b88249a..000000000000
--- a/arch/um/drivers/slirp.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __UM_SLIRP_H
-#define __UM_SLIRP_H
-
-#include "slip_common.h"
-
-#define SLIRP_MAX_ARGS 100
-/*
- * XXX this next definition is here because I don't understand why this
- * initializer doesn't work in slirp_kern.c:
- *
- *   argv :  { init->argv[ 0 ... SLIRP_MAX_ARGS-1 ] },
- *
- * or why I can't typecast like this:
- *
- *   argv :  (char* [SLIRP_MAX_ARGS])(init->argv), 
- */
-struct arg_list_dummy_wrapper { char *argv[SLIRP_MAX_ARGS]; };
-
-struct slirp_data {
-	void *dev;
-	struct arg_list_dummy_wrapper argw;
-	int pid;
-	int slave;
-	struct slip_proto slip;
-};
-
-extern const struct net_user_info slirp_user_info;
-
-extern int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri);
-extern int slirp_user_write(int fd, void *buf, int len,
-			    struct slirp_data *pri);
-
-#endif
diff --git a/arch/um/drivers/slirp_kern.c b/arch/um/drivers/slirp_kern.c
deleted file mode 100644
index 0a6151ee9572..000000000000
--- a/arch/um/drivers/slirp_kern.c
+++ /dev/null
@@ -1,120 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <linux/if_arp.h>
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <linux/string.h>
-#include <net_kern.h>
-#include <net_user.h>
-#include "slirp.h"
-
-struct slirp_init {
-	struct arg_list_dummy_wrapper argw;  /* XXX should be simpler... */
-};
-
-static void slirp_init(struct net_device *dev, void *data)
-{
-	struct uml_net_private *private;
-	struct slirp_data *spri;
-	struct slirp_init *init = data;
-	int i;
-
-	private = netdev_priv(dev);
-	spri = (struct slirp_data *) private->user;
-
-	spri->argw = init->argw;
-	spri->pid = -1;
-	spri->slave = -1;
-	spri->dev = dev;
-
-	slip_proto_init(&spri->slip);
-
-	dev->hard_header_len = 0;
-	dev->header_ops = NULL;
-	dev->addr_len = 0;
-	dev->type = ARPHRD_SLIP;
-	dev->tx_queue_len = 256;
-	dev->flags = IFF_NOARP;
-	printk("SLIRP backend - command line:");
-	for (i = 0; spri->argw.argv[i] != NULL; i++)
-		printk(" '%s'",spri->argw.argv[i]);
-	printk("\n");
-}
-
-static unsigned short slirp_protocol(struct sk_buff *skbuff)
-{
-	return htons(ETH_P_IP);
-}
-
-static int slirp_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return slirp_user_read(fd, skb_mac_header(skb), skb->dev->mtu,
-			       (struct slirp_data *) &lp->user);
-}
-
-static int slirp_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return slirp_user_write(fd, skb->data, skb->len,
-				(struct slirp_data *) &lp->user);
-}
-
-const struct net_kern_info slirp_kern_info = {
-	.init			= slirp_init,
-	.protocol		= slirp_protocol,
-	.read			= slirp_read,
-	.write			= slirp_write,
-};
-
-static int slirp_setup(char *str, char **mac_out, void *data)
-{
-	struct slirp_init *init = data;
-	int i=0;
-
-	*init = ((struct slirp_init) { .argw = { { "slirp", NULL  } } });
-
-	str = split_if_spec(str, mac_out, NULL);
-
-	if (str == NULL) /* no command line given after MAC addr */
-		return 1;
-
-	do {
-		if (i >= SLIRP_MAX_ARGS - 1) {
-			printk(KERN_WARNING "slirp_setup: truncating slirp "
-			       "arguments\n");
-			break;
-		}
-		init->argw.argv[i++] = str;
-		while(*str && *str!=',') {
-			if (*str == '_')
-				*str=' ';
-			str++;
-		}
-		if (*str != ',')
-			break;
-		*str++ = '\0';
-	} while (1);
-
-	init->argw.argv[i] = NULL;
-	return 1;
-}
-
-static struct transport slirp_transport = {
-	.list 		= LIST_HEAD_INIT(slirp_transport.list),
-	.name 		= "slirp",
-	.setup  	= slirp_setup,
-	.user 		= &slirp_user_info,
-	.kern 		= &slirp_kern_info,
-	.private_size 	= sizeof(struct slirp_data),
-	.setup_size 	= sizeof(struct slirp_init),
-};
-
-static int register_slirp(void)
-{
-	register_transport(&slirp_transport);
-	return 0;
-}
-
-late_initcall(register_slirp);
diff --git a/arch/um/drivers/slirp_user.c b/arch/um/drivers/slirp_user.c
deleted file mode 100644
index 97228aa080cb..000000000000
--- a/arch/um/drivers/slirp_user.c
+++ /dev/null
@@ -1,124 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/wait.h>
-#include <net_user.h>
-#include <os.h>
-#include "slirp.h"
-
-static int slirp_user_init(void *data, void *dev)
-{
-	struct slirp_data *pri = data;
-
-	pri->dev = dev;
-	return 0;
-}
-
-struct slirp_pre_exec_data {
-	int stdin_fd;
-	int stdout_fd;
-};
-
-static void slirp_pre_exec(void *arg)
-{
-	struct slirp_pre_exec_data *data = arg;
-
-	if (data->stdin_fd != -1)
-		dup2(data->stdin_fd, 0);
-	if (data->stdout_fd != -1)
-		dup2(data->stdout_fd, 1);
-}
-
-static int slirp_tramp(char **argv, int fd)
-{
-	struct slirp_pre_exec_data pe_data;
-	int pid;
-
-	pe_data.stdin_fd = fd;
-	pe_data.stdout_fd = fd;
-	pid = run_helper(slirp_pre_exec, &pe_data, argv);
-
-	return pid;
-}
-
-static int slirp_open(void *data)
-{
-	struct slirp_data *pri = data;
-	int fds[2], err;
-
-	err = os_pipe(fds, 1, 1);
-	if (err)
-		return err;
-
-	err = slirp_tramp(pri->argw.argv, fds[1]);
-	if (err < 0) {
-		printk(UM_KERN_ERR "slirp_tramp failed - errno = %d\n", -err);
-		goto out;
-	}
-
-	pri->slave = fds[1];
-	pri->slip.pos = 0;
-	pri->slip.esc = 0;
-	pri->pid = err;
-
-	return fds[0];
-out:
-	close(fds[0]);
-	close(fds[1]);
-	return err;
-}
-
-static void slirp_close(int fd, void *data)
-{
-	struct slirp_data *pri = data;
-	int err;
-
-	close(fd);
-	close(pri->slave);
-
-	pri->slave = -1;
-
-	if (pri->pid<1) {
-		printk(UM_KERN_ERR "slirp_close: no child process to shut "
-		       "down\n");
-		return;
-	}
-
-#if 0
-	if (kill(pri->pid, SIGHUP)<0) {
-		printk(UM_KERN_ERR "slirp_close: sending hangup to %d failed "
-		       "(%d)\n", pri->pid, errno);
-	}
-#endif
-	err = helper_wait(pri->pid);
-	if (err < 0)
-		return;
-
-	pri->pid = -1;
-}
-
-int slirp_user_read(int fd, void *buf, int len, struct slirp_data *pri)
-{
-	return slip_proto_read(fd, buf, len, &pri->slip);
-}
-
-int slirp_user_write(int fd, void *buf, int len, struct slirp_data *pri)
-{
-	return slip_proto_write(fd, buf, len, &pri->slip);
-}
-
-const struct net_user_info slirp_user_info = {
-	.init		= slirp_user_init,
-	.open		= slirp_open,
-	.close	 	= slirp_close,
-	.remove	 	= NULL,
-	.add_address	= NULL,
-	.delete_address = NULL,
-	.mtu		= BUF_SIZE,
-	.max_packet	= BUF_SIZE,
-};
diff --git a/arch/um/drivers/umcast.h b/arch/um/drivers/umcast.h
deleted file mode 100644
index fe39bee1e3bd..000000000000
--- a/arch/um/drivers/umcast.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __DRIVERS_UMCAST_H
-#define __DRIVERS_UMCAST_H
-
-#include <net_user.h>
-
-struct umcast_data {
-	char *addr;
-	unsigned short lport;
-	unsigned short rport;
-	void *listen_addr;
-	void *remote_addr;
-	int ttl;
-	int unicast;
-	void *dev;
-};
-
-extern const struct net_user_info umcast_user_info;
-
-extern int umcast_user_write(int fd, void *buf, int len,
-			     struct umcast_data *pri);
-
-#endif
diff --git a/arch/um/drivers/umcast_kern.c b/arch/um/drivers/umcast_kern.c
deleted file mode 100644
index 595a54f2b9c6..000000000000
--- a/arch/um/drivers/umcast_kern.c
+++ /dev/null
@@ -1,188 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * user-mode-linux networking multicast transport
- * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org>
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- *
- * based on the existing uml-networking code, which is
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 by various other people who didn't put their name here.
- *
- */
-
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include "umcast.h"
-#include <net_kern.h>
-
-struct umcast_init {
-	char *addr;
-	int lport;
-	int rport;
-	int ttl;
-	bool unicast;
-};
-
-static void umcast_init(struct net_device *dev, void *data)
-{
-	struct uml_net_private *pri;
-	struct umcast_data *dpri;
-	struct umcast_init *init = data;
-
-	pri = netdev_priv(dev);
-	dpri = (struct umcast_data *) pri->user;
-	dpri->addr = init->addr;
-	dpri->lport = init->lport;
-	dpri->rport = init->rport;
-	dpri->unicast = init->unicast;
-	dpri->ttl = init->ttl;
-	dpri->dev = dev;
-
-	if (dpri->unicast) {
-		printk(KERN_INFO "ucast backend address: %s:%u listen port: "
-		       "%u\n", dpri->addr, dpri->rport, dpri->lport);
-	} else {
-		printk(KERN_INFO "mcast backend multicast address: %s:%u, "
-		       "TTL:%u\n", dpri->addr, dpri->lport, dpri->ttl);
-	}
-}
-
-static int umcast_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return net_recvfrom(fd, skb_mac_header(skb),
-			    skb->dev->mtu + ETH_HEADER_OTHER);
-}
-
-static int umcast_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return umcast_user_write(fd, skb->data, skb->len,
-				(struct umcast_data *) &lp->user);
-}
-
-static const struct net_kern_info umcast_kern_info = {
-	.init			= umcast_init,
-	.protocol		= eth_protocol,
-	.read			= umcast_read,
-	.write			= umcast_write,
-};
-
-static int mcast_setup(char *str, char **mac_out, void *data)
-{
-	struct umcast_init *init = data;
-	char *port_str = NULL, *ttl_str = NULL, *remain;
-	char *last;
-
-	*init = ((struct umcast_init)
-		{ .addr	= "239.192.168.1",
-		  .lport	= 1102,
-		  .ttl	= 1 });
-
-	remain = split_if_spec(str, mac_out, &init->addr, &port_str, &ttl_str,
-			       NULL);
-	if (remain != NULL) {
-		printk(KERN_ERR "mcast_setup - Extra garbage on "
-		       "specification : '%s'\n", remain);
-		return 0;
-	}
-
-	if (port_str != NULL) {
-		init->lport = simple_strtoul(port_str, &last, 10);
-		if ((*last != '\0') || (last == port_str)) {
-			printk(KERN_ERR "mcast_setup - Bad port : '%s'\n",
-			       port_str);
-			return 0;
-		}
-	}
-
-	if (ttl_str != NULL) {
-		init->ttl = simple_strtoul(ttl_str, &last, 10);
-		if ((*last != '\0') || (last == ttl_str)) {
-			printk(KERN_ERR "mcast_setup - Bad ttl : '%s'\n",
-			       ttl_str);
-			return 0;
-		}
-	}
-
-	init->unicast = false;
-	init->rport = init->lport;
-
-	printk(KERN_INFO "Configured mcast device: %s:%u-%u\n", init->addr,
-	       init->lport, init->ttl);
-
-	return 1;
-}
-
-static int ucast_setup(char *str, char **mac_out, void *data)
-{
-	struct umcast_init *init = data;
-	char *lport_str = NULL, *rport_str = NULL, *remain;
-	char *last;
-
-	*init = ((struct umcast_init)
-		{ .addr		= "",
-		  .lport	= 1102,
-		  .rport	= 1102 });
-
-	remain = split_if_spec(str, mac_out, &init->addr,
-			       &lport_str, &rport_str, NULL);
-	if (remain != NULL) {
-		printk(KERN_ERR "ucast_setup - Extra garbage on "
-		       "specification : '%s'\n", remain);
-		return 0;
-	}
-
-	if (lport_str != NULL) {
-		init->lport = simple_strtoul(lport_str, &last, 10);
-		if ((*last != '\0') || (last == lport_str)) {
-			printk(KERN_ERR "ucast_setup - Bad listen port : "
-			       "'%s'\n", lport_str);
-			return 0;
-		}
-	}
-
-	if (rport_str != NULL) {
-		init->rport = simple_strtoul(rport_str, &last, 10);
-		if ((*last != '\0') || (last == rport_str)) {
-			printk(KERN_ERR "ucast_setup - Bad remote port : "
-			       "'%s'\n", rport_str);
-			return 0;
-		}
-	}
-
-	init->unicast = true;
-
-	printk(KERN_INFO "Configured ucast device: :%u -> %s:%u\n",
-	       init->lport, init->addr, init->rport);
-
-	return 1;
-}
-
-static struct transport mcast_transport = {
-	.list	= LIST_HEAD_INIT(mcast_transport.list),
-	.name	= "mcast",
-	.setup	= mcast_setup,
-	.user	= &umcast_user_info,
-	.kern	= &umcast_kern_info,
-	.private_size	= sizeof(struct umcast_data),
-	.setup_size	= sizeof(struct umcast_init),
-};
-
-static struct transport ucast_transport = {
-	.list	= LIST_HEAD_INIT(ucast_transport.list),
-	.name	= "ucast",
-	.setup	= ucast_setup,
-	.user	= &umcast_user_info,
-	.kern	= &umcast_kern_info,
-	.private_size	= sizeof(struct umcast_data),
-	.setup_size	= sizeof(struct umcast_init),
-};
-
-static int register_umcast(void)
-{
-	register_transport(&mcast_transport);
-	register_transport(&ucast_transport);
-	return 0;
-}
-
-late_initcall(register_umcast);
diff --git a/arch/um/drivers/umcast_user.c b/arch/um/drivers/umcast_user.c
deleted file mode 100644
index b50b13cff04e..000000000000
--- a/arch/um/drivers/umcast_user.c
+++ /dev/null
@@ -1,184 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * user-mode-linux networking multicast transport
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 by Harald Welte <laforge@gnumonks.org>
- *
- * based on the existing uml-networking code, which is
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 by various other people who didn't put their name here.
- *
- *
- */
-
-#include <unistd.h>
-#include <errno.h>
-#include <netinet/in.h>
-#include "umcast.h"
-#include <net_user.h>
-#include <um_malloc.h>
-
-static struct sockaddr_in *new_addr(char *addr, unsigned short port)
-{
-	struct sockaddr_in *sin;
-
-	sin = uml_kmalloc(sizeof(struct sockaddr_in), UM_GFP_KERNEL);
-	if (sin == NULL) {
-		printk(UM_KERN_ERR "new_addr: allocation of sockaddr_in "
-		       "failed\n");
-		return NULL;
-	}
-	sin->sin_family = AF_INET;
-	if (addr)
-		sin->sin_addr.s_addr = in_aton(addr);
-	else
-		sin->sin_addr.s_addr = INADDR_ANY;
-	sin->sin_port = htons(port);
-	return sin;
-}
-
-static int umcast_user_init(void *data, void *dev)
-{
-	struct umcast_data *pri = data;
-
-	pri->remote_addr = new_addr(pri->addr, pri->rport);
-	if (pri->unicast)
-		pri->listen_addr = new_addr(NULL, pri->lport);
-	else
-		pri->listen_addr = pri->remote_addr;
-	pri->dev = dev;
-	return 0;
-}
-
-static void umcast_remove(void *data)
-{
-	struct umcast_data *pri = data;
-
-	kfree(pri->listen_addr);
-	if (pri->unicast)
-		kfree(pri->remote_addr);
-	pri->listen_addr = pri->remote_addr = NULL;
-}
-
-static int umcast_open(void *data)
-{
-	struct umcast_data *pri = data;
-	struct sockaddr_in *lsin = pri->listen_addr;
-	struct sockaddr_in *rsin = pri->remote_addr;
-	struct ip_mreq mreq;
-	int fd, yes = 1, err = -EINVAL;
-
-
-	if ((!pri->unicast && lsin->sin_addr.s_addr == 0) ||
-	    (rsin->sin_addr.s_addr == 0) ||
-	    (lsin->sin_port == 0) || (rsin->sin_port == 0))
-		goto out;
-
-	fd = socket(AF_INET, SOCK_DGRAM, 0);
-
-	if (fd < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "umcast_open : data socket failed, "
-		       "errno = %d\n", errno);
-		goto out;
-	}
-
-	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "umcast_open: SO_REUSEADDR failed, "
-		       "errno = %d\n", errno);
-		goto out_close;
-	}
-
-	if (!pri->unicast) {
-		/* set ttl according to config */
-		if (setsockopt(fd, SOL_IP, IP_MULTICAST_TTL, &pri->ttl,
-			       sizeof(pri->ttl)) < 0) {
-			err = -errno;
-			printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_TTL "
-			       "failed, error = %d\n", errno);
-			goto out_close;
-		}
-
-		/* set LOOP, so data does get fed back to local sockets */
-		if (setsockopt(fd, SOL_IP, IP_MULTICAST_LOOP,
-			       &yes, sizeof(yes)) < 0) {
-			err = -errno;
-			printk(UM_KERN_ERR "umcast_open: IP_MULTICAST_LOOP "
-			       "failed, error = %d\n", errno);
-			goto out_close;
-		}
-	}
-
-	/* bind socket to the address */
-	if (bind(fd, (struct sockaddr *) lsin, sizeof(*lsin)) < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "umcast_open : data bind failed, "
-		       "errno = %d\n", errno);
-		goto out_close;
-	}
-
-	if (!pri->unicast) {
-		/* subscribe to the multicast group */
-		mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr;
-		mreq.imr_interface.s_addr = 0;
-		if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP,
-			       &mreq, sizeof(mreq)) < 0) {
-			err = -errno;
-			printk(UM_KERN_ERR "umcast_open: IP_ADD_MEMBERSHIP "
-			       "failed, error = %d\n", errno);
-			printk(UM_KERN_ERR "There appears not to be a "
-			       "multicast-capable network interface on the "
-			       "host.\n");
-			printk(UM_KERN_ERR "eth0 should be configured in order "
-			       "to use the multicast transport.\n");
-			goto out_close;
-		}
-	}
-
-	return fd;
-
- out_close:
-	close(fd);
- out:
-	return err;
-}
-
-static void umcast_close(int fd, void *data)
-{
-	struct umcast_data *pri = data;
-
-	if (!pri->unicast) {
-		struct ip_mreq mreq;
-		struct sockaddr_in *lsin = pri->listen_addr;
-
-		mreq.imr_multiaddr.s_addr = lsin->sin_addr.s_addr;
-		mreq.imr_interface.s_addr = 0;
-		if (setsockopt(fd, SOL_IP, IP_DROP_MEMBERSHIP,
-			       &mreq, sizeof(mreq)) < 0) {
-			printk(UM_KERN_ERR "umcast_close: IP_DROP_MEMBERSHIP "
-			       "failed, error = %d\n", errno);
-		}
-	}
-
-	close(fd);
-}
-
-int umcast_user_write(int fd, void *buf, int len, struct umcast_data *pri)
-{
-	struct sockaddr_in *data_addr = pri->remote_addr;
-
-	return net_sendto(fd, buf, len, data_addr, sizeof(*data_addr));
-}
-
-const struct net_user_info umcast_user_info = {
-	.init	= umcast_user_init,
-	.open	= umcast_open,
-	.close	= umcast_close,
-	.remove	= umcast_remove,
-	.add_address	= NULL,
-	.delete_address = NULL,
-	.mtu	= ETH_MAX_PACKET,
-	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER,
-};
diff --git a/arch/um/drivers/vde.h b/arch/um/drivers/vde.h
deleted file mode 100644
index cab0379e6142..000000000000
--- a/arch/um/drivers/vde.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org).
- */
-
-#ifndef __UM_VDE_H__
-#define __UM_VDE_H__
-
-struct vde_data {
-	char *vde_switch;
-	char *descr;
-	void *args;
-	void *conn;
-	void *dev;
-};
-
-struct vde_init {
-	char *vde_switch;
-	char *descr;
-	int port;
-	char *group;
-	int mode;
-};
-
-extern const struct net_user_info vde_user_info;
-
-extern void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init);
-
-extern int vde_user_read(void *conn, void *buf, int len);
-extern int vde_user_write(void *conn, void *buf, int len);
-
-#endif
diff --git a/arch/um/drivers/vde_kern.c b/arch/um/drivers/vde_kern.c
deleted file mode 100644
index bc6f22cbfb35..000000000000
--- a/arch/um/drivers/vde_kern.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org).
- *
- * Transport usage:
- *  ethN=vde,<vde_switch>,<mac addr>,<port>,<group>,<mode>,<description>
- *
- */
-
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <net_kern.h>
-#include <net_user.h>
-#include "vde.h"
-
-static void vde_init(struct net_device *dev, void *data)
-{
-	struct vde_init *init = data;
-	struct uml_net_private *pri;
-	struct vde_data *vpri;
-
-	pri = netdev_priv(dev);
-	vpri = (struct vde_data *) pri->user;
-
-	vpri->vde_switch = init->vde_switch;
-	vpri->descr = init->descr ? init->descr : "UML vde_transport";
-	vpri->args = NULL;
-	vpri->conn = NULL;
-	vpri->dev = dev;
-
-	printk("vde backend - %s, ", vpri->vde_switch ?
-	       vpri->vde_switch : "(default socket)");
-
-	vde_init_libstuff(vpri, init);
-
-	printk("\n");
-}
-
-static int vde_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	struct vde_data *pri = (struct vde_data *) &lp->user;
-
-	if (pri->conn != NULL)
-		return vde_user_read(pri->conn, skb_mac_header(skb),
-				     skb->dev->mtu + ETH_HEADER_OTHER);
-
-	printk(KERN_ERR "vde_read - we have no VDECONN to read from");
-	return -EBADF;
-}
-
-static int vde_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	struct vde_data *pri = (struct vde_data *) &lp->user;
-
-	if (pri->conn != NULL)
-		return vde_user_write((void *)pri->conn, skb->data,
-				      skb->len);
-
-	printk(KERN_ERR "vde_write - we have no VDECONN to write to");
-	return -EBADF;
-}
-
-static const struct net_kern_info vde_kern_info = {
-	.init			= vde_init,
-	.protocol		= eth_protocol,
-	.read			= vde_read,
-	.write			= vde_write,
-};
-
-static int vde_setup(char *str, char **mac_out, void *data)
-{
-	struct vde_init *init = data;
-	char *remain, *port_str = NULL, *mode_str = NULL, *last;
-
-	*init = ((struct vde_init)
-		{ .vde_switch		= NULL,
-		  .descr		= NULL,
-		  .port			= 0,
-		  .group		= NULL,
-		  .mode			= 0 });
-
-	remain = split_if_spec(str, &init->vde_switch, mac_out, &port_str,
-				&init->group, &mode_str, &init->descr, NULL);
-
-	if (remain != NULL)
-		printk(KERN_WARNING "vde_setup - Ignoring extra data :"
-		       "'%s'\n", remain);
-
-	if (port_str != NULL) {
-		init->port = simple_strtoul(port_str, &last, 10);
-		if ((*last != '\0') || (last == port_str)) {
-			printk(KERN_ERR "vde_setup - Bad port : '%s'\n",
-						port_str);
-			return 0;
-		}
-	}
-
-	if (mode_str != NULL) {
-		init->mode = simple_strtoul(mode_str, &last, 8);
-		if ((*last != '\0') || (last == mode_str)) {
-			printk(KERN_ERR "vde_setup - Bad mode : '%s'\n",
-						mode_str);
-			return 0;
-		}
-	}
-
-	printk(KERN_INFO "Configured vde device: %s\n", init->vde_switch ?
-	       init->vde_switch : "(default socket)");
-
-	return 1;
-}
-
-static struct transport vde_transport = {
-	.list 		= LIST_HEAD_INIT(vde_transport.list),
-	.name 		= "vde",
-	.setup  	= vde_setup,
-	.user 		= &vde_user_info,
-	.kern 		= &vde_kern_info,
-	.private_size 	= sizeof(struct vde_data),
-	.setup_size 	= sizeof(struct vde_init),
-};
-
-static int register_vde(void)
-{
-	register_transport(&vde_transport);
-	return 0;
-}
-
-late_initcall(register_vde);
diff --git a/arch/um/drivers/vde_user.c b/arch/um/drivers/vde_user.c
deleted file mode 100644
index bc7dc4e1e486..000000000000
--- a/arch/um/drivers/vde_user.c
+++ /dev/null
@@ -1,125 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2007 Luca Bigliardi (shammash@artha.org).
- */
-
-#include <stddef.h>
-#include <errno.h>
-#include <libvdeplug.h>
-#include <net_user.h>
-#include <um_malloc.h>
-#include "vde.h"
-
-static int vde_user_init(void *data, void *dev)
-{
-	struct vde_data *pri = data;
-	VDECONN *conn = NULL;
-	int err = -EINVAL;
-
-	pri->dev = dev;
-
-	conn = vde_open(pri->vde_switch, pri->descr, pri->args);
-
-	if (conn == NULL) {
-		err = -errno;
-		printk(UM_KERN_ERR "vde_user_init: vde_open failed, "
-		       "errno = %d\n", errno);
-		return err;
-	}
-
-	printk(UM_KERN_INFO "vde backend - connection opened\n");
-
-	pri->conn = conn;
-
-	return 0;
-}
-
-static int vde_user_open(void *data)
-{
-	struct vde_data *pri = data;
-
-	if (pri->conn != NULL)
-		return vde_datafd(pri->conn);
-
-	printk(UM_KERN_WARNING "vde_open - we have no VDECONN to open");
-	return -EINVAL;
-}
-
-static void vde_remove(void *data)
-{
-	struct vde_data *pri = data;
-
-	if (pri->conn != NULL) {
-		printk(UM_KERN_INFO "vde backend - closing connection\n");
-		vde_close(pri->conn);
-		pri->conn = NULL;
-		kfree(pri->args);
-		pri->args = NULL;
-		return;
-	}
-
-	printk(UM_KERN_WARNING "vde_remove - we have no VDECONN to remove");
-}
-
-const struct net_user_info vde_user_info = {
-	.init		= vde_user_init,
-	.open		= vde_user_open,
-	.close	 	= NULL,
-	.remove	 	= vde_remove,
-	.add_address	= NULL,
-	.delete_address = NULL,
-	.mtu		= ETH_MAX_PACKET,
-	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER,
-};
-
-void vde_init_libstuff(struct vde_data *vpri, struct vde_init *init)
-{
-	struct vde_open_args *args;
-
-	vpri->args = uml_kmalloc(sizeof(struct vde_open_args), UM_GFP_KERNEL);
-	if (vpri->args == NULL) {
-		printk(UM_KERN_ERR "vde_init_libstuff - vde_open_args "
-		       "allocation failed");
-		return;
-	}
-
-	args = vpri->args;
-
-	args->port = init->port;
-	args->group = init->group;
-	args->mode = init->mode ? init->mode : 0700;
-
-	args->port ?  printk("port %d", args->port) :
-		printk("undefined port");
-}
-
-int vde_user_read(void *conn, void *buf, int len)
-{
-	VDECONN *vconn = conn;
-	int rv;
-
-	if (vconn == NULL)
-		return 0;
-
-	rv = vde_recv(vconn, buf, len, 0);
-	if (rv < 0) {
-		if (errno == EAGAIN)
-			return 0;
-		return -errno;
-	}
-	else if (rv == 0)
-		return -ENOTCONN;
-
-	return rv;
-}
-
-int vde_user_write(void *conn, void *buf, int len)
-{
-	VDECONN *vconn = conn;
-
-	if (vconn == NULL)
-		return 0;
-
-	return vde_send(vconn, buf, len, 0);
-}
-
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 152a60080d5b..e63a74a5ff19 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -143,7 +143,6 @@ extern int os_access(const char *file, int mode);
 extern int os_set_exec_close(int fd);
 extern int os_ioctl_generic(int fd, unsigned int cmd, unsigned long arg);
 extern int os_get_ifname(int fd, char *namebuf);
-extern int os_set_slip(int fd);
 extern int os_mode_fd(int fd, int mode);
 
 extern int os_seek_file(int fd, unsigned long long offset);
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index 049dfa5bc9c6..fae836713487 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -8,7 +8,7 @@ KCOV_INSTRUMENT                := n
 
 obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \
 	registers.o sigio.o signal.o start_up.o time.o tty.o \
-	umid.o user_syms.o util.o drivers/ skas/
+	umid.o user_syms.o util.o skas/
 
 CFLAGS_signal.o += -Wframe-larger-than=4096
 
diff --git a/arch/um/os-Linux/drivers/Makefile b/arch/um/os-Linux/drivers/Makefile
deleted file mode 100644
index cf2d75bb1884..000000000000
--- a/arch/um/os-Linux/drivers/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# 
-# Copyright (C) 2000, 2002 Jeff Dike (jdike@karaya.com)
-#
-
-ethertap-objs := ethertap_kern.o ethertap_user.o
-tuntap-objs := tuntap_kern.o tuntap_user.o
-
-obj-y = 
-obj-$(CONFIG_UML_NET_ETHERTAP) += ethertap.o
-obj-$(CONFIG_UML_NET_TUNTAP) += tuntap.o
-
-include $(srctree)/arch/um/scripts/Makefile.rules
diff --git a/arch/um/os-Linux/drivers/etap.h b/arch/um/os-Linux/drivers/etap.h
deleted file mode 100644
index a475259f90e1..000000000000
--- a/arch/um/os-Linux/drivers/etap.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* 
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __DRIVERS_ETAP_H
-#define __DRIVERS_ETAP_H
-
-#include <net_user.h>
-
-struct ethertap_data {
-	char *dev_name;
-	char *gate_addr;
-	int data_fd;
-	int control_fd;
-	void *dev;
-};
-
-extern const struct net_user_info ethertap_user_info;
-
-#endif
diff --git a/arch/um/os-Linux/drivers/ethertap_kern.c b/arch/um/os-Linux/drivers/ethertap_kern.c
deleted file mode 100644
index 5e5ee40680ce..000000000000
--- a/arch/um/os-Linux/drivers/ethertap_kern.c
+++ /dev/null
@@ -1,100 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 by various other people who didn't put their name here.
- */
-
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include "etap.h"
-#include <net_kern.h>
-
-struct ethertap_init {
-	char *dev_name;
-	char *gate_addr;
-};
-
-static void etap_init(struct net_device *dev, void *data)
-{
-	struct uml_net_private *pri;
-	struct ethertap_data *epri;
-	struct ethertap_init *init = data;
-
-	pri = netdev_priv(dev);
-	epri = (struct ethertap_data *) pri->user;
-	epri->dev_name = init->dev_name;
-	epri->gate_addr = init->gate_addr;
-	epri->data_fd = -1;
-	epri->control_fd = -1;
-	epri->dev = dev;
-
-	printk(KERN_INFO "ethertap backend - %s", epri->dev_name);
-	if (epri->gate_addr != NULL)
-		printk(KERN_CONT ", IP = %s", epri->gate_addr);
-	printk(KERN_CONT "\n");
-}
-
-static int etap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	int len;
-
-	len = net_recvfrom(fd, skb_mac_header(skb),
-			   skb->dev->mtu + 2 + ETH_HEADER_ETHERTAP);
-	if (len <= 0)
-		return(len);
-
-	skb_pull(skb, 2);
-	len -= 2;
-	return len;
-}
-
-static int etap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	skb_push(skb, 2);
-	return net_send(fd, skb->data, skb->len);
-}
-
-const struct net_kern_info ethertap_kern_info = {
-	.init			= etap_init,
-	.protocol		= eth_protocol,
-	.read			= etap_read,
-	.write 			= etap_write,
-};
-
-static int ethertap_setup(char *str, char **mac_out, void *data)
-{
-	struct ethertap_init *init = data;
-
-	*init = ((struct ethertap_init)
-		{ .dev_name 	= NULL,
-		  .gate_addr 	= NULL });
-	if (tap_setup_common(str, "ethertap", &init->dev_name, mac_out,
-			    &init->gate_addr))
-		return 0;
-	if (init->dev_name == NULL) {
-		printk(KERN_ERR "ethertap_setup : Missing tap device name\n");
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct transport ethertap_transport = {
-	.list 		= LIST_HEAD_INIT(ethertap_transport.list),
-	.name 		= "ethertap",
-	.setup  	= ethertap_setup,
-	.user 		= &ethertap_user_info,
-	.kern 		= &ethertap_kern_info,
-	.private_size 	= sizeof(struct ethertap_data),
-	.setup_size 	= sizeof(struct ethertap_init),
-};
-
-static int register_ethertap(void)
-{
-	register_transport(&ethertap_transport);
-	return 0;
-}
-
-late_initcall(register_ethertap);
diff --git a/arch/um/os-Linux/drivers/ethertap_user.c b/arch/um/os-Linux/drivers/ethertap_user.c
deleted file mode 100644
index bdf215c0eca7..000000000000
--- a/arch/um/os-Linux/drivers/ethertap_user.c
+++ /dev/null
@@ -1,248 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 by various other people who didn't put their name here.
- */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include "etap.h"
-#include <os.h>
-#include <net_user.h>
-#include <um_malloc.h>
-
-#define MAX_PACKET ETH_MAX_PACKET
-
-static int etap_user_init(void *data, void *dev)
-{
-	struct ethertap_data *pri = data;
-
-	pri->dev = dev;
-	return 0;
-}
-
-struct addr_change {
-	enum { ADD_ADDR, DEL_ADDR } what;
-	unsigned char addr[4];
-	unsigned char netmask[4];
-};
-
-static void etap_change(int op, unsigned char *addr, unsigned char *netmask,
-			int fd)
-{
-	struct addr_change change;
-	char *output;
-	int n;
-
-	change.what = op;
-	memcpy(change.addr, addr, sizeof(change.addr));
-	memcpy(change.netmask, netmask, sizeof(change.netmask));
-	CATCH_EINTR(n = write(fd, &change, sizeof(change)));
-	if (n != sizeof(change)) {
-		printk(UM_KERN_ERR "etap_change - request failed, err = %d\n",
-		       errno);
-		return;
-	}
-
-	output = uml_kmalloc(UM_KERN_PAGE_SIZE, UM_GFP_KERNEL);
-	if (output == NULL)
-		printk(UM_KERN_ERR "etap_change : Failed to allocate output "
-		       "buffer\n");
-	read_output(fd, output, UM_KERN_PAGE_SIZE);
-	if (output != NULL) {
-		printk("%s", output);
-		kfree(output);
-	}
-}
-
-static void etap_open_addr(unsigned char *addr, unsigned char *netmask,
-			   void *arg)
-{
-	etap_change(ADD_ADDR, addr, netmask, *((int *) arg));
-}
-
-static void etap_close_addr(unsigned char *addr, unsigned char *netmask,
-			    void *arg)
-{
-	etap_change(DEL_ADDR, addr, netmask, *((int *) arg));
-}
-
-struct etap_pre_exec_data {
-	int control_remote;
-	int control_me;
-	int data_me;
-};
-
-static void etap_pre_exec(void *arg)
-{
-	struct etap_pre_exec_data *data = arg;
-
-	dup2(data->control_remote, 1);
-	close(data->data_me);
-	close(data->control_me);
-}
-
-static int etap_tramp(char *dev, char *gate, int control_me,
-		      int control_remote, int data_me, int data_remote)
-{
-	struct etap_pre_exec_data pe_data;
-	int pid, err, n;
-	char version_buf[sizeof("nnnnn\0")];
-	char data_fd_buf[sizeof("nnnnnn\0")];
-	char gate_buf[sizeof("nnn.nnn.nnn.nnn\0")];
-	char *setup_args[] = { "uml_net", version_buf, "ethertap", dev,
-			       data_fd_buf, gate_buf, NULL };
-	char *nosetup_args[] = { "uml_net", version_buf, "ethertap",
-				 dev, data_fd_buf, NULL };
-	char **args, c;
-
-	sprintf(data_fd_buf, "%d", data_remote);
-	sprintf(version_buf, "%d", UML_NET_VERSION);
-	if (gate != NULL) {
-		strscpy(gate_buf, gate);
-		args = setup_args;
-	}
-	else args = nosetup_args;
-
-	err = 0;
-	pe_data.control_remote = control_remote;
-	pe_data.control_me = control_me;
-	pe_data.data_me = data_me;
-	pid = run_helper(etap_pre_exec, &pe_data, args);
-
-	if (pid < 0)
-		err = pid;
-	close(data_remote);
-	close(control_remote);
-	CATCH_EINTR(n = read(control_me, &c, sizeof(c)));
-	if (n != sizeof(c)) {
-		err = -errno;
-		printk(UM_KERN_ERR "etap_tramp : read of status failed, "
-		       "err = %d\n", -err);
-		return err;
-	}
-	if (c != 1) {
-		printk(UM_KERN_ERR "etap_tramp : uml_net failed\n");
-		err = helper_wait(pid);
-	}
-	return err;
-}
-
-static int etap_open(void *data)
-{
-	struct ethertap_data *pri = data;
-	char *output;
-	int data_fds[2], control_fds[2], err, output_len;
-
-	err = tap_open_common(pri->dev, pri->gate_addr);
-	if (err)
-		return err;
-
-	err = socketpair(AF_UNIX, SOCK_DGRAM, 0, data_fds);
-	if (err) {
-		err = -errno;
-		printk(UM_KERN_ERR "etap_open - data socketpair failed - "
-		       "err = %d\n", errno);
-		return err;
-	}
-
-	err = socketpair(AF_UNIX, SOCK_STREAM, 0, control_fds);
-	if (err) {
-		err = -errno;
-		printk(UM_KERN_ERR "etap_open - control socketpair failed - "
-		       "err = %d\n", errno);
-		goto out_close_data;
-	}
-
-	err = etap_tramp(pri->dev_name, pri->gate_addr, control_fds[0],
-			 control_fds[1], data_fds[0], data_fds[1]);
-	output_len = UM_KERN_PAGE_SIZE;
-	output = uml_kmalloc(output_len, UM_GFP_KERNEL);
-	read_output(control_fds[0], output, output_len);
-
-	if (output == NULL)
-		printk(UM_KERN_ERR "etap_open : failed to allocate output "
-		       "buffer\n");
-	else {
-		printk("%s", output);
-		kfree(output);
-	}
-
-	if (err < 0) {
-		printk(UM_KERN_ERR "etap_tramp failed - err = %d\n", -err);
-		goto out_close_control;
-	}
-
-	pri->data_fd = data_fds[0];
-	pri->control_fd = control_fds[0];
-	iter_addresses(pri->dev, etap_open_addr, &pri->control_fd);
-	return data_fds[0];
-
-out_close_control:
-	close(control_fds[0]);
-	close(control_fds[1]);
-out_close_data:
-	close(data_fds[0]);
-	close(data_fds[1]);
-	return err;
-}
-
-static void etap_close(int fd, void *data)
-{
-	struct ethertap_data *pri = data;
-
-	iter_addresses(pri->dev, etap_close_addr, &pri->control_fd);
-	close(fd);
-
-	if (shutdown(pri->data_fd, SHUT_RDWR) < 0)
-		printk(UM_KERN_ERR "etap_close - shutdown data socket failed, "
-		       "errno = %d\n", errno);
-
-	if (shutdown(pri->control_fd, SHUT_RDWR) < 0)
-		printk(UM_KERN_ERR "etap_close - shutdown control socket "
-		       "failed, errno = %d\n", errno);
-
-	close(pri->data_fd);
-	pri->data_fd = -1;
-	close(pri->control_fd);
-	pri->control_fd = -1;
-}
-
-static void etap_add_addr(unsigned char *addr, unsigned char *netmask,
-			  void *data)
-{
-	struct ethertap_data *pri = data;
-
-	tap_check_ips(pri->gate_addr, addr);
-	if (pri->control_fd == -1)
-		return;
-	etap_open_addr(addr, netmask, &pri->control_fd);
-}
-
-static void etap_del_addr(unsigned char *addr, unsigned char *netmask,
-			  void *data)
-{
-	struct ethertap_data *pri = data;
-
-	if (pri->control_fd == -1)
-		return;
-
-	etap_close_addr(addr, netmask, &pri->control_fd);
-}
-
-const struct net_user_info ethertap_user_info = {
-	.init		= etap_user_init,
-	.open		= etap_open,
-	.close	 	= etap_close,
-	.remove	 	= NULL,
-	.add_address	= etap_add_addr,
-	.delete_address = etap_del_addr,
-	.mtu		= ETH_MAX_PACKET,
-	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_ETHERTAP,
-};
diff --git a/arch/um/os-Linux/drivers/tuntap.h b/arch/um/os-Linux/drivers/tuntap.h
deleted file mode 100644
index e364e42abfc5..000000000000
--- a/arch/um/os-Linux/drivers/tuntap.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* 
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __UM_TUNTAP_H
-#define __UM_TUNTAP_H
-
-#include <net_user.h>
-
-struct tuntap_data {
-	char *dev_name;
-	int fixed_config;
-	char *gate_addr;
-	int fd;
-	void *dev;
-};
-
-extern const struct net_user_info tuntap_user_info;
-
-#endif
diff --git a/arch/um/os-Linux/drivers/tuntap_kern.c b/arch/um/os-Linux/drivers/tuntap_kern.c
deleted file mode 100644
index ff022d9cf0dd..000000000000
--- a/arch/um/os-Linux/drivers/tuntap_kern.c
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <linux/netdevice.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <asm/errno.h>
-#include <net_kern.h>
-#include "tuntap.h"
-
-struct tuntap_init {
-	char *dev_name;
-	char *gate_addr;
-};
-
-static void tuntap_init(struct net_device *dev, void *data)
-{
-	struct uml_net_private *pri;
-	struct tuntap_data *tpri;
-	struct tuntap_init *init = data;
-
-	pri = netdev_priv(dev);
-	tpri = (struct tuntap_data *) pri->user;
-	tpri->dev_name = init->dev_name;
-	tpri->fixed_config = (init->dev_name != NULL);
-	tpri->gate_addr = init->gate_addr;
-	tpri->fd = -1;
-	tpri->dev = dev;
-
-	printk(KERN_INFO "TUN/TAP backend - ");
-	if (tpri->gate_addr != NULL)
-		printk(KERN_CONT "IP = %s", tpri->gate_addr);
-	printk(KERN_CONT "\n");
-}
-
-static int tuntap_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return net_read(fd, skb_mac_header(skb),
-			skb->dev->mtu + ETH_HEADER_OTHER);
-}
-
-static int tuntap_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
-{
-	return net_write(fd, skb->data, skb->len);
-}
-
-const struct net_kern_info tuntap_kern_info = {
-	.init			= tuntap_init,
-	.protocol		= eth_protocol,
-	.read			= tuntap_read,
-	.write 			= tuntap_write,
-};
-
-static int tuntap_setup(char *str, char **mac_out, void *data)
-{
-	struct tuntap_init *init = data;
-
-	*init = ((struct tuntap_init)
-		{ .dev_name 	= NULL,
-		  .gate_addr 	= NULL });
-	if (tap_setup_common(str, "tuntap", &init->dev_name, mac_out,
-			    &init->gate_addr))
-		return 0;
-
-	return 1;
-}
-
-static struct transport tuntap_transport = {
-	.list 		= LIST_HEAD_INIT(tuntap_transport.list),
-	.name 		= "tuntap",
-	.setup  	= tuntap_setup,
-	.user 		= &tuntap_user_info,
-	.kern 		= &tuntap_kern_info,
-	.private_size 	= sizeof(struct tuntap_data),
-	.setup_size 	= sizeof(struct tuntap_init),
-};
-
-static int register_tuntap(void)
-{
-	register_transport(&tuntap_transport);
-	return 0;
-}
-
-late_initcall(register_tuntap);
diff --git a/arch/um/os-Linux/drivers/tuntap_user.c b/arch/um/os-Linux/drivers/tuntap_user.c
deleted file mode 100644
index 91f0e27ca3a6..000000000000
--- a/arch/um/os-Linux/drivers/tuntap_user.c
+++ /dev/null
@@ -1,215 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* 
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <linux/if_tun.h>
-#include <net/if.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <kern_util.h>
-#include <os.h>
-#include "tuntap.h"
-
-static int tuntap_user_init(void *data, void *dev)
-{
-	struct tuntap_data *pri = data;
-
-	pri->dev = dev;
-	return 0;
-}
-
-static void tuntap_add_addr(unsigned char *addr, unsigned char *netmask,
-			    void *data)
-{
-	struct tuntap_data *pri = data;
-
-	tap_check_ips(pri->gate_addr, addr);
-	if ((pri->fd == -1) || pri->fixed_config)
-		return;
-	open_addr(addr, netmask, pri->dev_name);
-}
-
-static void tuntap_del_addr(unsigned char *addr, unsigned char *netmask,
-			    void *data)
-{
-	struct tuntap_data *pri = data;
-
-	if ((pri->fd == -1) || pri->fixed_config)
-		return;
-	close_addr(addr, netmask, pri->dev_name);
-}
-
-struct tuntap_pre_exec_data {
-	int stdout_fd;
-	int close_me;
-};
-
-static void tuntap_pre_exec(void *arg)
-{
-	struct tuntap_pre_exec_data *data = arg;
-
-	dup2(data->stdout_fd, 1);
-	close(data->close_me);
-}
-
-static int tuntap_open_tramp(char *gate, int *fd_out, int me, int remote,
-			     char *buffer, int buffer_len, int *used_out)
-{
-	struct tuntap_pre_exec_data data;
-	char version_buf[sizeof("nnnnn\0")];
-	char *argv[] = { "uml_net", version_buf, "tuntap", "up", gate,
-			 NULL };
-	char buf[CMSG_SPACE(sizeof(*fd_out))];
-	struct msghdr msg;
-	struct cmsghdr *cmsg;
-	struct iovec iov;
-	int pid, n, err;
-
-	sprintf(version_buf, "%d", UML_NET_VERSION);
-
-	data.stdout_fd = remote;
-	data.close_me = me;
-
-	pid = run_helper(tuntap_pre_exec, &data, argv);
-
-	if (pid < 0)
-		return pid;
-
-	close(remote);
-
-	msg.msg_name = NULL;
-	msg.msg_namelen = 0;
-	if (buffer != NULL) {
-		iov = ((struct iovec) { buffer, buffer_len });
-		msg.msg_iov = &iov;
-		msg.msg_iovlen = 1;
-	}
-	else {
-		msg.msg_iov = NULL;
-		msg.msg_iovlen = 0;
-	}
-	msg.msg_control = buf;
-	msg.msg_controllen = sizeof(buf);
-	msg.msg_flags = 0;
-	n = recvmsg(me, &msg, 0);
-	*used_out = n;
-	if (n < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "tuntap_open_tramp : recvmsg failed - "
-		       "errno = %d\n", errno);
-		return err;
-	}
-	helper_wait(pid);
-
-	cmsg = CMSG_FIRSTHDR(&msg);
-	if (cmsg == NULL) {
-		printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a "
-		       "message\n");
-		return -EINVAL;
-	}
-	if ((cmsg->cmsg_level != SOL_SOCKET) ||
-	   (cmsg->cmsg_type != SCM_RIGHTS)) {
-		printk(UM_KERN_ERR "tuntap_open_tramp : didn't receive a "
-		       "descriptor\n");
-		return -EINVAL;
-	}
-	*fd_out = ((int *) CMSG_DATA(cmsg))[0];
-	os_set_exec_close(*fd_out);
-	return 0;
-}
-
-static int tuntap_open(void *data)
-{
-	struct ifreq ifr;
-	struct tuntap_data *pri = data;
-	char *output, *buffer;
-	int err, fds[2], len, used;
-
-	err = tap_open_common(pri->dev, pri->gate_addr);
-	if (err < 0)
-		return err;
-
-	if (pri->fixed_config) {
-		pri->fd = os_open_file("/dev/net/tun",
-				       of_cloexec(of_rdwr(OPENFLAGS())), 0);
-		if (pri->fd < 0) {
-			printk(UM_KERN_ERR "Failed to open /dev/net/tun, "
-			       "err = %d\n", -pri->fd);
-			return pri->fd;
-		}
-		memset(&ifr, 0, sizeof(ifr));
-		ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
-		strscpy(ifr.ifr_name, pri->dev_name);
-		if (ioctl(pri->fd, TUNSETIFF, &ifr) < 0) {
-			err = -errno;
-			printk(UM_KERN_ERR "TUNSETIFF failed, errno = %d\n",
-			       errno);
-			close(pri->fd);
-			return err;
-		}
-	}
-	else {
-		err = socketpair(AF_UNIX, SOCK_DGRAM, 0, fds);
-		if (err) {
-			err = -errno;
-			printk(UM_KERN_ERR "tuntap_open : socketpair failed - "
-			       "errno = %d\n", errno);
-			return err;
-		}
-
-		buffer = get_output_buffer(&len);
-		if (buffer != NULL)
-			len--;
-		used = 0;
-
-		err = tuntap_open_tramp(pri->gate_addr, &pri->fd, fds[0],
-					fds[1], buffer, len, &used);
-
-		output = buffer;
-		if (err < 0) {
-			printk("%s", output);
-			free_output_buffer(buffer);
-			printk(UM_KERN_ERR "tuntap_open_tramp failed - "
-			       "err = %d\n", -err);
-			return err;
-		}
-
-		pri->dev_name = uml_strdup(buffer);
-		output += IFNAMSIZ;
-		printk("%s", output);
-		free_output_buffer(buffer);
-
-		close(fds[0]);
-		iter_addresses(pri->dev, open_addr, pri->dev_name);
-	}
-
-	return pri->fd;
-}
-
-static void tuntap_close(int fd, void *data)
-{
-	struct tuntap_data *pri = data;
-
-	if (!pri->fixed_config)
-		iter_addresses(pri->dev, close_addr, pri->dev_name);
-	close(fd);
-	pri->fd = -1;
-}
-
-const struct net_user_info tuntap_user_info = {
-	.init		= tuntap_user_init,
-	.open		= tuntap_open,
-	.close	 	= tuntap_close,
-	.remove	 	= NULL,
-	.add_address	= tuntap_add_addr,
-	.delete_address = tuntap_del_addr,
-	.mtu		= ETH_MAX_PACKET,
-	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER,
-};
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index a0d01c68ce3e..617886d1fb1e 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -106,21 +106,6 @@ int os_get_ifname(int fd, char* namebuf)
 	return 0;
 }
 
-int os_set_slip(int fd)
-{
-	int disc, sencap;
-
-	disc = N_SLIP;
-	if (ioctl(fd, TIOCSETD, &disc) < 0)
-		return -errno;
-
-	sencap = 0;
-	if (ioctl(fd, SIOCSIFENCAP, &sencap) < 0)
-		return -errno;
-
-	return 0;
-}
-
 int os_mode_fd(int fd, int mode)
 {
 	int err;
-- 
cgit v1.2.3


From b555cb66583e99158cfef8e91c025252cefae55b Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Sat, 3 May 2025 13:17:09 +0800
Subject: um: vector: Eliminate the dependency on uml_net

The only dependency on uml_net (i.e., the legacy network transport
infrastructure) is the call to uml_net_setup_etheraddr(). Implement
it inside vector to eliminate the uml_net dependency completely. It
will allow us to remove uml_net in the next step.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Acked-By: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Link: https://patch.msgid.link/20250503051710.3286595-3-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/Kconfig       |  1 -
 arch/um/drivers/vector_kern.c | 52 +++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 50 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index bc68c3e341f5..1bd4e053be04 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -145,7 +145,6 @@ config UML_NET
 
 config UML_NET_VECTOR
 	bool "Vector I/O high performance network devices"
-	depends on UML_NET
 	select MAY_HAVE_RUNTIME_DEPS
 	help
 	  This User-Mode Linux network driver uses multi-message send
diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index b97bb52dd562..0d5c96897fa0 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -27,7 +27,6 @@
 #include <init.h>
 #include <irq_kern.h>
 #include <irq_user.h>
-#include <net_kern.h>
 #include <os.h>
 #include "mconsole_kern.h"
 #include "vector_user.h"
@@ -1539,7 +1538,56 @@ static void vector_timer_expire(struct timer_list *t)
 	napi_schedule(&vp->napi);
 }
 
+static void vector_setup_etheraddr(struct net_device *dev, char *str)
+{
+	u8 addr[ETH_ALEN];
+	char *end;
+	int i;
 
+	if (str == NULL)
+		goto random;
+
+	for (i = 0; i < 6; i++) {
+		addr[i] = simple_strtoul(str, &end, 16);
+		if ((end == str) ||
+		   ((*end != ':') && (*end != ',') && (*end != '\0'))) {
+			printk(KERN_ERR
+			       "setup_etheraddr: failed to parse '%s' "
+			       "as an ethernet address\n", str);
+			goto random;
+		}
+		str = end + 1;
+	}
+	if (is_multicast_ether_addr(addr)) {
+		printk(KERN_ERR
+		       "Attempt to assign a multicast ethernet address to a "
+		       "device disallowed\n");
+		goto random;
+	}
+	if (!is_valid_ether_addr(addr)) {
+		printk(KERN_ERR
+		       "Attempt to assign an invalid ethernet address to a "
+		       "device disallowed\n");
+		goto random;
+	}
+	if (!is_local_ether_addr(addr)) {
+		printk(KERN_WARNING
+		       "Warning: Assigning a globally valid ethernet "
+		       "address to a device\n");
+		printk(KERN_WARNING "You should set the 2nd rightmost bit in "
+		       "the first byte of the MAC,\n");
+		printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n",
+		       addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4],
+		       addr[5]);
+	}
+	eth_hw_addr_set(dev, addr);
+	return;
+
+random:
+	printk(KERN_INFO
+	       "Choosing a random ethernet address for device %s\n", dev->name);
+	eth_hw_addr_random(dev);
+}
 
 static void vector_eth_configure(
 		int n,
@@ -1574,7 +1622,7 @@ static void vector_eth_configure(
 	 * and fail.
 	 */
 	snprintf(dev->name, sizeof(dev->name), "vec%d", n);
-	uml_net_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac"));
+	vector_setup_etheraddr(dev, uml_vector_fetch_arg(def, "mac"));
 	vp = netdev_priv(dev);
 
 	/* sysfs register */
-- 
cgit v1.2.3


From e619e18ed462bded8e8f12672a37053d39451404 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Sat, 3 May 2025 13:17:10 +0800
Subject: um: Remove legacy network transport infrastructure

All legacy network transports have been removed. Vector transports
provide the same capabilities with significantly higher network
throughput. There is no reason to keep the legacy network transport
infrastructure anymore. Remove it to reduce the maintenance burden.

Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Acked-By: Anton Ivanov <anton.ivanov@cambridgegreys.com>
Link: https://patch.msgid.link/20250503051710.3286595-4-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/configs/i386_defconfig    |   1 -
 arch/um/configs/x86_64_defconfig  |   1 -
 arch/um/drivers/Kconfig           |  27 +-
 arch/um/drivers/Makefile          |   2 -
 arch/um/drivers/net_kern.c        | 889 --------------------------------------
 arch/um/drivers/net_user.c        | 271 ------------
 arch/um/include/shared/net_kern.h |  69 ---
 arch/um/include/shared/net_user.h |  52 ---
 8 files changed, 5 insertions(+), 1307 deletions(-)
 delete mode 100644 arch/um/drivers/net_kern.c
 delete mode 100644 arch/um/drivers/net_user.c
 delete mode 100644 arch/um/include/shared/net_kern.h
 delete mode 100644 arch/um/include/shared/net_user.h

(limited to 'arch')

diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index 7c8999f18f2d..29d9666eceae 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -52,7 +52,6 @@ CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
-CONFIG_UML_NET=y
 CONFIG_EXT4_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 9858d989777e..cf309c5406a2 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -51,7 +51,6 @@ CONFIG_PACKET=y
 CONFIG_UNIX=y
 CONFIG_INET=y
 # CONFIG_IPV6 is not set
-CONFIG_UML_NET=y
 CONFIG_EXT4_FS=y
 CONFIG_QUOTA=y
 CONFIG_AUTOFS_FS=m
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 1bd4e053be04..34085bfc6d41 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -124,25 +124,6 @@ endmenu
 menu "UML Network Devices"
 	depends on NET
 
-# UML virtual driver
-config UML_NET
-	bool "Virtual network device"
-	help
-	  While the User-Mode port cannot directly talk to any physical
-	  hardware devices, this choice and the following transport options
-	  provide one or more virtual network devices through which the UML
-	  kernels can talk to each other, the host, and with the host's help,
-	  machines on the outside world.
-
-	  For more information, including explanations of the networking and
-	  sample configurations, see
-	  <http://user-mode-linux.sourceforge.net/old/networking.html>.
-
-	  If you'd like to be able to enable networking in the User-Mode
-	  linux environment, say Y; otherwise say N.  Note that you must
-	  enable at least one of the following transport options to actually
-	  make use of UML networking.
-
 config UML_NET_VECTOR
 	bool "Vector I/O high performance network devices"
 	select MAY_HAVE_RUNTIME_DEPS
@@ -150,9 +131,11 @@ config UML_NET_VECTOR
 	  This User-Mode Linux network driver uses multi-message send
 	  and receive functions. The host running the UML guest must have
 	  a linux kernel version above 3.0 and a libc version > 2.13.
-	  This driver provides tap, raw, gre and l2tpv3 network transports
-	  with up to 4 times higher network throughput than the UML network
-	  drivers.
+	  This driver provides tap, raw, gre and l2tpv3 network transports.
+
+	  For more information, including explanations of the networking
+	  and sample configurations, see
+	  <file:Documentation/virt/uml/user_mode_linux_howto_v2.rst>.
 
 endmenu
 
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 6dfc2e9c0ce8..6bf8cbf71d3c 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -7,7 +7,6 @@
 # in to pcap.o
 
 vector-objs := vector_kern.o vector_user.o vector_transports.o
-net-objs := net_kern.o net_user.o
 mconsole-objs := mconsole_kern.o mconsole_user.o
 hostaudio-objs := hostaudio_kern.o
 ubd-objs := ubd_kern.o ubd_user.o
@@ -29,7 +28,6 @@ obj-$(CONFIG_SSL) += ssl.o
 obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o
 
 obj-$(CONFIG_UML_NET_VECTOR) += vector.o
-obj-$(CONFIG_UML_NET) += net.o 
 obj-$(CONFIG_MCONSOLE) += mconsole.o
 obj-$(CONFIG_MMAPPER) += mmapper_kern.o 
 obj-$(CONFIG_BLK_DEV_UBD) += ubd.o 
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
deleted file mode 100644
index d5a9c5aabaec..000000000000
--- a/arch/um/drivers/net_kern.c
+++ /dev/null
@@ -1,889 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
- * James Leu (jleu@mindspring.net).
- * Copyright (C) 2001 by various other people who didn't put their name here.
- */
-
-#include <linux/memblock.h>
-#include <linux/etherdevice.h>
-#include <linux/ethtool.h>
-#include <linux/inetdevice.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <linux/platform_device.h>
-#include <linux/rtnetlink.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <init.h>
-#include <irq_kern.h>
-#include <irq_user.h>
-#include "mconsole_kern.h"
-#include <net_kern.h>
-#include <net_user.h>
-
-#define DRIVER_NAME "uml-netdev"
-
-static DEFINE_SPINLOCK(opened_lock);
-static LIST_HEAD(opened);
-
-/*
- * The drop_skb is used when we can't allocate an skb.  The
- * packet is read into drop_skb in order to get the data off the
- * connection to the host.
- * It is reallocated whenever a maximum packet size is seen which is
- * larger than any seen before.  update_drop_skb is called from
- * eth_configure when a new interface is added.
- */
-static DEFINE_SPINLOCK(drop_lock);
-static struct sk_buff *drop_skb;
-static int drop_max;
-
-static int update_drop_skb(int max)
-{
-	struct sk_buff *new;
-	unsigned long flags;
-	int err = 0;
-
-	spin_lock_irqsave(&drop_lock, flags);
-
-	if (max <= drop_max)
-		goto out;
-
-	err = -ENOMEM;
-	new = dev_alloc_skb(max);
-	if (new == NULL)
-		goto out;
-
-	skb_put(new, max);
-
-	kfree_skb(drop_skb);
-	drop_skb = new;
-	drop_max = max;
-	err = 0;
-out:
-	spin_unlock_irqrestore(&drop_lock, flags);
-
-	return err;
-}
-
-static int uml_net_rx(struct net_device *dev)
-{
-	struct uml_net_private *lp = netdev_priv(dev);
-	int pkt_len;
-	struct sk_buff *skb;
-
-	/* If we can't allocate memory, try again next round. */
-	skb = dev_alloc_skb(lp->max_packet);
-	if (skb == NULL) {
-		drop_skb->dev = dev;
-		/* Read a packet into drop_skb and don't do anything with it. */
-		(*lp->read)(lp->fd, drop_skb, lp);
-		dev->stats.rx_dropped++;
-		return 0;
-	}
-
-	skb->dev = dev;
-	skb_put(skb, lp->max_packet);
-	skb_reset_mac_header(skb);
-	pkt_len = (*lp->read)(lp->fd, skb, lp);
-
-	if (pkt_len > 0) {
-		skb_trim(skb, pkt_len);
-		skb->protocol = (*lp->protocol)(skb);
-
-		dev->stats.rx_bytes += skb->len;
-		dev->stats.rx_packets++;
-		netif_rx(skb);
-		return pkt_len;
-	}
-
-	kfree_skb(skb);
-	return pkt_len;
-}
-
-static void uml_dev_close(struct work_struct *work)
-{
-	struct uml_net_private *lp =
-		container_of(work, struct uml_net_private, work);
-	dev_close(lp->dev);
-}
-
-static irqreturn_t uml_net_interrupt(int irq, void *dev_id)
-{
-	struct net_device *dev = dev_id;
-	struct uml_net_private *lp = netdev_priv(dev);
-	int err;
-
-	if (!netif_running(dev))
-		return IRQ_NONE;
-
-	spin_lock(&lp->lock);
-	while ((err = uml_net_rx(dev)) > 0) ;
-	if (err < 0) {
-		printk(KERN_ERR
-		       "Device '%s' read returned %d, shutting it down\n",
-		       dev->name, err);
-		/* dev_close can't be called in interrupt context, and takes
-		 * again lp->lock.
-		 * And dev_close() can be safely called multiple times on the
-		 * same device, since it tests for (dev->flags & IFF_UP). So
-		 * there's no harm in delaying the device shutdown.
-		 * Furthermore, the workqueue will not re-enqueue an already
-		 * enqueued work item. */
-		schedule_work(&lp->work);
-		goto out;
-	}
-out:
-	spin_unlock(&lp->lock);
-	return IRQ_HANDLED;
-}
-
-static int uml_net_open(struct net_device *dev)
-{
-	struct uml_net_private *lp = netdev_priv(dev);
-	int err;
-
-	if (lp->fd >= 0) {
-		err = -ENXIO;
-		goto out;
-	}
-
-	lp->fd = (*lp->open)(&lp->user);
-	if (lp->fd < 0) {
-		err = lp->fd;
-		goto out;
-	}
-
-	err = um_request_irq(dev->irq, lp->fd, IRQ_READ, uml_net_interrupt,
-			     IRQF_SHARED, dev->name, dev);
-	if (err < 0) {
-		printk(KERN_ERR "uml_net_open: failed to get irq(%d)\n", err);
-		err = -ENETUNREACH;
-		goto out_close;
-	}
-
-	netif_start_queue(dev);
-
-	/* clear buffer - it can happen that the host side of the interface
-	 * is full when we get here.  In this case, new data is never queued,
-	 * SIGIOs never arrive, and the net never works.
-	 */
-	while ((err = uml_net_rx(dev)) > 0) ;
-
-	spin_lock(&opened_lock);
-	list_add(&lp->list, &opened);
-	spin_unlock(&opened_lock);
-
-	return 0;
-out_close:
-	if (lp->close != NULL) (*lp->close)(lp->fd, &lp->user);
-	lp->fd = -1;
-out:
-	return err;
-}
-
-static int uml_net_close(struct net_device *dev)
-{
-	struct uml_net_private *lp = netdev_priv(dev);
-
-	netif_stop_queue(dev);
-
-	um_free_irq(dev->irq, dev);
-	if (lp->close != NULL)
-		(*lp->close)(lp->fd, &lp->user);
-	lp->fd = -1;
-
-	spin_lock(&opened_lock);
-	list_del(&lp->list);
-	spin_unlock(&opened_lock);
-
-	return 0;
-}
-
-static netdev_tx_t uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct uml_net_private *lp = netdev_priv(dev);
-	unsigned long flags;
-	int len;
-
-	netif_stop_queue(dev);
-
-	spin_lock_irqsave(&lp->lock, flags);
-
-	len = (*lp->write)(lp->fd, skb, lp);
-	skb_tx_timestamp(skb);
-
-	if (len == skb->len) {
-		dev->stats.tx_packets++;
-		dev->stats.tx_bytes += skb->len;
-		netif_trans_update(dev);
-		netif_start_queue(dev);
-
-		/* this is normally done in the interrupt when tx finishes */
-		netif_wake_queue(dev);
-	}
-	else if (len == 0) {
-		netif_start_queue(dev);
-		dev->stats.tx_dropped++;
-	}
-	else {
-		netif_start_queue(dev);
-		printk(KERN_ERR "uml_net_start_xmit: failed(%d)\n", len);
-	}
-
-	spin_unlock_irqrestore(&lp->lock, flags);
-
-	dev_consume_skb_any(skb);
-
-	return NETDEV_TX_OK;
-}
-
-static void uml_net_set_multicast_list(struct net_device *dev)
-{
-	return;
-}
-
-static void uml_net_tx_timeout(struct net_device *dev, unsigned int txqueue)
-{
-	netif_trans_update(dev);
-	netif_wake_queue(dev);
-}
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static void uml_net_poll_controller(struct net_device *dev)
-{
-	disable_irq(dev->irq);
-	uml_net_interrupt(dev->irq, dev);
-	enable_irq(dev->irq);
-}
-#endif
-
-static void uml_net_get_drvinfo(struct net_device *dev,
-				struct ethtool_drvinfo *info)
-{
-	strscpy(info->driver, DRIVER_NAME);
-}
-
-static const struct ethtool_ops uml_net_ethtool_ops = {
-	.get_drvinfo	= uml_net_get_drvinfo,
-	.get_link	= ethtool_op_get_link,
-	.get_ts_info	= ethtool_op_get_ts_info,
-};
-
-void uml_net_setup_etheraddr(struct net_device *dev, char *str)
-{
-	u8 addr[ETH_ALEN];
-	char *end;
-	int i;
-
-	if (str == NULL)
-		goto random;
-
-	for (i = 0; i < 6; i++) {
-		addr[i] = simple_strtoul(str, &end, 16);
-		if ((end == str) ||
-		   ((*end != ':') && (*end != ',') && (*end != '\0'))) {
-			printk(KERN_ERR
-			       "setup_etheraddr: failed to parse '%s' "
-			       "as an ethernet address\n", str);
-			goto random;
-		}
-		str = end + 1;
-	}
-	if (is_multicast_ether_addr(addr)) {
-		printk(KERN_ERR
-		       "Attempt to assign a multicast ethernet address to a "
-		       "device disallowed\n");
-		goto random;
-	}
-	if (!is_valid_ether_addr(addr)) {
-		printk(KERN_ERR
-		       "Attempt to assign an invalid ethernet address to a "
-		       "device disallowed\n");
-		goto random;
-	}
-	if (!is_local_ether_addr(addr)) {
-		printk(KERN_WARNING
-		       "Warning: Assigning a globally valid ethernet "
-		       "address to a device\n");
-		printk(KERN_WARNING "You should set the 2nd rightmost bit in "
-		       "the first byte of the MAC,\n");
-		printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n",
-		       addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4],
-		       addr[5]);
-	}
-	eth_hw_addr_set(dev, addr);
-	return;
-
-random:
-	printk(KERN_INFO
-	       "Choosing a random ethernet address for device %s\n", dev->name);
-	eth_hw_addr_random(dev);
-}
-
-static DEFINE_SPINLOCK(devices_lock);
-static LIST_HEAD(devices);
-
-static struct platform_driver uml_net_driver = {
-	.driver = {
-		.name  = DRIVER_NAME,
-	},
-};
-
-static void net_device_release(struct device *dev)
-{
-	struct uml_net *device = container_of(dev, struct uml_net, pdev.dev);
-	struct net_device *netdev = device->dev;
-	struct uml_net_private *lp = netdev_priv(netdev);
-
-	if (lp->remove != NULL)
-		(*lp->remove)(&lp->user);
-	list_del(&device->list);
-	kfree(device);
-	free_netdev(netdev);
-}
-
-static const struct net_device_ops uml_netdev_ops = {
-	.ndo_open 		= uml_net_open,
-	.ndo_stop 		= uml_net_close,
-	.ndo_start_xmit 	= uml_net_start_xmit,
-	.ndo_set_rx_mode	= uml_net_set_multicast_list,
-	.ndo_tx_timeout 	= uml_net_tx_timeout,
-	.ndo_set_mac_address	= eth_mac_addr,
-	.ndo_validate_addr	= eth_validate_addr,
-#ifdef CONFIG_NET_POLL_CONTROLLER
-	.ndo_poll_controller = uml_net_poll_controller,
-#endif
-};
-
-/*
- * Ensures that platform_driver_register is called only once by
- * eth_configure.  Will be set in an initcall.
- */
-static int driver_registered;
-
-static void eth_configure(int n, void *init, char *mac,
-			  struct transport *transport, gfp_t gfp_mask)
-{
-	struct uml_net *device;
-	struct net_device *dev;
-	struct uml_net_private *lp;
-	int err, size;
-
-	size = transport->private_size + sizeof(struct uml_net_private);
-
-	device = kzalloc(sizeof(*device), gfp_mask);
-	if (device == NULL) {
-		printk(KERN_ERR "eth_configure failed to allocate struct "
-		       "uml_net\n");
-		return;
-	}
-
-	dev = alloc_etherdev(size);
-	if (dev == NULL) {
-		printk(KERN_ERR "eth_configure: failed to allocate struct "
-		       "net_device for eth%d\n", n);
-		goto out_free_device;
-	}
-
-	INIT_LIST_HEAD(&device->list);
-	device->index = n;
-
-	/* If this name ends up conflicting with an existing registered
-	 * netdevice, that is OK, register_netdev{,ice}() will notice this
-	 * and fail.
-	 */
-	snprintf(dev->name, sizeof(dev->name), "eth%d", n);
-
-	uml_net_setup_etheraddr(dev, mac);
-
-	printk(KERN_INFO "Netdevice %d (%pM) : ", n, dev->dev_addr);
-
-	lp = netdev_priv(dev);
-	/* This points to the transport private data. It's still clear, but we
-	 * must memset it to 0 *now*. Let's help the drivers. */
-	memset(lp, 0, size);
-	INIT_WORK(&lp->work, uml_dev_close);
-
-	/* sysfs register */
-	if (!driver_registered) {
-		platform_driver_register(&uml_net_driver);
-		driver_registered = 1;
-	}
-	device->pdev.id = n;
-	device->pdev.name = DRIVER_NAME;
-	device->pdev.dev.release = net_device_release;
-	dev_set_drvdata(&device->pdev.dev, device);
-	if (platform_device_register(&device->pdev))
-		goto out_free_netdev;
-	SET_NETDEV_DEV(dev,&device->pdev.dev);
-
-	device->dev = dev;
-
-	/*
-	 * These just fill in a data structure, so there's no failure
-	 * to be worried about.
-	 */
-	(*transport->kern->init)(dev, init);
-
-	*lp = ((struct uml_net_private)
-		{ .list  		= LIST_HEAD_INIT(lp->list),
-		  .dev 			= dev,
-		  .fd 			= -1,
-		  .mac 			= { 0xfe, 0xfd, 0x0, 0x0, 0x0, 0x0},
-		  .max_packet		= transport->user->max_packet,
-		  .protocol 		= transport->kern->protocol,
-		  .open 		= transport->user->open,
-		  .close 		= transport->user->close,
-		  .remove 		= transport->user->remove,
-		  .read 		= transport->kern->read,
-		  .write 		= transport->kern->write,
-		  .add_address 		= transport->user->add_address,
-		  .delete_address  	= transport->user->delete_address });
-
-	spin_lock_init(&lp->lock);
-	memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac));
-
-	if ((transport->user->init != NULL) &&
-	    ((*transport->user->init)(&lp->user, dev) != 0))
-		goto out_unregister;
-
-	dev->mtu = transport->user->mtu;
-	dev->netdev_ops = &uml_netdev_ops;
-	dev->ethtool_ops = &uml_net_ethtool_ops;
-	dev->watchdog_timeo = (HZ >> 1);
-	dev->irq = UM_ETH_IRQ;
-
-	err = update_drop_skb(lp->max_packet);
-	if (err)
-		goto out_undo_user_init;
-
-	rtnl_lock();
-	err = register_netdevice(dev);
-	rtnl_unlock();
-	if (err)
-		goto out_undo_user_init;
-
-	spin_lock(&devices_lock);
-	list_add(&device->list, &devices);
-	spin_unlock(&devices_lock);
-
-	return;
-
-out_undo_user_init:
-	if (transport->user->remove != NULL)
-		(*transport->user->remove)(&lp->user);
-out_unregister:
-	platform_device_unregister(&device->pdev);
-	return; /* platform_device_unregister frees dev and device */
-out_free_netdev:
-	free_netdev(dev);
-out_free_device:
-	kfree(device);
-}
-
-static struct uml_net *find_device(int n)
-{
-	struct uml_net *device;
-	struct list_head *ele;
-
-	spin_lock(&devices_lock);
-	list_for_each(ele, &devices) {
-		device = list_entry(ele, struct uml_net, list);
-		if (device->index == n)
-			goto out;
-	}
-	device = NULL;
- out:
-	spin_unlock(&devices_lock);
-	return device;
-}
-
-static int eth_parse(char *str, int *index_out, char **str_out,
-		     char **error_out)
-{
-	char *end;
-	int n, err = -EINVAL;
-
-	n = simple_strtoul(str, &end, 0);
-	if (end == str) {
-		*error_out = "Bad device number";
-		return err;
-	}
-
-	str = end;
-	if (*str != '=') {
-		*error_out = "Expected '=' after device number";
-		return err;
-	}
-
-	str++;
-	if (find_device(n)) {
-		*error_out = "Device already configured";
-		return err;
-	}
-
-	*index_out = n;
-	*str_out = str;
-	return 0;
-}
-
-struct eth_init {
-	struct list_head list;
-	char *init;
-	int index;
-};
-
-static DEFINE_SPINLOCK(transports_lock);
-static LIST_HEAD(transports);
-
-/* Filled in during early boot */
-static LIST_HEAD(eth_cmd_line);
-
-static int check_transport(struct transport *transport, char *eth, int n,
-			   void **init_out, char **mac_out, gfp_t gfp_mask)
-{
-	int len;
-
-	len = strlen(transport->name);
-	if (strncmp(eth, transport->name, len))
-		return 0;
-
-	eth += len;
-	if (*eth == ',')
-		eth++;
-	else if (*eth != '\0')
-		return 0;
-
-	*init_out = kmalloc(transport->setup_size, gfp_mask);
-	if (*init_out == NULL)
-		return 1;
-
-	if (!transport->setup(eth, mac_out, *init_out)) {
-		kfree(*init_out);
-		*init_out = NULL;
-	}
-	return 1;
-}
-
-void register_transport(struct transport *new)
-{
-	struct list_head *ele, *next;
-	struct eth_init *eth;
-	void *init;
-	char *mac = NULL;
-	int match;
-
-	spin_lock(&transports_lock);
-	BUG_ON(!list_empty(&new->list));
-	list_add(&new->list, &transports);
-	spin_unlock(&transports_lock);
-
-	list_for_each_safe(ele, next, &eth_cmd_line) {
-		eth = list_entry(ele, struct eth_init, list);
-		match = check_transport(new, eth->init, eth->index, &init,
-					&mac, GFP_KERNEL);
-		if (!match)
-			continue;
-		else if (init != NULL) {
-			eth_configure(eth->index, init, mac, new, GFP_KERNEL);
-			kfree(init);
-		}
-		list_del(&eth->list);
-	}
-}
-
-static int eth_setup_common(char *str, int index)
-{
-	struct list_head *ele;
-	struct transport *transport;
-	void *init;
-	char *mac = NULL;
-	int found = 0;
-
-	spin_lock(&transports_lock);
-	list_for_each(ele, &transports) {
-		transport = list_entry(ele, struct transport, list);
-	        if (!check_transport(transport, str, index, &init,
-					&mac, GFP_ATOMIC))
-			continue;
-		if (init != NULL) {
-			eth_configure(index, init, mac, transport, GFP_ATOMIC);
-			kfree(init);
-		}
-		found = 1;
-		break;
-	}
-
-	spin_unlock(&transports_lock);
-	return found;
-}
-
-static int __init eth_setup(char *str)
-{
-	struct eth_init *new;
-	char *error;
-	int n, err;
-
-	err = eth_parse(str, &n, &str, &error);
-	if (err) {
-		printk(KERN_ERR "eth_setup - Couldn't parse '%s' : %s\n",
-		       str, error);
-		return 1;
-	}
-
-	new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES);
-
-	INIT_LIST_HEAD(&new->list);
-	new->index = n;
-	new->init = str;
-
-	list_add_tail(&new->list, &eth_cmd_line);
-	return 1;
-}
-
-__setup("eth", eth_setup);
-__uml_help(eth_setup,
-"eth[0-9]+=<transport>,<options>\n"
-"    Configure a network device.\n\n"
-);
-
-static int net_config(char *str, char **error_out)
-{
-	int n, err;
-
-	err = eth_parse(str, &n, &str, error_out);
-	if (err)
-		return err;
-
-	/* This string is broken up and the pieces used by the underlying
-	 * driver.  So, it is freed only if eth_setup_common fails.
-	 */
-	str = kstrdup(str, GFP_KERNEL);
-	if (str == NULL) {
-	        *error_out = "net_config failed to strdup string";
-		return -ENOMEM;
-	}
-	err = !eth_setup_common(str, n);
-	if (err)
-		kfree(str);
-	return err;
-}
-
-static int net_id(char **str, int *start_out, int *end_out)
-{
-	char *end;
-	int n;
-
-	n = simple_strtoul(*str, &end, 0);
-	if ((*end != '\0') || (end == *str))
-		return -1;
-
-	*start_out = n;
-	*end_out = n;
-	*str = end;
-	return n;
-}
-
-static int net_remove(int n, char **error_out)
-{
-	struct uml_net *device;
-	struct net_device *dev;
-	struct uml_net_private *lp;
-
-	device = find_device(n);
-	if (device == NULL)
-		return -ENODEV;
-
-	dev = device->dev;
-	lp = netdev_priv(dev);
-	if (lp->fd > 0)
-		return -EBUSY;
-	unregister_netdev(dev);
-	platform_device_unregister(&device->pdev);
-
-	return 0;
-}
-
-static struct mc_device net_mc = {
-	.list		= LIST_HEAD_INIT(net_mc.list),
-	.name		= "eth",
-	.config		= net_config,
-	.get_config	= NULL,
-	.id		= net_id,
-	.remove		= net_remove,
-};
-
-#ifdef CONFIG_INET
-static int uml_inetaddr_event(struct notifier_block *this, unsigned long event,
-			      void *ptr)
-{
-	struct in_ifaddr *ifa = ptr;
-	struct net_device *dev = ifa->ifa_dev->dev;
-	struct uml_net_private *lp;
-	void (*proc)(unsigned char *, unsigned char *, void *);
-	unsigned char addr_buf[4], netmask_buf[4];
-
-	if (dev->netdev_ops->ndo_open != uml_net_open)
-		return NOTIFY_DONE;
-
-	lp = netdev_priv(dev);
-
-	proc = NULL;
-	switch (event) {
-	case NETDEV_UP:
-		proc = lp->add_address;
-		break;
-	case NETDEV_DOWN:
-		proc = lp->delete_address;
-		break;
-	}
-	if (proc != NULL) {
-		memcpy(addr_buf, &ifa->ifa_address, sizeof(addr_buf));
-		memcpy(netmask_buf, &ifa->ifa_mask, sizeof(netmask_buf));
-		(*proc)(addr_buf, netmask_buf, &lp->user);
-	}
-	return NOTIFY_DONE;
-}
-
-/* uml_net_init shouldn't be called twice on two CPUs at the same time */
-static struct notifier_block uml_inetaddr_notifier = {
-	.notifier_call		= uml_inetaddr_event,
-};
-
-static void inet_register(void)
-{
-	struct list_head *ele;
-	struct uml_net_private *lp;
-	struct in_device *ip;
-	struct in_ifaddr *in;
-
-	register_inetaddr_notifier(&uml_inetaddr_notifier);
-
-	/* Devices may have been opened already, so the uml_inetaddr_notifier
-	 * didn't get a chance to run for them.  This fakes it so that
-	 * addresses which have already been set up get handled properly.
-	 */
-	spin_lock(&opened_lock);
-	list_for_each(ele, &opened) {
-		lp = list_entry(ele, struct uml_net_private, list);
-		ip = lp->dev->ip_ptr;
-		if (ip == NULL)
-			continue;
-		in = ip->ifa_list;
-		while (in != NULL) {
-			uml_inetaddr_event(NULL, NETDEV_UP, in);
-			in = in->ifa_next;
-		}
-	}
-	spin_unlock(&opened_lock);
-}
-#else
-static inline void inet_register(void)
-{
-}
-#endif
-
-static int uml_net_init(void)
-{
-	mconsole_register_dev(&net_mc);
-	inet_register();
-	return 0;
-}
-
-__initcall(uml_net_init);
-
-static void close_devices(void)
-{
-	struct list_head *ele;
-	struct uml_net_private *lp;
-
-	spin_lock(&opened_lock);
-	list_for_each(ele, &opened) {
-		lp = list_entry(ele, struct uml_net_private, list);
-		um_free_irq(lp->dev->irq, lp->dev);
-		if ((lp->close != NULL) && (lp->fd >= 0))
-			(*lp->close)(lp->fd, &lp->user);
-		if (lp->remove != NULL)
-			(*lp->remove)(&lp->user);
-	}
-	spin_unlock(&opened_lock);
-}
-
-__uml_exitcall(close_devices);
-
-void iter_addresses(void *d, void (*cb)(unsigned char *, unsigned char *,
-					void *),
-		    void *arg)
-{
-	struct net_device *dev = d;
-	struct in_device *ip = dev->ip_ptr;
-	struct in_ifaddr *in;
-	unsigned char address[4], netmask[4];
-
-	if (ip == NULL) return;
-	in = ip->ifa_list;
-	while (in != NULL) {
-		memcpy(address, &in->ifa_address, sizeof(address));
-		memcpy(netmask, &in->ifa_mask, sizeof(netmask));
-		(*cb)(address, netmask, arg);
-		in = in->ifa_next;
-	}
-}
-
-int dev_netmask(void *d, void *m)
-{
-	struct net_device *dev = d;
-	struct in_device *ip = dev->ip_ptr;
-	struct in_ifaddr *in;
-	__be32 *mask_out = m;
-
-	if (ip == NULL)
-		return 1;
-
-	in = ip->ifa_list;
-	if (in == NULL)
-		return 1;
-
-	*mask_out = in->ifa_mask;
-	return 0;
-}
-
-void *get_output_buffer(int *len_out)
-{
-	void *ret;
-
-	ret = (void *) __get_free_pages(GFP_KERNEL, 0);
-	if (ret) *len_out = PAGE_SIZE;
-	else *len_out = 0;
-	return ret;
-}
-
-void free_output_buffer(void *buffer)
-{
-	free_pages((unsigned long) buffer, 0);
-}
-
-int tap_setup_common(char *str, char *type, char **dev_name, char **mac_out,
-		     char **gate_addr)
-{
-	char *remain;
-
-	remain = split_if_spec(str, dev_name, mac_out, gate_addr, NULL);
-	if (remain != NULL) {
-		printk(KERN_ERR "tap_setup_common - Extra garbage on "
-		       "specification : '%s'\n", remain);
-		return 1;
-	}
-
-	return 0;
-}
-
-unsigned short eth_protocol(struct sk_buff *skb)
-{
-	return eth_type_trans(skb, skb->dev);
-}
diff --git a/arch/um/drivers/net_user.c b/arch/um/drivers/net_user.c
deleted file mode 100644
index 4c9576452ab0..000000000000
--- a/arch/um/drivers/net_user.c
+++ /dev/null
@@ -1,271 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#include <stdio.h>
-#include <unistd.h>
-#include <stdarg.h>
-#include <errno.h>
-#include <stddef.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <sys/wait.h>
-#include <net_user.h>
-#include <os.h>
-#include <um_malloc.h>
-
-int tap_open_common(void *dev, char *gate_addr)
-{
-	int tap_addr[4];
-
-	if (gate_addr == NULL)
-		return 0;
-	if (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0],
-		  &tap_addr[1], &tap_addr[2], &tap_addr[3]) != 4) {
-		printk(UM_KERN_ERR "Invalid tap IP address - '%s'\n",
-		       gate_addr);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-void tap_check_ips(char *gate_addr, unsigned char *eth_addr)
-{
-	int tap_addr[4];
-
-	if ((gate_addr != NULL) &&
-	    (sscanf(gate_addr, "%d.%d.%d.%d", &tap_addr[0],
-		    &tap_addr[1], &tap_addr[2], &tap_addr[3]) == 4) &&
-	    (eth_addr[0] == tap_addr[0]) &&
-	    (eth_addr[1] == tap_addr[1]) &&
-	    (eth_addr[2] == tap_addr[2]) &&
-	    (eth_addr[3] == tap_addr[3])) {
-		printk(UM_KERN_ERR "The tap IP address and the UML eth IP "
-		       "address must be different\n");
-	}
-}
-
-/* Do reliable error handling as this fails frequently enough. */
-void read_output(int fd, char *output, int len)
-{
-	int remain, ret, expected;
-	char c;
-	char *str;
-
-	if (output == NULL) {
-		output = &c;
-		len = sizeof(c);
-	}
-
-	*output = '\0';
-	ret = read(fd, &remain, sizeof(remain));
-
-	if (ret != sizeof(remain)) {
-		if (ret < 0)
-			ret = -errno;
-		expected = sizeof(remain);
-		str = "length";
-		goto err;
-	}
-
-	while (remain != 0) {
-		expected = (remain < len) ? remain : len;
-		ret = read(fd, output, expected);
-		if (ret != expected) {
-			if (ret < 0)
-				ret = -errno;
-			str = "data";
-			goto err;
-		}
-		remain -= ret;
-	}
-
-	return;
-
-err:
-	if (ret < 0)
-		printk(UM_KERN_ERR "read_output - read of %s failed, "
-		       "errno = %d\n", str, -ret);
-	else
-		printk(UM_KERN_ERR "read_output - read of %s failed, read only "
-		       "%d of %d bytes\n", str, ret, expected);
-}
-
-int net_read(int fd, void *buf, int len)
-{
-	int n;
-
-	n = read(fd,  buf,  len);
-
-	if ((n < 0) && (errno == EAGAIN))
-		return 0;
-	else if (n == 0)
-		return -ENOTCONN;
-	return n;
-}
-
-int net_recvfrom(int fd, void *buf, int len)
-{
-	int n;
-
-	CATCH_EINTR(n = recvfrom(fd,  buf,  len, 0, NULL, NULL));
-	if (n < 0) {
-		if (errno == EAGAIN)
-			return 0;
-		return -errno;
-	}
-	else if (n == 0)
-		return -ENOTCONN;
-	return n;
-}
-
-int net_write(int fd, void *buf, int len)
-{
-	int n;
-
-	n = write(fd, buf, len);
-
-	if ((n < 0) && (errno == EAGAIN))
-		return 0;
-	else if (n == 0)
-		return -ENOTCONN;
-	return n;
-}
-
-int net_send(int fd, void *buf, int len)
-{
-	int n;
-
-	CATCH_EINTR(n = send(fd, buf, len, 0));
-	if (n < 0) {
-		if (errno == EAGAIN)
-			return 0;
-		return -errno;
-	}
-	else if (n == 0)
-		return -ENOTCONN;
-	return n;
-}
-
-int net_sendto(int fd, void *buf, int len, void *to, int sock_len)
-{
-	int n;
-
-	CATCH_EINTR(n = sendto(fd, buf, len, 0, (struct sockaddr *) to,
-			       sock_len));
-	if (n < 0) {
-		if (errno == EAGAIN)
-			return 0;
-		return -errno;
-	}
-	else if (n == 0)
-		return -ENOTCONN;
-	return n;
-}
-
-struct change_pre_exec_data {
-	int close_me;
-	int stdout_fd;
-};
-
-static void change_pre_exec(void *arg)
-{
-	struct change_pre_exec_data *data = arg;
-
-	close(data->close_me);
-	dup2(data->stdout_fd, 1);
-}
-
-static int change_tramp(char **argv, char *output, int output_len)
-{
-	int pid, fds[2], err;
-	struct change_pre_exec_data pe_data;
-
-	err = os_pipe(fds, 1, 0);
-	if (err < 0) {
-		printk(UM_KERN_ERR "change_tramp - pipe failed, err = %d\n",
-		       -err);
-		return err;
-	}
-	pe_data.close_me = fds[0];
-	pe_data.stdout_fd = fds[1];
-	pid = run_helper(change_pre_exec, &pe_data, argv);
-
-	if (pid > 0)	/* Avoid hang as we won't get data in failure case. */
-		read_output(fds[0], output, output_len);
-
-	close(fds[0]);
-	close(fds[1]);
-
-	if (pid > 0)
-		helper_wait(pid);
-	return pid;
-}
-
-static void change(char *dev, char *what, unsigned char *addr,
-		   unsigned char *netmask)
-{
-	char addr_buf[sizeof("255.255.255.255\0")];
-	char netmask_buf[sizeof("255.255.255.255\0")];
-	char version[sizeof("nnnnn\0")];
-	char *argv[] = { "uml_net", version, what, dev, addr_buf,
-			 netmask_buf, NULL };
-	char *output;
-	int output_len, pid;
-
-	sprintf(version, "%d", UML_NET_VERSION);
-	sprintf(addr_buf, "%d.%d.%d.%d", addr[0], addr[1], addr[2], addr[3]);
-	sprintf(netmask_buf, "%d.%d.%d.%d", netmask[0], netmask[1],
-		netmask[2], netmask[3]);
-
-	output_len = UM_KERN_PAGE_SIZE;
-	output = uml_kmalloc(output_len, UM_GFP_KERNEL);
-	if (output == NULL)
-		printk(UM_KERN_ERR "change : failed to allocate output "
-		       "buffer\n");
-
-	pid = change_tramp(argv, output, output_len);
-	if (pid < 0) {
-		kfree(output);
-		return;
-	}
-
-	if (output != NULL) {
-		printk("%s", output);
-		kfree(output);
-	}
-}
-
-void open_addr(unsigned char *addr, unsigned char *netmask, void *arg)
-{
-	change(arg, "add", addr, netmask);
-}
-
-void close_addr(unsigned char *addr, unsigned char *netmask, void *arg)
-{
-	change(arg, "del", addr, netmask);
-}
-
-char *split_if_spec(char *str, ...)
-{
-	char **arg, *end, *ret = NULL;
-	va_list ap;
-
-	va_start(ap, str);
-	while ((arg = va_arg(ap, char **)) != NULL) {
-		if (*str == '\0')
-			goto out;
-		end = strchr(str, ',');
-		if (end != str)
-			*arg = str;
-		if (end == NULL)
-			goto out;
-		*end++ = '\0';
-		str = end;
-	}
-	ret = str;
-out:
-	va_end(ap);
-	return ret;
-}
diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h
deleted file mode 100644
index 67b2e9a1f2e5..000000000000
--- a/arch/um/include/shared/net_kern.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __UM_NET_KERN_H
-#define __UM_NET_KERN_H
-
-#include <linux/netdevice.h>
-#include <linux/platform_device.h>
-#include <linux/skbuff.h>
-#include <linux/socket.h>
-#include <linux/list.h>
-#include <linux/workqueue.h>
-
-struct uml_net {
-	struct list_head list;
-	struct net_device *dev;
-	struct platform_device pdev;
-	int index;
-};
-
-struct uml_net_private {
-	struct list_head list;
-	spinlock_t lock;
-	struct net_device *dev;
-	struct timer_list tl;
-
-	struct work_struct work;
-	int fd;
-	unsigned char mac[ETH_ALEN];
-	int max_packet;
-	unsigned short (*protocol)(struct sk_buff *);
-	int (*open)(void *);
-	void (*close)(int, void *);
-	void (*remove)(void *);
-	int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
-	int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
-
-	void (*add_address)(unsigned char *, unsigned char *, void *);
-	void (*delete_address)(unsigned char *, unsigned char *, void *);
-	char user[];
-};
-
-struct net_kern_info {
-	void (*init)(struct net_device *, void *);
-	unsigned short (*protocol)(struct sk_buff *);
-	int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
-	int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
-};
-
-struct transport {
-	struct list_head list;
-	const char *name;
-	int (* const setup)(char *, char **, void *);
-	const struct net_user_info *user;
-	const struct net_kern_info *kern;
-	const int private_size;
-	const int setup_size;
-};
-
-extern int tap_setup_common(char *str, char *type, char **dev_name,
-			    char **mac_out, char **gate_addr);
-extern void register_transport(struct transport *new);
-extern unsigned short eth_protocol(struct sk_buff *skb);
-extern void uml_net_setup_etheraddr(struct net_device *dev, char *str);
-
-
-#endif
diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h
deleted file mode 100644
index ba92a4d93531..000000000000
--- a/arch/um/include/shared/net_user.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __UM_NET_USER_H__
-#define __UM_NET_USER_H__
-
-#define ETH_ADDR_LEN (6)
-#define ETH_HEADER_ETHERTAP (16)
-#define ETH_HEADER_OTHER (26) /* 14 for ethernet + VLAN + MPLS for crazy people */
-#define ETH_MAX_PACKET (1500)
-
-#define UML_NET_VERSION (4)
-
-struct net_user_info {
-	int (*init)(void *, void *);
-	int (*open)(void *);
-	void (*close)(int, void *);
-	void (*remove)(void *);
-	void (*add_address)(unsigned char *, unsigned char *, void *);
-	void (*delete_address)(unsigned char *, unsigned char *, void *);
-	int max_packet;
-	int mtu;
-};
-
-extern void iter_addresses(void *d, void (*cb)(unsigned char *,
-					       unsigned char *, void *),
-			   void *arg);
-
-extern void *get_output_buffer(int *len_out);
-extern void free_output_buffer(void *buffer);
-
-extern int tap_open_common(void *dev, char *gate_addr);
-extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr);
-
-extern void read_output(int fd, char *output_out, int len);
-
-extern int net_read(int fd, void *buf, int len);
-extern int net_recvfrom(int fd, void *buf, int len);
-extern int net_write(int fd, void *buf, int len);
-extern int net_send(int fd, void *buf, int len);
-extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len);
-
-extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg);
-extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg);
-
-extern char *split_if_spec(char *str, ...);
-
-extern int dev_netmask(void *d, void *m);
-
-#endif
-- 
cgit v1.2.3


From 788aa64c01f1262310b4c1fb827a36df170d86ea Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben.dooks@codethink.co.uk>
Date: Thu, 10 Apr 2025 07:05:22 +0000
Subject: riscv: save the SR_SUM status over switches

When threads/tasks are switched we need to ensure the old execution's
SR_SUM state is saved and the new thread has the old SR_SUM state
restored.

The issue was seen under heavy load especially with the syz-stress tool
running, with crashes as follows in schedule_tail:

Unable to handle kernel access to user memory without uaccess routines
at virtual address 000000002749f0d0
Oops [#1]
Modules linked in:
CPU: 1 PID: 4875 Comm: syz-executor.0 Not tainted
5.12.0-rc2-syzkaller-00467-g0d7588ab9ef9 #0
Hardware name: riscv-virtio,qemu (DT)
epc : schedule_tail+0x72/0xb2 kernel/sched/core.c:4264
 ra : task_pid_vnr include/linux/sched.h:1421 [inline]
 ra : schedule_tail+0x70/0xb2 kernel/sched/core.c:4264
epc : ffffffe00008c8b0 ra : ffffffe00008c8ae sp : ffffffe025d17ec0
 gp : ffffffe005d25378 tp : ffffffe00f0d0000 t0 : 0000000000000000
 t1 : 0000000000000001 t2 : 00000000000f4240 s0 : ffffffe025d17ee0
 s1 : 000000002749f0d0 a0 : 000000000000002a a1 : 0000000000000003
 a2 : 1ffffffc0cfac500 a3 : ffffffe0000c80cc a4 : 5ae9db91c19bbe00
 a5 : 0000000000000000 a6 : 0000000000f00000 a7 : ffffffe000082eba
 s2 : 0000000000040000 s3 : ffffffe00eef96c0 s4 : ffffffe022c77fe0
 s5 : 0000000000004000 s6 : ffffffe067d74e00 s7 : ffffffe067d74850
 s8 : ffffffe067d73e18 s9 : ffffffe067d74e00 s10: ffffffe00eef96e8
 s11: 000000ae6cdf8368 t3 : 5ae9db91c19bbe00 t4 : ffffffc4043cafb2
 t5 : ffffffc4043cafba t6 : 0000000000040000
status: 0000000000000120 badaddr: 000000002749f0d0 cause:
000000000000000f
Call Trace:
[<ffffffe00008c8b0>] schedule_tail+0x72/0xb2 kernel/sched/core.c:4264
[<ffffffe000005570>] ret_from_exception+0x0/0x14
Dumping ftrace buffer:
   (ftrace buffer empty)
---[ end trace b5f8f9231dc87dda ]---

The issue comes from the put_user() in schedule_tail
(kernel/sched/core.c) doing the following:

asmlinkage __visible void schedule_tail(struct task_struct *prev)
{
...
        if (current->set_child_tid)
                put_user(task_pid_vnr(current), current->set_child_tid);
...
}

the put_user() macro causes the code sequence to come out as follows:

1:	__enable_user_access()
2:	reg = task_pid_vnr(current);
3:	*current->set_child_tid = reg;
4:	__disable_user_access()

The problem is that we may have a sleeping function as argument which
could clear SR_SUM causing the panic above. This was fixed by
evaluating the argument of the put_user() macro outside the user-enabled
section in commit 285a76bb2cf5 ("riscv: evaluate put_user() arg before
enabling user access")"

In order for riscv to take advantage of unsafe_get/put_XXX() macros and
to avoid the same issue we had with put_user() and sleeping functions we
must ensure code flow can go through switch_to() from within a region of
code with SR_SUM enabled and come back with SR_SUM still enabled. This
patch addresses the problem allowing future work to enable full use of
unsafe_get/put_XXX() macros without needing to take a CSR bit flip cost
on every access. Make switch_to() save and restore SR_SUM.

Reported-by: syzbot+e74b94fe601ab9552d69@syzkaller.appspotmail.com
Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Link: https://lore.kernel.org/r/20250410070526.3160847-2-cyrilbur@tenstorrent.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/processor.h | 1 +
 arch/riscv/kernel/asm-offsets.c    | 5 +++++
 arch/riscv/kernel/entry.S          | 8 ++++++++
 3 files changed, 14 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 5f56eb9d114a..58fd11c89fe9 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -103,6 +103,7 @@ struct thread_struct {
 	struct __riscv_d_ext_state fstate;
 	unsigned long bad_cause;
 	unsigned long envcfg;
+	unsigned long status;
 	u32 riscv_v_flags;
 	u32 vstate_ctrl;
 	struct __riscv_v_ext_state vstate;
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 16490755304e..969c65b1fe41 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -34,6 +34,7 @@ void asm_offsets(void)
 	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
 	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
 	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
+	OFFSET(TASK_THREAD_STATUS, task_struct, thread.status);
 
 	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
 	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
@@ -346,6 +347,10 @@ void asm_offsets(void)
 		  offsetof(struct task_struct, thread.s[11])
 		- offsetof(struct task_struct, thread.ra)
 	);
+	DEFINE(TASK_THREAD_STATUS_RA,
+		  offsetof(struct task_struct, thread.status)
+		- offsetof(struct task_struct, thread.ra)
+	);
 
 	DEFINE(TASK_THREAD_F0_F0,
 		  offsetof(struct task_struct, thread.fstate.f[0])
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 33a5a9f2a0d4..00bd0de9faa2 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -397,9 +397,17 @@ SYM_FUNC_START(__switch_to)
 	REG_S s9,  TASK_THREAD_S9_RA(a3)
 	REG_S s10, TASK_THREAD_S10_RA(a3)
 	REG_S s11, TASK_THREAD_S11_RA(a3)
+
+	/* save the user space access flag */
+	li    s0, SR_SUM
+	csrr  s1, CSR_STATUS
+	REG_S s1, TASK_THREAD_STATUS_RA(a3)
+
 	/* Save the kernel shadow call stack pointer */
 	scs_save_current
 	/* Restore context from next->thread */
+	REG_L s0,  TASK_THREAD_STATUS_RA(a4)
+	csrs  CSR_STATUS, s0
 	REG_L ra,  TASK_THREAD_RA_RA(a4)
 	REG_L sp,  TASK_THREAD_SP_RA(a4)
 	REG_L s0,  TASK_THREAD_S0_RA(a4)
-- 
cgit v1.2.3


From 19500c6dbc5c348564a6513c801ab0889300565a Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Thu, 10 Apr 2025 07:05:23 +0000
Subject: riscv: implement user_access_begin() and families

Currently, when a function like strncpy_from_user() is called,
the userspace access protection is disabled and enabled
for every word read.

By implementing user_access_begin() and families, the protection
is disabled at the beginning of the copy and enabled at the end.

The __inttype macro is borrowed from x86 implementation.

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250410070526.3160847-3-cyrilbur@tenstorrent.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/uaccess.h | 76 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index fee56b0c8058..c9a461467bf4 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -61,6 +61,19 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigne
 #define __disable_user_access()							\
 	__asm__ __volatile__ ("csrc sstatus, %0" : : "r" (SR_SUM) : "memory")
 
+/*
+ * This is the smallest unsigned integer type that can fit a value
+ * (up to 'long long')
+ */
+#define __inttype(x) __typeof__(		\
+	__typefits(x, char,			\
+	  __typefits(x, short,			\
+	    __typefits(x, int,			\
+	      __typefits(x, long, 0ULL)))))
+
+#define __typefits(x, type, not) \
+	__builtin_choose_expr(sizeof(x) <= sizeof(type), (unsigned type)0, not)
+
 /*
  * The exception table consists of pairs of addresses: the first is the
  * address of an instruction that is allowed to fault, and the second is
@@ -368,6 +381,69 @@ do {									\
 		goto err_label;						\
 } while (0)
 
+static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
+{
+	if (unlikely(!access_ok(ptr, len)))
+		return 0;
+	__enable_user_access();
+	return 1;
+}
+#define user_access_begin user_access_begin
+#define user_access_end __disable_user_access
+
+static inline unsigned long user_access_save(void) { return 0UL; }
+static inline void user_access_restore(unsigned long enabled) { }
+
+/*
+ * We want the unsafe accessors to always be inlined and use
+ * the error labels - thus the macro games.
+ */
+#define unsafe_put_user(x, ptr, label)	do {				\
+	long __err = 0;							\
+	__put_user_nocheck(x, (ptr), __err);				\
+	if (__err)							\
+		goto label;						\
+} while (0)
+
+#define unsafe_get_user(x, ptr, label)	do {				\
+	long __err = 0;							\
+	__inttype(*(ptr)) __gu_val;					\
+	__get_user_nocheck(__gu_val, (ptr), __err);			\
+	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
+	if (__err)							\
+		goto label;						\
+} while (0)
+
+#define unsafe_copy_loop(dst, src, len, type, op, label)		\
+	while (len >= sizeof(type)) {					\
+		op(*(type *)(src), (type __user *)(dst), label);	\
+		dst += sizeof(type);					\
+		src += sizeof(type);					\
+		len -= sizeof(type);					\
+	}
+
+#define unsafe_copy_to_user(_dst, _src, _len, label)			\
+do {									\
+	char __user *__ucu_dst = (_dst);				\
+	const char *__ucu_src = (_src);					\
+	size_t __ucu_len = (_len);					\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, unsafe_put_user, label);	\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, unsafe_put_user, label);	\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, unsafe_put_user, label);	\
+	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, unsafe_put_user, label);	\
+} while (0)
+
+#define unsafe_copy_from_user(_dst, _src, _len, label)			\
+do {									\
+	char *__ucu_dst = (_dst);					\
+	const char __user *__ucu_src = (_src);				\
+	size_t __ucu_len = (_len);					\
+	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u64, unsafe_get_user, label);	\
+	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u32, unsafe_get_user, label);	\
+	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u16, unsafe_get_user, label);	\
+	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u8, unsafe_get_user, label);	\
+} while (0)
+
 #else /* CONFIG_MMU */
 #include <asm-generic/uaccess.h>
 #endif /* CONFIG_MMU */
-- 
cgit v1.2.3


From 62135bf660b2c3887e22f33d3adbefedb4dc9c7a Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Thu, 10 Apr 2025 07:05:24 +0000
Subject: riscv: uaccess: use input constraints for ptr of __put_user()

Putting ptr in the inputs as opposed to output may seem incorrect but
this is done for a few reasons:
- Not having it in the output permits the use of asm goto in a
  subsequent patch. There are bugs in gcc [1] which would otherwise
  prevent it.
- Since the output memory is userspace there isn't any real benefit from
  telling the compiler about the memory clobber.
- x86, arm and powerpc all use this technique.

Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 # 1

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
[Cyril Bur: Rewritten commit message]
Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250410070526.3160847-4-cyrilbur@tenstorrent.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/uaccess.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index c9a461467bf4..da36057847f0 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -219,11 +219,11 @@ do {								\
 	__typeof__(*(ptr)) __x = x;				\
 	__asm__ __volatile__ (					\
 		"1:\n"						\
-		"	" insn " %z2, %1\n"			\
+		"	" insn " %z1, %2\n"			\
 		"2:\n"						\
 		_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %0)		\
-		: "+r" (err), "=m" (*(ptr))			\
-		: "rJ" (__x));					\
+		: "+r" (err)					\
+		: "rJ" (__x), "m"(*(ptr)));			\
 } while (0)
 
 #ifdef CONFIG_64BIT
@@ -236,16 +236,16 @@ do {								\
 	u64 __x = (__typeof__((x)-(x)))(x);			\
 	__asm__ __volatile__ (					\
 		"1:\n"						\
-		"	sw %z3, %1\n"				\
+		"	sw %z1, %3\n"				\
 		"2:\n"						\
-		"	sw %z4, %2\n"				\
+		"	sw %z2, %4\n"				\
 		"3:\n"						\
 		_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %0)		\
 		_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %0)		\
-		: "+r" (err),					\
-			"=m" (__ptr[__LSW]),			\
-			"=m" (__ptr[__MSW])			\
-		: "rJ" (__x), "rJ" (__x >> 32));		\
+		: "+r" (err)					\
+		: "rJ" (__x), "rJ" (__x >> 32),			\
+			"m" (__ptr[__LSW]),			\
+			"m" (__ptr[__MSW]));			\
 } while (0)
 #endif /* CONFIG_64BIT */
 
-- 
cgit v1.2.3


From cdf647e817143c9762c5bdf724ca2821a171f011 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Thu, 10 Apr 2025 07:05:25 +0000
Subject: riscv: uaccess: use 'asm goto' for put_user()

With 'asm goto' we don't need to test the error etc, the exception just
jumps to the error handling directly.

Because there are no output clobbers which could trigger gcc bugs [1]
the use of asm_goto_output() macro is not necessary here. Not using
asm_goto_output() is desirable as the generated output asm will be
cleaner.

Use of the volatile keyword is redundant as per gcc 14.2.0 manual section
6.48.2.7 Goto Labels:
> Also note that an asm goto statement is always implicitly considered
  volatile.

Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 # 1

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
[Cyril Bur: Rewritten commit message]
Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250410070526.3160847-5-cyrilbur@tenstorrent.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/uaccess.h | 71 +++++++++++++++++++---------------------
 1 file changed, 33 insertions(+), 38 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index da36057847f0..719c9179a751 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -214,61 +214,66 @@ do {								\
 		((x) = (__force __typeof__(x))0, -EFAULT);	\
 })
 
-#define __put_user_asm(insn, x, ptr, err)			\
+#define __put_user_asm(insn, x, ptr, label)			\
 do {								\
 	__typeof__(*(ptr)) __x = x;				\
-	__asm__ __volatile__ (					\
+	asm goto(						\
 		"1:\n"						\
-		"	" insn " %z1, %2\n"			\
-		"2:\n"						\
-		_ASM_EXTABLE_UACCESS_ERR(1b, 2b, %0)		\
-		: "+r" (err)					\
-		: "rJ" (__x), "m"(*(ptr)));			\
+		"	" insn " %z0, %1\n"			\
+		_ASM_EXTABLE(1b, %l2)				\
+		: : "rJ" (__x), "m"(*(ptr)) : : label);		\
 } while (0)
 
 #ifdef CONFIG_64BIT
-#define __put_user_8(x, ptr, err) \
-	__put_user_asm("sd", x, ptr, err)
+#define __put_user_8(x, ptr, label) \
+	__put_user_asm("sd", x, ptr, label)
 #else /* !CONFIG_64BIT */
-#define __put_user_8(x, ptr, err)				\
+#define __put_user_8(x, ptr, label)				\
 do {								\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
 	u64 __x = (__typeof__((x)-(x)))(x);			\
-	__asm__ __volatile__ (					\
+	asm goto(						\
 		"1:\n"						\
-		"	sw %z1, %3\n"				\
+		"	sw %z0, %2\n"				\
 		"2:\n"						\
-		"	sw %z2, %4\n"				\
-		"3:\n"						\
-		_ASM_EXTABLE_UACCESS_ERR(1b, 3b, %0)		\
-		_ASM_EXTABLE_UACCESS_ERR(2b, 3b, %0)		\
-		: "+r" (err)					\
-		: "rJ" (__x), "rJ" (__x >> 32),			\
+		"	sw %z1, %3\n"				\
+		_ASM_EXTABLE(1b, %l4)				\
+		_ASM_EXTABLE(2b, %l4)				\
+		: : "rJ" (__x), "rJ" (__x >> 32),		\
 			"m" (__ptr[__LSW]),			\
-			"m" (__ptr[__MSW]));			\
+			"m" (__ptr[__MSW]) : : label);		\
 } while (0)
 #endif /* CONFIG_64BIT */
 
-#define __put_user_nocheck(x, __gu_ptr, __pu_err)					\
+#define __put_user_nocheck(x, __gu_ptr, label)			\
 do {								\
 	switch (sizeof(*__gu_ptr)) {				\
 	case 1:							\
-		__put_user_asm("sb", (x), __gu_ptr, __pu_err);	\
+		__put_user_asm("sb", (x), __gu_ptr, label);	\
 		break;						\
 	case 2:							\
-		__put_user_asm("sh", (x), __gu_ptr, __pu_err);	\
+		__put_user_asm("sh", (x), __gu_ptr, label);	\
 		break;						\
 	case 4:							\
-		__put_user_asm("sw", (x), __gu_ptr, __pu_err);	\
+		__put_user_asm("sw", (x), __gu_ptr, label);	\
 		break;						\
 	case 8:							\
-		__put_user_8((x), __gu_ptr, __pu_err);	\
+		__put_user_8((x), __gu_ptr, label);		\
 		break;						\
 	default:						\
 		BUILD_BUG();					\
 	}							\
 } while (0)
 
+#define __put_user_error(x, ptr, err)				\
+do {								\
+	__label__ err_label;					\
+	__put_user_nocheck(x, ptr, err_label);			\
+	break;							\
+err_label:							\
+	(err) = -EFAULT;					\
+} while (0)
+
 /**
  * __put_user: - Write a simple value into user space, with less checking.
  * @x:   Value to copy to user space.
@@ -299,7 +304,7 @@ do {								\
 	__chk_user_ptr(__gu_ptr);				\
 								\
 	__enable_user_access();					\
-	__put_user_nocheck(__val, __gu_ptr, __pu_err);		\
+	__put_user_error(__val, __gu_ptr, __pu_err);		\
 	__disable_user_access();				\
 								\
 	__pu_err;						\
@@ -373,13 +378,7 @@ do {									\
 } while (0)
 
 #define __put_kernel_nofault(dst, src, type, err_label)			\
-do {									\
-	long __kr_err = 0;						\
-									\
-	__put_user_nocheck(*((type *)(src)), (type *)(dst), __kr_err);	\
-	if (unlikely(__kr_err))						\
-		goto err_label;						\
-} while (0)
+	__put_user_nocheck(*((type *)(src)), (type *)(dst), err_label)
 
 static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
 {
@@ -398,12 +397,8 @@ static inline void user_access_restore(unsigned long enabled) { }
  * We want the unsafe accessors to always be inlined and use
  * the error labels - thus the macro games.
  */
-#define unsafe_put_user(x, ptr, label)	do {				\
-	long __err = 0;							\
-	__put_user_nocheck(x, (ptr), __err);				\
-	if (__err)							\
-		goto label;						\
-} while (0)
+#define unsafe_put_user(x, ptr, label)					\
+	__put_user_nocheck(x, (ptr), label)
 
 #define unsafe_get_user(x, ptr, label)	do {				\
 	long __err = 0;							\
-- 
cgit v1.2.3


From f6bff7827a48e59cff1ef98aae72452d65174e0c Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Thu, 10 Apr 2025 07:05:26 +0000
Subject: riscv: uaccess: use 'asm_goto_output' for get_user()

With 'asm goto' we don't need to test the error etc, the exception just
jumps to the error handling directly.

Unlike put_user(), get_user() must work around GCC bugs [1] when using
output clobbers in an asm goto statement.

Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113921 # 1

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
[Cyril Bur: Rewritten commit message]
Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250410070526.3160847-6-cyrilbur@tenstorrent.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/uaccess.h | 95 ++++++++++++++++++++++++++++------------
 1 file changed, 68 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index 719c9179a751..87d01168f80a 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -96,27 +96,58 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigne
  * call.
  */
 
-#define __get_user_asm(insn, x, ptr, err)			\
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define __get_user_asm(insn, x, ptr, label)			\
+	asm_goto_output(					\
+		"1:\n"						\
+		"	" insn " %0, %1\n"			\
+		_ASM_EXTABLE_UACCESS_ERR(1b, %l2, %0)		\
+		: "=&r" (x)					\
+		: "m" (*(ptr)) : : label)
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+#define __get_user_asm(insn, x, ptr, label)			\
 do {								\
-	__typeof__(x) __x;					\
+	long __gua_err = 0;					\
 	__asm__ __volatile__ (					\
 		"1:\n"						\
 		"	" insn " %1, %2\n"			\
 		"2:\n"						\
 		_ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 2b, %0, %1)	\
-		: "+r" (err), "=&r" (__x)			\
+		: "+r" (__gua_err), "=&r" (x)			\
 		: "m" (*(ptr)));				\
-	(x) = __x;						\
+	if (__gua_err)						\
+		goto label;					\
 } while (0)
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
 
 #ifdef CONFIG_64BIT
-#define __get_user_8(x, ptr, err) \
-	__get_user_asm("ld", x, ptr, err)
+#define __get_user_8(x, ptr, label) \
+	__get_user_asm("ld", x, ptr, label)
 #else /* !CONFIG_64BIT */
-#define __get_user_8(x, ptr, err)				\
+
+#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#define __get_user_8(x, ptr, label)				\
+	u32 __user *__ptr = (u32 __user *)(ptr);		\
+	u32 __lo, __hi;						\
+	asm_goto_output(					\
+		"1:\n"						\
+		"	lw %0, %2\n"				\
+		"2:\n"						\
+		"	lw %1, %3\n"				\
+		_ASM_EXTABLE_UACCESS_ERR(1b, %l4, %0)		\
+		_ASM_EXTABLE_UACCESS_ERR(2b, %l4, %0)		\
+		: "=&r" (__lo), "=r" (__hi)			\
+		: "m" (__ptr[__LSW]), "m" (__ptr[__MSW])	\
+		: : label);                                     \
+	(x) = (__typeof__(x))((__typeof__((x) - (x)))(		\
+		(((u64)__hi << 32) | __lo)));			\
+
+#else /* !CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+#define __get_user_8(x, ptr, label)				\
 do {								\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
 	u32 __lo, __hi;						\
+	long __gu8_err = 0;					\
 	__asm__ __volatile__ (					\
 		"1:\n"						\
 		"	lw %1, %3\n"				\
@@ -125,35 +156,51 @@ do {								\
 		"3:\n"						\
 		_ASM_EXTABLE_UACCESS_ERR_ZERO(1b, 3b, %0, %1)	\
 		_ASM_EXTABLE_UACCESS_ERR_ZERO(2b, 3b, %0, %1)	\
-		: "+r" (err), "=&r" (__lo), "=r" (__hi)		\
+		: "+r" (__gu8_err), "=&r" (__lo), "=r" (__hi)	\
 		: "m" (__ptr[__LSW]), "m" (__ptr[__MSW]));	\
-	if (err)						\
+	if (__gu8_err) {					\
 		__hi = 0;					\
-	(x) = (__typeof__(x))((__typeof__((x)-(x)))(		\
+		goto label;					\
+	}							\
+	(x) = (__typeof__(x))((__typeof__((x) - (x)))(		\
 		(((u64)__hi << 32) | __lo)));			\
 } while (0)
+#endif /* CONFIG_CC_HAS_ASM_GOTO_OUTPUT */
+
 #endif /* CONFIG_64BIT */
 
-#define __get_user_nocheck(x, __gu_ptr, __gu_err)		\
+#define __get_user_nocheck(x, __gu_ptr, label)			\
 do {								\
 	switch (sizeof(*__gu_ptr)) {				\
 	case 1:							\
-		__get_user_asm("lb", (x), __gu_ptr, __gu_err);	\
+		__get_user_asm("lb", (x), __gu_ptr, label);	\
 		break;						\
 	case 2:							\
-		__get_user_asm("lh", (x), __gu_ptr, __gu_err);	\
+		__get_user_asm("lh", (x), __gu_ptr, label);	\
 		break;						\
 	case 4:							\
-		__get_user_asm("lw", (x), __gu_ptr, __gu_err);	\
+		__get_user_asm("lw", (x), __gu_ptr, label);	\
 		break;						\
 	case 8:							\
-		__get_user_8((x), __gu_ptr, __gu_err);	\
+		__get_user_8((x), __gu_ptr, label);		\
 		break;						\
 	default:						\
 		BUILD_BUG();					\
 	}							\
 } while (0)
 
+#define __get_user_error(x, ptr, err)					\
+do {									\
+	__label__ __gu_failed;						\
+									\
+	__get_user_nocheck(x, ptr, __gu_failed);			\
+		err = 0;						\
+		break;							\
+__gu_failed:								\
+		x = 0;							\
+		err = -EFAULT;						\
+} while (0)
+
 /**
  * __get_user: - Get a simple variable from user space, with less checking.
  * @x:   Variable to store result.
@@ -178,13 +225,16 @@ do {								\
 ({								\
 	const __typeof__(*(ptr)) __user *__gu_ptr = untagged_addr(ptr); \
 	long __gu_err = 0;					\
+	__typeof__(x) __gu_val;					\
 								\
 	__chk_user_ptr(__gu_ptr);				\
 								\
 	__enable_user_access();					\
-	__get_user_nocheck(x, __gu_ptr, __gu_err);		\
+	__get_user_error(__gu_val, __gu_ptr, __gu_err);		\
 	__disable_user_access();				\
 								\
+	(x) = __gu_val;						\
+								\
 	__gu_err;						\
 })
 
@@ -369,13 +419,7 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
 }
 
 #define __get_kernel_nofault(dst, src, type, err_label)			\
-do {									\
-	long __kr_err = 0;						\
-									\
-	__get_user_nocheck(*((type *)(dst)), (type *)(src), __kr_err);	\
-	if (unlikely(__kr_err))						\
-		goto err_label;						\
-} while (0)
+	__get_user_nocheck(*((type *)(dst)), (type *)(src), err_label)
 
 #define __put_kernel_nofault(dst, src, type, err_label)			\
 	__put_user_nocheck(*((type *)(src)), (type *)(dst), err_label)
@@ -401,12 +445,9 @@ static inline void user_access_restore(unsigned long enabled) { }
 	__put_user_nocheck(x, (ptr), label)
 
 #define unsafe_get_user(x, ptr, label)	do {				\
-	long __err = 0;							\
 	__inttype(*(ptr)) __gu_val;					\
-	__get_user_nocheck(__gu_val, (ptr), __err);			\
+	__get_user_nocheck(__gu_val, (ptr), label);			\
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
-	if (__err)							\
-		goto label;						\
 } while (0)
 
 #define unsafe_copy_loop(dst, src, len, type, op, label)		\
-- 
cgit v1.2.3


From 2940954c1ac527386e5203d4be8263d704491fbe Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Mon, 24 Feb 2025 19:20:40 +0800
Subject: riscv: vDSO: Remove --hash-style=both

When RISC-V borned, DT_GNU_HASH had already became the de-facto
standard so DT_HASH is just wasting storage space.  Remove the explicit
--hash-style=both setting and rely on the distro toolchain default,
which is most likely "gnu" (i.e. generating only DT_GNU_HASH, no
DT_HASH).

Following the logic of commit 48f6430505c0
("arm64/vdso: Remove --hash-style=sysv").

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Link: https://lore.kernel.org/r/20250224112042.60282-2-xry111@xry111.site
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/kernel/vdso/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index 7575ef088adc..8d12f5646eb5 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -59,7 +59,7 @@ $(obj)/vdso.o: $(obj)/vdso.so
 $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE
 	$(call if_changed,vdsold_and_check)
 LDFLAGS_vdso.so.dbg = -shared -soname=linux-vdso.so.1 \
-	--build-id=sha1 --hash-style=both --eh-frame-hdr
+	--build-id=sha1 --eh-frame-hdr
 
 # strip rule for the .so file
 $(obj)/%.so: OBJCOPYFLAGS := -S
-- 
cgit v1.2.3


From 2d147d77ae6e96c1c349a6ada0eac14111c3384a Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:29 +0800
Subject: riscv: Add SiFive xsfvqmaccdod and xsfvqmaccqoq vendor extensions

Add SiFive vendor extension support to the kernel with the target of
"xsfvqmaccdod" and "xsfvqmaccqoq".

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-3-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/Kconfig.vendor                         | 13 +++++++++++++
 arch/riscv/include/asm/vendor_extensions/sifive.h | 14 ++++++++++++++
 arch/riscv/kernel/vendor_extensions.c             | 10 ++++++++++
 arch/riscv/kernel/vendor_extensions/Makefile      |  1 +
 arch/riscv/kernel/vendor_extensions/sifive.c      | 19 +++++++++++++++++++
 5 files changed, 57 insertions(+)
 create mode 100644 arch/riscv/include/asm/vendor_extensions/sifive.h
 create mode 100644 arch/riscv/kernel/vendor_extensions/sifive.c

(limited to 'arch')

diff --git a/arch/riscv/Kconfig.vendor b/arch/riscv/Kconfig.vendor
index b096548fe0ff..e14f26368963 100644
--- a/arch/riscv/Kconfig.vendor
+++ b/arch/riscv/Kconfig.vendor
@@ -16,6 +16,19 @@ config RISCV_ISA_VENDOR_EXT_ANDES
 	  If you don't know what to do here, say Y.
 endmenu
 
+menu "SiFive"
+config RISCV_ISA_VENDOR_EXT_SIFIVE
+	bool "SiFive vendor extension support"
+	select RISCV_ISA_VENDOR_EXT
+	default y
+	help
+	  Say N here if you want to disable all SiFive vendor extension
+	  support. This will cause any SiFive vendor extensions that are
+	  requested by hardware probing to be ignored.
+
+	  If you don't know what to do here, say Y.
+endmenu
+
 menu "T-Head"
 config RISCV_ISA_VENDOR_EXT_THEAD
 	bool "T-Head vendor extension support"
diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h
new file mode 100644
index 000000000000..608004250e2e
--- /dev/null
+++ b/arch/riscv/include/asm/vendor_extensions/sifive.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_H
+#define _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_H
+
+#include <asm/vendor_extensions.h>
+
+#include <linux/types.h>
+
+#define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD		0
+#define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ		1
+
+extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive;
+
+#endif
diff --git a/arch/riscv/kernel/vendor_extensions.c b/arch/riscv/kernel/vendor_extensions.c
index 9feb7f67a0a3..92d8ff81f42c 100644
--- a/arch/riscv/kernel/vendor_extensions.c
+++ b/arch/riscv/kernel/vendor_extensions.c
@@ -6,6 +6,7 @@
 #include <asm/vendorid_list.h>
 #include <asm/vendor_extensions.h>
 #include <asm/vendor_extensions/andes.h>
+#include <asm/vendor_extensions/sifive.h>
 #include <asm/vendor_extensions/thead.h>
 
 #include <linux/array_size.h>
@@ -15,6 +16,9 @@ struct riscv_isa_vendor_ext_data_list *riscv_isa_vendor_ext_list[] = {
 #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_ANDES
 	&riscv_isa_vendor_ext_list_andes,
 #endif
+#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE
+	&riscv_isa_vendor_ext_list_sifive,
+#endif
 #ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD
 	&riscv_isa_vendor_ext_list_thead,
 #endif
@@ -45,6 +49,12 @@ bool __riscv_isa_vendor_extension_available(int cpu, unsigned long vendor, unsig
 		cpu_bmap = riscv_isa_vendor_ext_list_andes.per_hart_isa_bitmap;
 		break;
 	#endif
+	#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE
+	case SIFIVE_VENDOR_ID:
+		bmap = &riscv_isa_vendor_ext_list_sifive.all_harts_isa_bitmap;
+		cpu_bmap = riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap;
+		break;
+	#endif
 	#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_THEAD
 	case THEAD_VENDOR_ID:
 		bmap = &riscv_isa_vendor_ext_list_thead.all_harts_isa_bitmap;
diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile
index 866414c81a9f..d5fdde0e863b 100644
--- a/arch/riscv/kernel/vendor_extensions/Makefile
+++ b/arch/riscv/kernel/vendor_extensions/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES)	+= andes.o
+obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE)	+= sifive.o
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD)	+= thead.o
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD)	+= thead_hwprobe.o
diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c
new file mode 100644
index 000000000000..990ac83b1f81
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions/sifive.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/cpufeature.h>
+#include <asm/vendor_extensions.h>
+#include <asm/vendor_extensions/sifive.h>
+
+#include <linux/array_size.h>
+#include <linux/types.h>
+
+/* All SiFive vendor extensions supported in Linux */
+const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = {
+	__RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD),
+	__RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ),
+};
+
+struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive = {
+	.ext_data_count = ARRAY_SIZE(riscv_isa_vendor_ext_sifive),
+	.ext_data = riscv_isa_vendor_ext_sifive,
+};
-- 
cgit v1.2.3


From e8fd215ed0eb814486d50b4835007cbc50b2c2b7 Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:30 +0800
Subject: riscv: hwprobe: Document SiFive xsfvqmaccdod and xsfvqmaccqoq vendor
 extensions

Document the support for sifive vendor extensions using the key
RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0 and two vendor extensions for SiFive
Int8 Matrix Multiplication Instructions using
RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD and
RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ.

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-4-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/hwprobe.h      | 2 +-
 arch/riscv/include/uapi/asm/hwprobe.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 1f690fea0e03..1c6977305776 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -8,7 +8,7 @@
 
 #include <uapi/asm/hwprobe.h>
 
-#define RISCV_HWPROBE_MAX_KEY 12
+#define RISCV_HWPROBE_MAX_KEY 13
 
 static inline bool riscv_hwprobe_key_is_valid(__s64 key)
 {
diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index 3c2fce939673..9c70101f021b 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -104,6 +104,7 @@ struct riscv_hwprobe {
 #define		RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED	4
 #define RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0	11
 #define RISCV_HWPROBE_KEY_ZICBOM_BLOCK_SIZE	12
+#define RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0	13
 /* Increase RISCV_HWPROBE_MAX_KEY when adding items. */
 
 /* Flags */
-- 
cgit v1.2.3


From 1a6274f035346e76835d46096136dd3e6cca9575 Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:31 +0800
Subject: riscv: hwprobe: Add SiFive vendor extension support and probe for
 xsfqmaccdod and xsfqmaccqoq

Add a new hwprobe key "RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0" which allows
userspace to probe for the new vendor extensions from SiFive. Also, add
new hwprobe for SiFive "xsfvqmaccdod" and "xsfvqmaccqoq" vendor
extensions.

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-5-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/hwprobe.h                     |  1 +
 .../include/asm/vendor_extensions/sifive_hwprobe.h   | 19 +++++++++++++++++++
 arch/riscv/include/uapi/asm/vendor/sifive.h          |  4 ++++
 arch/riscv/kernel/sys_hwprobe.c                      |  5 +++++
 arch/riscv/kernel/vendor_extensions/Makefile         |  1 +
 arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c | 20 ++++++++++++++++++++
 6 files changed, 50 insertions(+)
 create mode 100644 arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h
 create mode 100644 arch/riscv/include/uapi/asm/vendor/sifive.h
 create mode 100644 arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c

(limited to 'arch')

diff --git a/arch/riscv/include/asm/hwprobe.h b/arch/riscv/include/asm/hwprobe.h
index 1c6977305776..7fe0a379474a 100644
--- a/arch/riscv/include/asm/hwprobe.h
+++ b/arch/riscv/include/asm/hwprobe.h
@@ -22,6 +22,7 @@ static inline bool hwprobe_key_is_bitmask(__s64 key)
 	case RISCV_HWPROBE_KEY_IMA_EXT_0:
 	case RISCV_HWPROBE_KEY_CPUPERF_0:
 	case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0:
+	case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0:
 		return true;
 	}
 
diff --git a/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h b/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h
new file mode 100644
index 000000000000..90a61abd033c
--- /dev/null
+++ b/arch/riscv/include/asm/vendor_extensions/sifive_hwprobe.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_HWPROBE_H
+#define _ASM_RISCV_VENDOR_EXTENSIONS_SIFIVE_HWPROBE_H
+
+#include <linux/cpumask.h>
+
+#include <uapi/asm/hwprobe.h>
+
+#ifdef CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE
+void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus);
+#else
+static inline void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair,
+						   const struct cpumask *cpus)
+{
+	pair->value = 0;
+}
+#endif
+
+#endif
diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h
new file mode 100644
index 000000000000..f25d8cf110d1
--- /dev/null
+++ b/arch/riscv/include/uapi/asm/vendor/sifive.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#define	RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD		(1 << 0)
+#define	RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ		(1 << 1)
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 249aec8594a9..138e74f05de7 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -15,6 +15,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/vector.h>
+#include <asm/vendor_extensions/sifive_hwprobe.h>
 #include <asm/vendor_extensions/thead_hwprobe.h>
 #include <vdso/vsyscall.h>
 
@@ -300,6 +301,10 @@ static void hwprobe_one_pair(struct riscv_hwprobe *pair,
 		pair->value = riscv_timebase;
 		break;
 
+	case RISCV_HWPROBE_KEY_VENDOR_EXT_SIFIVE_0:
+		hwprobe_isa_vendor_ext_sifive_0(pair, cpus);
+		break;
+
 	case RISCV_HWPROBE_KEY_VENDOR_EXT_THEAD_0:
 		hwprobe_isa_vendor_ext_thead_0(pair, cpus);
 		break;
diff --git a/arch/riscv/kernel/vendor_extensions/Makefile b/arch/riscv/kernel/vendor_extensions/Makefile
index d5fdde0e863b..a4eca96d1c8a 100644
--- a/arch/riscv/kernel/vendor_extensions/Makefile
+++ b/arch/riscv/kernel/vendor_extensions/Makefile
@@ -2,5 +2,6 @@
 
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_ANDES)	+= andes.o
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE)	+= sifive.o
+obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_SIFIVE)	+= sifive_hwprobe.o
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD)	+= thead.o
 obj-$(CONFIG_RISCV_ISA_VENDOR_EXT_THEAD)	+= thead_hwprobe.o
diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
new file mode 100644
index 000000000000..461ce0f305ce
--- /dev/null
+++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <asm/vendor_extensions/sifive.h>
+#include <asm/vendor_extensions/sifive_hwprobe.h>
+#include <asm/vendor_extensions/vendor_hwprobe.h>
+
+#include <linux/cpumask.h>
+#include <linux/types.h>
+
+#include <uapi/asm/hwprobe.h>
+#include <uapi/asm/vendor/sifive.h>
+
+void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cpumask *cpus)
+{
+	VENDOR_EXTENSION_SUPPORTED(pair, cpus,
+				   riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap, {
+		VENDOR_EXT_KEY(XSFVQMACCDOD);
+		VENDOR_EXT_KEY(XSFVQMACCQOQ);
+	});
+}
-- 
cgit v1.2.3


From e84fffe21b7498ff50aed3a96773993d04cfaed0 Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:33 +0800
Subject: riscv: Add SiFive xsfvfnrclipxfqf vendor extension

Add SiFive vendor extension "xsfvfnrclipxfqf" support to the kernel.

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-7-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/vendor_extensions/sifive.h | 1 +
 arch/riscv/kernel/vendor_extensions/sifive.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h
index 608004250e2e..2d05e3e73170 100644
--- a/arch/riscv/include/asm/vendor_extensions/sifive.h
+++ b/arch/riscv/include/asm/vendor_extensions/sifive.h
@@ -8,6 +8,7 @@
 
 #define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD		0
 #define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ		1
+#define RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF		2
 
 extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive;
 
diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c
index 990ac83b1f81..077315e5b2d7 100644
--- a/arch/riscv/kernel/vendor_extensions/sifive.c
+++ b/arch/riscv/kernel/vendor_extensions/sifive.c
@@ -9,6 +9,7 @@
 
 /* All SiFive vendor extensions supported in Linux */
 const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = {
+	__RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF),
 	__RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD),
 	__RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ),
 };
-- 
cgit v1.2.3


From 1d91224394c92245942c402245370c4abb0fcbfb Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:35 +0800
Subject: riscv: hwprobe: Add SiFive xsfvfnrclipxfqf vendor extension

Add hwprobe for SiFive "xsfvfnrclipxfqf" vendor extension.

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-9-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/uapi/asm/vendor/sifive.h          | 1 +
 arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h
index f25d8cf110d1..b772d4631284 100644
--- a/arch/riscv/include/uapi/asm/vendor/sifive.h
+++ b/arch/riscv/include/uapi/asm/vendor/sifive.h
@@ -2,3 +2,4 @@
 
 #define	RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD		(1 << 0)
 #define	RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ		(1 << 1)
+#define	RISCV_HWPROBE_VENDOR_EXT_XSFVFNRCLIPXFQF		(1 << 2)
diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
index 461ce0f305ce..2b9505079a9f 100644
--- a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
+++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
@@ -16,5 +16,6 @@ void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cp
 				   riscv_isa_vendor_ext_list_sifive.per_hart_isa_bitmap, {
 		VENDOR_EXT_KEY(XSFVQMACCDOD);
 		VENDOR_EXT_KEY(XSFVQMACCQOQ);
+		VENDOR_EXT_KEY(XSFVFNRCLIPXFQF);
 	});
 }
-- 
cgit v1.2.3


From 34e9b16b4b888988730ffab9a9039cfcf305942e Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:37 +0800
Subject: riscv: Add SiFive xsfvfwmaccqqq vendor extension

Add SiFive vendor extension "xsfvfwmaccqqq" support to the kernel.

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-11-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/asm/vendor_extensions/sifive.h | 1 +
 arch/riscv/kernel/vendor_extensions/sifive.c      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/vendor_extensions/sifive.h b/arch/riscv/include/asm/vendor_extensions/sifive.h
index 2d05e3e73170..ac00e500361c 100644
--- a/arch/riscv/include/asm/vendor_extensions/sifive.h
+++ b/arch/riscv/include/asm/vendor_extensions/sifive.h
@@ -9,6 +9,7 @@
 #define RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD		0
 #define RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ		1
 #define RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF		2
+#define RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ		3
 
 extern struct riscv_isa_vendor_ext_data_list riscv_isa_vendor_ext_list_sifive;
 
diff --git a/arch/riscv/kernel/vendor_extensions/sifive.c b/arch/riscv/kernel/vendor_extensions/sifive.c
index 077315e5b2d7..1411337dc1e6 100644
--- a/arch/riscv/kernel/vendor_extensions/sifive.c
+++ b/arch/riscv/kernel/vendor_extensions/sifive.c
@@ -10,6 +10,7 @@
 /* All SiFive vendor extensions supported in Linux */
 const struct riscv_isa_ext_data riscv_isa_vendor_ext_sifive[] = {
 	__RISCV_ISA_EXT_DATA(xsfvfnrclipxfqf, RISCV_ISA_VENDOR_EXT_XSFVFNRCLIPXFQF),
+	__RISCV_ISA_EXT_DATA(xsfvfwmaccqqq, RISCV_ISA_VENDOR_EXT_XSFVFWMACCQQQ),
 	__RISCV_ISA_EXT_DATA(xsfvqmaccdod, RISCV_ISA_VENDOR_EXT_XSFVQMACCDOD),
 	__RISCV_ISA_EXT_DATA(xsfvqmaccqoq, RISCV_ISA_VENDOR_EXT_XSFVQMACCQOQ),
 };
-- 
cgit v1.2.3


From d9669e33c8fadb5f81287f4961f01e40c0a11c23 Mon Sep 17 00:00:00 2001
From: Cyan Yang <cyan.yang@sifive.com>
Date: Fri, 18 Apr 2025 13:32:39 +0800
Subject: riscv: hwprobe: Add SiFive xsfvfwmaccqqq vendor extension

Add hwprobe for SiFive "xsfvfwmaccqqq" vendor extension.

Signed-off-by: Cyan Yang <cyan.yang@sifive.com>
Link: https://lore.kernel.org/r/20250418053239.4351-13-cyan.yang@sifive.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
---
 arch/riscv/include/uapi/asm/vendor/sifive.h          | 1 +
 arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/uapi/asm/vendor/sifive.h b/arch/riscv/include/uapi/asm/vendor/sifive.h
index b772d4631284..9f3278a4b298 100644
--- a/arch/riscv/include/uapi/asm/vendor/sifive.h
+++ b/arch/riscv/include/uapi/asm/vendor/sifive.h
@@ -3,3 +3,4 @@
 #define	RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCDOD		(1 << 0)
 #define	RISCV_HWPROBE_VENDOR_EXT_XSFVQMACCQOQ		(1 << 1)
 #define	RISCV_HWPROBE_VENDOR_EXT_XSFVFNRCLIPXFQF		(1 << 2)
+#define	RISCV_HWPROBE_VENDOR_EXT_XSFVFWMACCQQQ		(1 << 3)
diff --git a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
index 2b9505079a9f..1f77f6309763 100644
--- a/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
+++ b/arch/riscv/kernel/vendor_extensions/sifive_hwprobe.c
@@ -17,5 +17,6 @@ void hwprobe_isa_vendor_ext_sifive_0(struct riscv_hwprobe *pair, const struct cp
 		VENDOR_EXT_KEY(XSFVQMACCDOD);
 		VENDOR_EXT_KEY(XSFVQMACCQOQ);
 		VENDOR_EXT_KEY(XSFVFNRCLIPXFQF);
+		VENDOR_EXT_KEY(XSFVFWMACCQQQ);
 	});
 }
-- 
cgit v1.2.3


From 7d33e0ee5f98b3fc74566fa00d0926bc5deb8174 Mon Sep 17 00:00:00 2001
From: Fabrizio Castro <fabrizio.castro.jz@renesas.com>
Date: Wed, 23 Apr 2025 15:34:22 +0100
Subject: arm64: dts: renesas: r9a09g057: Add DMAC nodes

Add nodes for the DMAC IPs found on the Renesas RZ/V2H(P) SoC.

Signed-off-by: Fabrizio Castro <fabrizio.castro.jz@renesas.com>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@bp.renesas.com>
Link: https://lore.kernel.org/r/20250423143422.3747702-7-fabrizio.castro.jz@renesas.com
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 arch/arm64/boot/dts/renesas/r9a09g057.dtsi | 165 +++++++++++++++++++++++++++++
 1 file changed, 165 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/boot/dts/renesas/r9a09g057.dtsi b/arch/arm64/boot/dts/renesas/r9a09g057.dtsi
index 0cd00bb05191..2689a3a6af85 100644
--- a/arch/arm64/boot/dts/renesas/r9a09g057.dtsi
+++ b/arch/arm64/boot/dts/renesas/r9a09g057.dtsi
@@ -280,6 +280,171 @@
 			resets = <&cpg 0x30>;
 		};
 
+		dmac0: dma-controller@11400000 {
+			compatible = "renesas,r9a09g057-dmac";
+			reg = <0 0x11400000 0 0x10000>;
+			interrupts = <GIC_SPI 499 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 89  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 90  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 91  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 92  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 93  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 94  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 95  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 96  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 97  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 98  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 99  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 100 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 101 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 102 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 103 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 104 IRQ_TYPE_EDGE_RISING>;
+			interrupt-names = "error",
+					  "ch0", "ch1", "ch2", "ch3",
+					  "ch4", "ch5", "ch6", "ch7",
+					  "ch8", "ch9", "ch10", "ch11",
+					  "ch12", "ch13", "ch14", "ch15";
+			clocks = <&cpg CPG_MOD 0x0>;
+			power-domains = <&cpg>;
+			resets = <&cpg 0x31>;
+			#dma-cells = <1>;
+			dma-channels = <16>;
+			renesas,icu = <&icu 4>;
+		};
+
+		dmac1: dma-controller@14830000 {
+			compatible = "renesas,r9a09g057-dmac";
+			reg = <0 0x14830000 0 0x10000>;
+			interrupts = <GIC_SPI 495 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 25  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 26  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 27  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 28  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 29  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 30  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 31  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 32  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 33  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 34  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 35  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 36  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 37  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 38  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 39  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 40  IRQ_TYPE_EDGE_RISING>;
+			interrupt-names = "error",
+					  "ch0", "ch1", "ch2", "ch3",
+					  "ch4", "ch5", "ch6", "ch7",
+					  "ch8", "ch9", "ch10", "ch11",
+					  "ch12", "ch13", "ch14", "ch15";
+			clocks = <&cpg CPG_MOD 0x1>;
+			power-domains = <&cpg>;
+			resets = <&cpg 0x32>;
+			#dma-cells = <1>;
+			dma-channels = <16>;
+			renesas,icu = <&icu 0>;
+		};
+
+		dmac2: dma-controller@14840000 {
+			compatible = "renesas,r9a09g057-dmac";
+			reg = <0 0x14840000 0 0x10000>;
+			interrupts = <GIC_SPI 496 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 41  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 42  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 43  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 44  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 45  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 46  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 47  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 48  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 49  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 50  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 51  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 52  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 53  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 54  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 55  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 56  IRQ_TYPE_EDGE_RISING>;
+			interrupt-names = "error",
+					  "ch0", "ch1", "ch2", "ch3",
+					  "ch4", "ch5", "ch6", "ch7",
+					  "ch8", "ch9", "ch10", "ch11",
+					  "ch12", "ch13", "ch14", "ch15";
+			clocks = <&cpg CPG_MOD 0x2>;
+			power-domains = <&cpg>;
+			resets = <&cpg 0x33>;
+			#dma-cells = <1>;
+			dma-channels = <16>;
+			renesas,icu = <&icu 1>;
+		};
+
+		dmac3: dma-controller@12000000 {
+			compatible = "renesas,r9a09g057-dmac";
+			reg = <0 0x12000000 0 0x10000>;
+			interrupts = <GIC_SPI 497 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 57  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 58  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 59  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 60  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 61  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 62  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 63  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 64  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 65  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 66  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 67  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 68  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 69  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 70  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 71  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 72  IRQ_TYPE_EDGE_RISING>;
+			interrupt-names = "error",
+					  "ch0", "ch1", "ch2", "ch3",
+					  "ch4", "ch5", "ch6", "ch7",
+					  "ch8", "ch9", "ch10", "ch11",
+					  "ch12", "ch13", "ch14", "ch15";
+			clocks = <&cpg CPG_MOD 0x3>;
+			power-domains = <&cpg>;
+			resets = <&cpg 0x34>;
+			#dma-cells = <1>;
+			dma-channels = <16>;
+			renesas,icu = <&icu 2>;
+		};
+
+		dmac4: dma-controller@12010000 {
+			compatible = "renesas,r9a09g057-dmac";
+			reg = <0 0x12010000 0 0x10000>;
+			interrupts = <GIC_SPI 498 IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 73  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 74  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 75  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 76  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 77  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 78  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 79  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 80  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 81  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 82  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 83  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 84  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 85  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 86  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 87  IRQ_TYPE_EDGE_RISING>,
+				     <GIC_SPI 88  IRQ_TYPE_EDGE_RISING>;
+			interrupt-names = "error",
+					  "ch0", "ch1", "ch2", "ch3",
+					  "ch4", "ch5", "ch6", "ch7",
+					  "ch8", "ch9", "ch10", "ch11",
+					  "ch12", "ch13", "ch14", "ch15";
+			clocks = <&cpg CPG_MOD 0x4>;
+			power-domains = <&cpg>;
+			resets = <&cpg 0x35>;
+			#dma-cells = <1>;
+			dma-channels = <16>;
+			renesas,icu = <&icu 3>;
+		};
+
 		ostm0: timer@11800000 {
 			compatible = "renesas,r9a09g057-ostm", "renesas,ostm";
 			reg = <0x0 0x11800000 0x0 0x1000>;
-- 
cgit v1.2.3


From 4cc7543eb494daae7d6282e17459e1b06eff82aa Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Mon, 5 May 2025 14:57:58 +0200
Subject: MIPS: SMP: Move the AP sync point before the non-parallel aware
 functions

When CONFIG_HOTPLUG_PARALLEL is enabled, the code executing before
cpuhp_ap_sync_alive() is executed in parallel, while after it is
serialized. The functions set_cpu_sibling_map() and set_cpu_core_map()
were not designed to be executed in parallel, so by moving the
cpuhp_ap_sync_alive() before cpuhp_ap_sync_alive(), we then ensure
they will be called serialized.

The measurement done on EyeQ5 did not show any relevant boot time
increase after applying this patch.

Fixes: 76c43eb507bc ("MIPS: SMP: Implement parallel CPU bring up for EyeQ")
Reported-by: Huacai Chen <chenhuacai@kernel.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/smp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 1726744f2b2e..7901b59d8f60 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -374,13 +374,13 @@ asmlinkage void start_secondary(void)
 	calibrate_delay();
 	cpu_data[cpu].udelay_val = loops_per_jiffy;
 
+#ifdef CONFIG_HOTPLUG_PARALLEL
+	cpuhp_ap_sync_alive();
+#endif
 	set_cpu_sibling_map(cpu);
 	set_cpu_core_map(cpu);
 
 	cpumask_set_cpu(cpu, &cpu_coherent_mask);
-#ifdef CONFIG_HOTPLUG_PARALLEL
-	cpuhp_ap_sync_alive();
-#endif
 	notify_cpu_starting(cpu);
 
 #ifndef CONFIG_HOTPLUG_PARALLEL
-- 
cgit v1.2.3


From 3590692a136d75e39cd67b0f23e032669fcdbcd2 Mon Sep 17 00:00:00 2001
From: Charan Pedumuru <charan.pedumuru@gmail.com>
Date: Wed, 7 May 2025 06:29:35 +0000
Subject: mips: dts: pic32: pic32mzda: Rename the sdhci nodename to match with
 common mmc-controller binding

Rename the sdhci nodename from "sdhci@" to "mmc@" to align with
linux common mmc-controller binding.

Signed-off-by: Charan Pedumuru <charan.pedumuru@gmail.com>
Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/boot/dts/pic32/pic32mzda.dtsi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/mips/boot/dts/pic32/pic32mzda.dtsi b/arch/mips/boot/dts/pic32/pic32mzda.dtsi
index fdc721b414a8..feca35ba56a4 100644
--- a/arch/mips/boot/dts/pic32/pic32mzda.dtsi
+++ b/arch/mips/boot/dts/pic32/pic32mzda.dtsi
@@ -225,7 +225,7 @@
 		gpio-ranges = <&pic32_pinctrl 0 144 16>;
 	};
 
-	sdhci: sdhci@1f8ec000 {
+	sdhci: mmc@1f8ec000 {
 		compatible = "microchip,pic32mzda-sdhci";
 		reg = <0x1f8ec000 0x100>;
 		interrupts = <191 IRQ_TYPE_LEVEL_HIGH>;
-- 
cgit v1.2.3


From 35fb26f94dfa1b291086b84b2421f957214824d1 Mon Sep 17 00:00:00 2001
From: Caleb James DeLisle <cjd@cjdns.fr>
Date: Wed, 7 May 2025 13:44:57 +0000
Subject: mips: Add EcoNet MIPS platform support

Add platform support for EcoNet MIPS SoCs.

Signed-off-by: Caleb James DeLisle <cjd@cjdns.fr>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/Kbuild.platforms             |  1 +
 arch/mips/Kconfig                      | 25 +++++++++++
 arch/mips/boot/compressed/uart-16550.c |  5 +++
 arch/mips/econet/Kconfig               | 37 ++++++++++++++++
 arch/mips/econet/Makefile              |  2 +
 arch/mips/econet/Platform              |  5 +++
 arch/mips/econet/init.c                | 78 ++++++++++++++++++++++++++++++++++
 7 files changed, 153 insertions(+)
 create mode 100644 arch/mips/econet/Kconfig
 create mode 100644 arch/mips/econet/Makefile
 create mode 100644 arch/mips/econet/Platform
 create mode 100644 arch/mips/econet/init.c

(limited to 'arch')

diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms
index bca37ddf974b..41a00fa860c1 100644
--- a/arch/mips/Kbuild.platforms
+++ b/arch/mips/Kbuild.platforms
@@ -11,6 +11,7 @@ platform-$(CONFIG_CAVIUM_OCTEON_SOC)	+= cavium-octeon/
 platform-$(CONFIG_EYEQ)			+= mobileye/
 platform-$(CONFIG_MIPS_COBALT)		+= cobalt/
 platform-$(CONFIG_MACH_DECSTATION)	+= dec/
+platform-$(CONFIG_ECONET)		+= econet/
 platform-$(CONFIG_MIPS_GENERIC)		+= generic/
 platform-$(CONFIG_MACH_JAZZ)		+= jazz/
 platform-$(CONFIG_LANTIQ)		+= lantiq/
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index e0e6ce2592b4..c3dbdc808664 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -391,6 +391,30 @@ config MACH_DECSTATION
 
 	  otherwise choose R3000.
 
+config ECONET
+	bool "EcoNet MIPS family"
+	select BOOT_RAW
+	select CPU_BIG_ENDIAN
+	select DEBUG_ZBOOT
+	select EARLY_PRINTK_8250
+	select ECONET_EN751221_TIMER
+	select SERIAL_OF_PLATFORM
+	select SYS_SUPPORTS_BIG_ENDIAN
+	select SYS_HAS_CPU_MIPS32_R1
+	select SYS_HAS_CPU_MIPS32_R2
+	select SYS_HAS_EARLY_PRINTK
+	select SYS_SUPPORTS_32BIT_KERNEL
+	select SYS_SUPPORTS_MIPS16
+	select SYS_SUPPORTS_ZBOOT_UART16550
+	select USE_GENERIC_EARLY_PRINTK_8250
+	select USE_OF
+	help
+	  EcoNet EN75xx MIPS devices are big endian MIPS machines used
+	  in XPON (fiber) and DSL applications. They have SPI, PCI, USB,
+	  GPIO, and Ethernet, with optional XPON, DSL, and VoIP DSP cores.
+	  Don't confuse these with the Airoha ARM devices sometimes referred
+	  to as "EcoNet", this family is for MIPS based devices only.
+
 config MACH_JAZZ
 	bool "Jazz family of machines"
 	select ARC_MEMORY
@@ -1021,6 +1045,7 @@ source "arch/mips/ath79/Kconfig"
 source "arch/mips/bcm47xx/Kconfig"
 source "arch/mips/bcm63xx/Kconfig"
 source "arch/mips/bmips/Kconfig"
+source "arch/mips/econet/Kconfig"
 source "arch/mips/generic/Kconfig"
 source "arch/mips/ingenic/Kconfig"
 source "arch/mips/jazz/Kconfig"
diff --git a/arch/mips/boot/compressed/uart-16550.c b/arch/mips/boot/compressed/uart-16550.c
index db618e72a0c4..529e77a6487c 100644
--- a/arch/mips/boot/compressed/uart-16550.c
+++ b/arch/mips/boot/compressed/uart-16550.c
@@ -20,6 +20,11 @@
 #define PORT(offset) (CKSEG1ADDR(INGENIC_UART_BASE_ADDR) + (4 * offset))
 #endif
 
+#ifdef CONFIG_ECONET
+#define EN75_UART_BASE	0x1fbf0003
+#define PORT(offset)	(CKSEG1ADDR(EN75_UART_BASE) + (4 * (offset)))
+#endif
+
 #ifndef IOTYPE
 #define IOTYPE char
 #endif
diff --git a/arch/mips/econet/Kconfig b/arch/mips/econet/Kconfig
new file mode 100644
index 000000000000..d03f90f3daa4
--- /dev/null
+++ b/arch/mips/econet/Kconfig
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+if ECONET
+
+choice
+	prompt "EcoNet SoC selection"
+	default SOC_ECONET_EN751221
+	help
+	  Select EcoNet MIPS SoC type. Individual SoCs within a family are
+	  very similar, so is it enough to select the right family, and
+	  then customize to the specific SoC using the device tree only.
+
+	config SOC_ECONET_EN751221
+		bool "EN751221 family"
+		select COMMON_CLK
+		select ECONET_EN751221_INTC
+		select IRQ_MIPS_CPU
+		select SMP
+		select SMP_UP
+		select SYS_SUPPORTS_SMP
+		help
+		  The EN751221 family includes EN7512, RN7513, EN7521, EN7526.
+		  They are based on single core MIPS 34Kc processors. To boot
+		  this kernel, you will need a device tree such as
+		  MIPS_RAW_APPENDED_DTB=y, and a root filesystem.
+endchoice
+
+choice
+	prompt "Devicetree selection"
+	default DTB_ECONET_NONE
+	help
+	  Select the devicetree.
+
+	config DTB_ECONET_NONE
+		bool "None"
+endchoice
+
+endif
diff --git a/arch/mips/econet/Makefile b/arch/mips/econet/Makefile
new file mode 100644
index 000000000000..7e4529e7d3d7
--- /dev/null
+++ b/arch/mips/econet/Makefile
@@ -0,0 +1,2 @@
+
+obj-y := init.o
diff --git a/arch/mips/econet/Platform b/arch/mips/econet/Platform
new file mode 100644
index 000000000000..ea5616447bcd
--- /dev/null
+++ b/arch/mips/econet/Platform
@@ -0,0 +1,5 @@
+# To address a 7.2MB kernel size limit in the EcoNet SDK bootloader,
+# we put the load address well above where the bootloader loads and then use
+# zboot. So please set CONFIG_ZBOOT_LOAD_ADDRESS to the address where your
+# bootloader actually places the kernel.
+load-$(CONFIG_ECONET)	+= 0xffffffff81000000
diff --git a/arch/mips/econet/init.c b/arch/mips/econet/init.c
new file mode 100644
index 000000000000..6f43ffb209cb
--- /dev/null
+++ b/arch/mips/econet/init.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * EcoNet setup code
+ *
+ * Copyright (C) 2025 Caleb James DeLisle <cjd@cjdns.fr>
+ */
+
+#include <linux/init.h>
+#include <linux/of_clk.h>
+#include <linux/irqchip.h>
+
+#include <asm/addrspace.h>
+#include <asm/io.h>
+#include <asm/bootinfo.h>
+#include <asm/time.h>
+#include <asm/prom.h>
+#include <asm/smp-ops.h>
+#include <asm/reboot.h>
+
+#define CR_AHB_RSTCR		((void __iomem *)CKSEG1ADDR(0x1fb00040))
+#define RESET			BIT(31)
+
+#define UART_BASE		CKSEG1ADDR(0x1fbf0003)
+#define UART_REG_SHIFT		2
+
+static void hw_reset(char *command)
+{
+	iowrite32(RESET, CR_AHB_RSTCR);
+}
+
+/* 1. Bring up early printk. */
+void __init prom_init(void)
+{
+	setup_8250_early_printk_port(UART_BASE, UART_REG_SHIFT, 0);
+	_machine_restart = hw_reset;
+}
+
+/* 2. Parse the DT and find memory */
+void __init plat_mem_setup(void)
+{
+	void *dtb;
+
+	set_io_port_base(KSEG1);
+
+	dtb = get_fdt();
+	if (!dtb)
+		panic("no dtb found");
+
+	__dt_setup_arch(dtb);
+
+	early_init_dt_scan_memory();
+}
+
+/* 3. Overload __weak device_tree_init(), add SMP_UP ops */
+void __init device_tree_init(void)
+{
+	unflatten_and_copy_device_tree();
+
+	register_up_smp_ops();
+}
+
+const char *get_system_type(void)
+{
+	return "EcoNet-EN75xx";
+}
+
+/* 4. Initialize the IRQ subsystem */
+void __init arch_init_irq(void)
+{
+	irqchip_init();
+}
+
+/* 5. Timers */
+void __init plat_time_init(void)
+{
+	of_clk_init(NULL);
+	timer_probe();
+}
-- 
cgit v1.2.3


From 0ec4887009729297f7c10368084e41a8a9fbbd0e Mon Sep 17 00:00:00 2001
From: Caleb James DeLisle <cjd@cjdns.fr>
Date: Wed, 7 May 2025 13:44:59 +0000
Subject: mips: dts: Add EcoNet DTS with EN751221 and SmartFiber XP8421-B board

Add DTS files in support of EcoNet platform, including SmartFiber XP8421-B,
a low cost commercially available board based on EN751221.

Signed-off-by: Caleb James DeLisle <cjd@cjdns.fr>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/boot/dts/Makefile                        |  1 +
 arch/mips/boot/dts/econet/Makefile                 |  2 +
 arch/mips/boot/dts/econet/en751221.dtsi            | 67 ++++++++++++++++++++++
 .../dts/econet/en751221_smartfiber_xp8421-b.dts    | 19 ++++++
 arch/mips/econet/Kconfig                           | 11 ++++
 5 files changed, 100 insertions(+)
 create mode 100644 arch/mips/boot/dts/econet/Makefile
 create mode 100644 arch/mips/boot/dts/econet/en751221.dtsi
 create mode 100644 arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts

(limited to 'arch')

diff --git a/arch/mips/boot/dts/Makefile b/arch/mips/boot/dts/Makefile
index ff468439a8c4..7375c6ced82b 100644
--- a/arch/mips/boot/dts/Makefile
+++ b/arch/mips/boot/dts/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 subdir-$(CONFIG_BMIPS_GENERIC)		+= brcm
 subdir-$(CONFIG_CAVIUM_OCTEON_SOC)	+= cavium-octeon
+subdir-$(CONFIG_ECONET)			+= econet
 subdir-$(CONFIG_EYEQ)			+= mobileye
 subdir-$(CONFIG_FIT_IMAGE_FDT_MARDUK)   += img
 subdir-$(CONFIG_FIT_IMAGE_FDT_BOSTON)	+= img
diff --git a/arch/mips/boot/dts/econet/Makefile b/arch/mips/boot/dts/econet/Makefile
new file mode 100644
index 000000000000..b467d5624e39
--- /dev/null
+++ b/arch/mips/boot/dts/econet/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+dtb-$(CONFIG_DTB_ECONET_SMARTFIBER_XP8421_B)	+= en751221_smartfiber_xp8421-b.dtb
diff --git a/arch/mips/boot/dts/econet/en751221.dtsi b/arch/mips/boot/dts/econet/en751221.dtsi
new file mode 100644
index 000000000000..66197e73d4f0
--- /dev/null
+++ b/arch/mips/boot/dts/econet/en751221.dtsi
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/dts-v1/;
+
+/ {
+	compatible = "econet,en751221";
+	#address-cells = <1>;
+	#size-cells = <1>;
+
+	hpt_clock: clock {
+		compatible = "fixed-clock";
+		#clock-cells = <0>;
+		clock-frequency = <200000000>;  /* 200 MHz */
+	};
+
+	cpus: cpus {
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		cpu@0 {
+			device_type = "cpu";
+			compatible = "mips,mips24KEc";
+			reg = <0>;
+		};
+	};
+
+	cpuintc: interrupt-controller {
+		compatible = "mti,cpu-interrupt-controller";
+		interrupt-controller;
+		#address-cells = <0>;
+		#interrupt-cells = <1>;
+	};
+
+	intc: interrupt-controller@1fb40000 {
+		compatible = "econet,en751221-intc";
+		reg = <0x1fb40000 0x100>;
+		interrupt-parent = <&cpuintc>;
+		interrupts = <2>;
+
+		interrupt-controller;
+		#interrupt-cells = <1>;
+		econet,shadow-interrupts = <7 2>, <8 3>, <13 12>, <30 29>;
+	};
+
+	uart: serial@1fbf0000 {
+		compatible = "ns16550";
+		reg = <0x1fbf0000 0x30>;
+		reg-io-width = <4>;
+		reg-shift = <2>;
+		interrupt-parent = <&intc>;
+		interrupts = <0>;
+		/*
+		 * Conversion of baud rate to clock frequency requires a
+		 * computation that is not in the ns16550 driver, so this
+		 * uart is fixed at 115200 baud.
+		 */
+		clock-frequency = <1843200>;
+	};
+
+	timer_hpt: timer@1fbf0400 {
+		compatible = "econet,en751221-timer";
+		reg = <0x1fbf0400 0x100>;
+
+		interrupt-parent = <&intc>;
+		interrupts = <30>;
+		clocks = <&hpt_clock>;
+	};
+};
diff --git a/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts b/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts
new file mode 100644
index 000000000000..8223c5bce67f
--- /dev/null
+++ b/arch/mips/boot/dts/econet/en751221_smartfiber_xp8421-b.dts
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/dts-v1/;
+
+#include "en751221.dtsi"
+
+/ {
+	model = "SmartFiber XP8421-B";
+	compatible = "smartfiber,xp8421-b", "econet,en751221";
+
+	memory@0 {
+		device_type = "memory";
+		reg = <0x00000000 0x1c000000>;
+	};
+
+	chosen {
+		stdout-path = "/serial@1fbf0000:115200";
+		linux,usable-memory-range = <0x00020000 0x1bfe0000>;
+	};
+};
diff --git a/arch/mips/econet/Kconfig b/arch/mips/econet/Kconfig
index d03f90f3daa4..fd69884cc9a8 100644
--- a/arch/mips/econet/Kconfig
+++ b/arch/mips/econet/Kconfig
@@ -32,6 +32,17 @@ choice
 
 	config DTB_ECONET_NONE
 		bool "None"
+
+	config DTB_ECONET_SMARTFIBER_XP8421_B
+		bool "EN751221 SmartFiber XP8421-B"
+		depends on SOC_ECONET_EN751221
+		select BUILTIN_DTB
+		help
+		  The SmartFiber XP8421-B is a device based on the EN751221 SoC.
+		  It has 512MB of memory and 256MB of NAND flash. This kernel
+		  needs only an appended initramfs to boot. It can be loaded
+		  through XMODEM and booted from memory in the bootloader, or
+		  it can be packed in tclinux.trx format and written to flash.
 endchoice
 
 endif
-- 
cgit v1.2.3


From 680ef57915db0f715e99a683c15f0f9f7c700e64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Draszik?= <andre.draszik@linaro.org>
Date: Wed, 9 Apr 2025 21:37:29 +0100
Subject: mfd: sec: Split into core and transport (i2c) drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As a preparation for adding support for Samsung's S2MPG10, which is
connected via SPEEDY / ACPM rather than I2C, split out (move) all
I2C-specific driver code into its own kernel module, sec-i2c, and
make the existing sec-core module be just the transport-agnostic core
driver kernel module.

At the same time, update all defconfigs that reference the old kconfig
symbol name.

While at it, also update file header comments and module description(s)
to drop references to 'mfd', and update comments to be C-style, not
C++.

Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
Signed-off-by: André Draszik <andre.draszik@linaro.org>
Link: https://lore.kernel.org/r/20250409-s2mpg10-v4-8-d66d5f39b6bf@linaro.org
Signed-off-by: Lee Jones <lee@kernel.org>
---
 arch/arm/configs/exynos_defconfig   | 2 +-
 arch/arm/configs/multi_v7_defconfig | 2 +-
 arch/arm/configs/pxa_defconfig      | 2 +-
 arch/arm64/configs/defconfig        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index 7ad48fdda1da..251f45be6c14 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -167,7 +167,7 @@ CONFIG_MFD_MAX77686=y
 CONFIG_MFD_MAX77693=y
 CONFIG_MFD_MAX8997=y
 CONFIG_MFD_MAX8998=y
-CONFIG_MFD_SEC_CORE=y
+CONFIG_MFD_SEC_I2C=y
 CONFIG_MFD_STMPE=y
 CONFIG_STMPE_I2C=y
 CONFIG_MFD_TPS65090=y
diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig
index ad037c175fdb..7d06ac5369b1 100644
--- a/arch/arm/configs/multi_v7_defconfig
+++ b/arch/arm/configs/multi_v7_defconfig
@@ -612,7 +612,7 @@ CONFIG_MFD_QCOM_RPM=y
 CONFIG_MFD_SPMI_PMIC=y
 CONFIG_MFD_RK8XX_I2C=y
 CONFIG_MFD_RN5T618=y
-CONFIG_MFD_SEC_CORE=y
+CONFIG_MFD_SEC_I2C=y
 CONFIG_MFD_STMPE=y
 CONFIG_MFD_PALMAS=y
 CONFIG_MFD_TPS65090=y
diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig
index de0ac8f521d7..064e79baf20d 100644
--- a/arch/arm/configs/pxa_defconfig
+++ b/arch/arm/configs/pxa_defconfig
@@ -335,7 +335,7 @@ CONFIG_MFD_MAX77693=y
 CONFIG_MFD_MAX8907=m
 CONFIG_EZX_PCAP=y
 CONFIG_UCB1400_CORE=m
-CONFIG_MFD_SEC_CORE=y
+CONFIG_MFD_SEC_I2C=y
 CONFIG_MFD_PALMAS=y
 CONFIG_MFD_TPS65090=y
 CONFIG_MFD_TPS6586X=y
diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 5bb8f09422a2..f021fb29683b 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -769,7 +769,7 @@ CONFIG_MFD_MT6397=y
 CONFIG_MFD_SPMI_PMIC=y
 CONFIG_MFD_RK8XX_I2C=y
 CONFIG_MFD_RK8XX_SPI=y
-CONFIG_MFD_SEC_CORE=y
+CONFIG_MFD_SEC_I2C=y
 CONFIG_MFD_SL28CPLD=y
 CONFIG_RZ_MTU3=y
 CONFIG_MFD_TI_AM335X_TSCADC=m
-- 
cgit v1.2.3


From 46bc169f6f07eca9a7d2346c703ea59bff385e22 Mon Sep 17 00:00:00 2001
From: Johan Hovold <johan+linaro@kernel.org>
Date: Wed, 2 Apr 2025 15:26:34 +0200
Subject: arm64: Kconfig: switch to HAVE_PWRCTRL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HAVE_PWRCTRL symbol has been renamed to reflect the pwrctrl framework
name. Switch to the non-deprecated symbol.

Signed-off-by: Johan Hovold <johan+linaro@kernel.org>
Signed-off-by: Krzysztof Wilczyński <kwilczynski@kernel.org>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Acked-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Link: https://patch.msgid.link/20250402132634.18065-5-johan+linaro@kernel.org
---
 arch/arm64/Kconfig.platforms | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms
index 8b76821f190f..a541bb029aa4 100644
--- a/arch/arm64/Kconfig.platforms
+++ b/arch/arm64/Kconfig.platforms
@@ -269,7 +269,7 @@ config ARCH_QCOM
 	bool "Qualcomm Platforms"
 	select GPIOLIB
 	select PINCTRL
-	select HAVE_PWRCTL if PCI
+	select HAVE_PWRCTRL if PCI
 	help
 	  This enables support for the ARMv8 based Qualcomm chipsets.
 
-- 
cgit v1.2.3


From 79ee1d20e37cd553cc961962fca8107e69a0c293 Mon Sep 17 00:00:00 2001
From: Caleb James DeLisle <cjd@cjdns.fr>
Date: Wed, 21 May 2025 21:33:33 +0000
Subject: mips: econet: Fix incorrect Kconfig dependencies

config ECONET selects SERIAL_OF_PLATFORM and that depends on SERIAL_8250
so we need to select SERIAL_8250 directly.
Also do not enable DEBUG_ZBOOT unless DEBUG_KERNEL is set.

Signed-off-by: Caleb James DeLisle <cjd@cjdns.fr>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202505211654.CBdIsoTq-lkp@intel.com/
Closes: https://lore.kernel.org/oe-kbuild-all/202505211451.WRjyf3a9-lkp@intel.com/
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index c3dbdc808664..1e48184ecf1e 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -395,9 +395,10 @@ config ECONET
 	bool "EcoNet MIPS family"
 	select BOOT_RAW
 	select CPU_BIG_ENDIAN
-	select DEBUG_ZBOOT
+	select DEBUG_ZBOOT if DEBUG_KERNEL
 	select EARLY_PRINTK_8250
 	select ECONET_EN751221_TIMER
+	select SERIAL_8250
 	select SERIAL_OF_PLATFORM
 	select SYS_SUPPORTS_BIG_ENDIAN
 	select SYS_HAS_CPU_MIPS32_R1
-- 
cgit v1.2.3


From ab535361efdf8129dc593f8f2d80b76767c07813 Mon Sep 17 00:00:00 2001
From: Gregory CLEMENT <gregory.clement@bootlin.com>
Date: Fri, 23 May 2025 09:58:15 +0200
Subject: MIPS: SMP: Move the AP sync point before the calibration delay

In the calibration delay process, some resources are shared, so it's
better to move it after the parallel execution part. Thanks to the
patch optimizing CPU delay calibration, this change has no impact on
the boot time improvements gained from CPU parallel boot.

Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/smp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 7901b59d8f60..4868e79f3b30 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -371,12 +371,12 @@ asmlinkage void start_secondary(void)
 	 * to an option instead of something based on .cputype
 	 */
 
-	calibrate_delay();
-	cpu_data[cpu].udelay_val = loops_per_jiffy;
-
 #ifdef CONFIG_HOTPLUG_PARALLEL
 	cpuhp_ap_sync_alive();
 #endif
+	calibrate_delay();
+	cpu_data[cpu].udelay_val = loops_per_jiffy;
+
 	set_cpu_sibling_map(cpu);
 	set_cpu_core_map(cpu);
 
-- 
cgit v1.2.3


From 293bb049380e14a176e98d3701b85b6b5d3140fd Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Wed, 2 Apr 2025 08:05:31 +0100
Subject: ARM: 9446/1: Disallow kernel mode NEON when IRQs are disabled

Commit

  c79f81631142 ("ARM: 9283/1: permit non-nested kernel mode NEON in softirq context")

relaxed the rules around the use of SIMD instructions in kernel mode on
ARM, to allow such use when serving a softirq. To avoid having to
preserve/restore kernel mode NEON state when such a softirq is taken,
softirqs are now disabled when using the NEON from task context.

However, the fact that the softirq API does not allow unmasking of
softirqs with interrupts disabled was overlooked, resulting in a WARN()
in some cases, as reported by Guenter:

  WARNING: CPU: 0 PID: 1145 at kernel/softirq.c:369 __local_bh_enable_ip+0x118/0x194
  Call trace:
   unwind_backtrace from show_stack+0x10/0x14
   show_stack from dump_stack_lvl+0x7c/0xac
   dump_stack_lvl from __warn+0x7c/0x1b8
   __warn from warn_slowpath_fmt+0x19c/0x1a4
   warn_slowpath_fmt from __local_bh_enable_ip+0x118/0x194
   __local_bh_enable_ip from crc_t10dif_arch+0xd4/0xe8
   crc_t10dif_arch from crc_t10dif_wrapper+0x14/0x1c
   crc_t10dif_wrapper from crc_main_test+0x178/0x360
   crc_main_test from kunit_try_run_case+0x78/0x1e0
   kunit_try_run_case from kunit_generic_run_threadfn_adapter+0x1c/0x34
   kunit_generic_run_threadfn_adapter from kthread+0x118/0x254
   kthread from ret_from_fork+0x14/0x28

While disabling softirqs is not really needed when running with IRQs
disabled (given that the only way a softirq can be delivered
asynchrously is over the back of an IRQ), let's not complicate this
logic more than needed, and simply disallow use of the NEON in kernel
mode when IRQs are disabled.

Another approach might be to only disable and re-enable softirqs if IRQs
are enabled, but other than the test case above, there are no clear use
cases for doing non-trivial arithmetic processing (hence using an
accelerated SIMD implementation) with IRQs disabled.

Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/all/389b899f-893c-4855-9e30-d8920a5d6f91@roeck-us.net
Reviewed-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
---
 arch/arm/include/asm/simd.h | 3 ++-
 arch/arm/vfp/vfpmodule.c    | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm/include/asm/simd.h b/arch/arm/include/asm/simd.h
index 82191dbd7e78..e604a9382960 100644
--- a/arch/arm/include/asm/simd.h
+++ b/arch/arm/include/asm/simd.h
@@ -4,5 +4,6 @@
 
 static __must_check inline bool may_use_simd(void)
 {
-	return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq();
+	return IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && !in_hardirq()
+	       && !irqs_disabled();
 }
diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c
index 7803d50b90f8..e559ad3cd148 100644
--- a/arch/arm/vfp/vfpmodule.c
+++ b/arch/arm/vfp/vfpmodule.c
@@ -877,6 +877,7 @@ void kernel_neon_begin(void)
 	 * the kernel mode NEON register contents never need to be preserved.
 	 */
 	BUG_ON(in_hardirq());
+	BUG_ON(irqs_disabled());
 	cpu = __smp_processor_id();
 
 	fpexc = fmrx(FPEXC) | FPEXC_EN;
-- 
cgit v1.2.3


From 314da9d6b3119328c1a38234c68e720380d14957 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 28 May 2025 20:19:56 -0700
Subject: MIPS: loongson2ef: cs5536: add missing function prototypes

Add missing function prototypes for cs5536, mostly for PCI functions,
and for init_mfgpt_clocksource().

arch/mips/loongson2ef/common/cs5536/cs5536_ide.c:15:6: warning: no previous prototype for 'pci_ide_write_reg' [-Wmissing-prototypes]
   15 | void pci_ide_write_reg(int reg, u32 value)
arch/mips/loongson2ef/common/cs5536/cs5536_ide.c:96:5: warning: no previous prototype for 'pci_ide_read_reg' [-Wmissing-prototypes]
   96 | u32 pci_ide_read_reg(int reg)

arch/mips/loongson2ef/common/cs5536/cs5536_ehci.c:15:6: warning: no previous prototype for 'pci_ehci_write_reg' [-Wmissing-prototypes]
   15 | void pci_ehci_write_reg(int reg, u32 value)
arch/mips/loongson2ef/common/cs5536/cs5536_ehci.c:75:5: warning: no previous prototype for 'pci_ehci_read_reg' [-Wmissing-prototypes]
   75 | u32 pci_ehci_read_reg(int reg)

arch/mips/loongson2ef/common/cs5536/cs5536_acc.c:15:6: warning: no previous prototype for 'pci_acc_write_reg' [-Wmissing-prototypes]
   15 | void pci_acc_write_reg(int reg, u32 value)
arch/mips/loongson2ef/common/cs5536/cs5536_acc.c:62:5: warning: no previous prototype for 'pci_acc_read_reg' [-Wmissing-prototypes]
   62 | u32 pci_acc_read_reg(int reg)

arch/mips/loongson2ef/common/cs5536/cs5536_ohci.c:15:6: warning: no previous prototype for 'pci_ohci_write_reg' [-Wmissing-prototypes]
   15 | void pci_ohci_write_reg(int reg, u32 value)
arch/mips/loongson2ef/common/cs5536/cs5536_ohci.c:70:5: warning: no previous prototype for 'pci_ohci_read_reg' [-Wmissing-prototypes]
   70 | u32 pci_ohci_read_reg(int reg)

arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:84:6: warning: no previous prototype for 'pci_isa_write_bar' [-Wmissing-prototypes]
   84 | void pci_isa_write_bar(int n, u32 value)
arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:110:5: warning: no previous prototype for 'pci_isa_read_bar' [-Wmissing-prototypes]
  110 | u32 pci_isa_read_bar(int n)
arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:134:6: warning: no previous prototype for 'pci_isa_write_reg' [-Wmissing-prototypes]
  134 | void pci_isa_write_reg(int reg, u32 value)
arch/mips/loongson2ef/common/cs5536/cs5536_isa.c:228:5: warning: no previous prototype for 'pci_isa_read_reg' [-Wmissing-prototypes]
  228 | u32 pci_isa_read_reg(int reg)

arch/mips/loongson2ef/common/cs5536/cs5536_mfgpt.c:195:12: warning: no previous prototype for 'init_mfgpt_clocksource' [-Wmissing-prototypes]
  195 | int __init init_mfgpt_clocksource(void)

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: linux-mips@vger.kernel.org
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 .../include/asm/mach-loongson2ef/cs5536/cs5536_pci.h | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'arch')

diff --git a/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h b/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h
index a0d4b752899e..5dbc9b13d15b 100644
--- a/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h
+++ b/arch/mips/include/asm/mach-loongson2ef/cs5536/cs5536_pci.h
@@ -12,12 +12,32 @@
 #ifndef _CS5536_PCI_H
 #define _CS5536_PCI_H
 
+#include <linux/init.h>
 #include <linux/types.h>
 #include <linux/pci_regs.h>
 
 extern void cs5536_pci_conf_write4(int function, int reg, u32 value);
 extern u32 cs5536_pci_conf_read4(int function, int reg);
 
+extern void pci_ehci_write_reg(int reg, u32 value);
+extern u32 pci_ehci_read_reg(int reg);
+
+extern void pci_ide_write_reg(int reg, u32 value);
+extern u32 pci_ide_read_reg(int reg);
+
+extern void pci_acc_write_reg(int reg, u32 value);
+extern u32 pci_acc_read_reg(int reg);
+
+extern void pci_ohci_write_reg(int reg, u32 value);
+extern u32 pci_ohci_read_reg(int reg);
+
+extern void pci_isa_write_bar(int n, u32 value);
+extern u32 pci_isa_read_bar(int n);
+extern void pci_isa_write_reg(int reg, u32 value);
+extern u32 pci_isa_read_reg(int reg);
+
+extern int __init init_mfgpt_clocksource(void);
+
 #define CS5536_ACC_INTR		9
 #define CS5536_IDE_INTR		14
 #define CS5536_USB_INTR		11
-- 
cgit v1.2.3


From 5a0c749125c001cba673e9951b0002fba7ea2886 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 28 May 2025 20:20:05 -0700
Subject: MIPS: loongson2ef: lemote-2f: add missing function prototypes

Add several missing function prototypes for lemote-2f to eliminate
build warnings:

arch/mips/loongson2ef/lemote-2f/machtype.c:10:13: warning: no previous prototype for 'mach_prom_init_machtype' [-Wmissing-prototypes]
   10 | void __init mach_prom_init_machtype(void)
arch/mips/loongson2ef/common/machtype.c:34:20: warning: no previous prototype for 'mach_prom_init_machtype' [-Wmissing-prototypes]
   34 | void __weak __init mach_prom_init_machtype(void)
arch/mips/loongson2ef/lemote-2f/pm.c:52:6: warning: no previous prototype for 'setup_wakeup_events' [-Wmissing-prototypes]
   52 | void setup_wakeup_events(void)
arch/mips/loongson2ef/lemote-2f/pm.c:90:5: warning: no previous prototype for 'wakeup_loongson' [-Wmissing-prototypes]
   90 | int wakeup_loongson(void)
arch/mips/loongson2ef/lemote-2f/pm.c:137:13: warning: no previous prototype for 'mach_suspend' [-Wmissing-prototypes]
  137 | void __weak mach_suspend(void)
arch/mips/loongson2ef/lemote-2f/pm.c:142:13: warning: no previous prototype for 'mach_resume' [-Wmissing-prototypes]
  142 | void __weak mach_resume(void)

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: linux-mips@vger.kernel.org
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/include/asm/mach-loongson2ef/loongson.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch')

diff --git a/arch/mips/include/asm/mach-loongson2ef/loongson.h b/arch/mips/include/asm/mach-loongson2ef/loongson.h
index ca039b8dcde3..4a098fb10232 100644
--- a/arch/mips/include/asm/mach-loongson2ef/loongson.h
+++ b/arch/mips/include/asm/mach-loongson2ef/loongson.h
@@ -18,6 +18,9 @@ extern void bonito_irq_init(void);
 extern void mach_prepare_reboot(void);
 extern void mach_prepare_shutdown(void);
 
+/* machine-specific PROM functions */
+extern void __init mach_prom_init_machtype(void);
+
 /* environment arguments from bootloader */
 extern u32 cpu_clock_freq;
 extern u32 memsize, highmemsize;
@@ -45,6 +48,12 @@ extern void __init mach_init_irq(void);
 extern void mach_irq_dispatch(unsigned int pending);
 extern int mach_i8259_irq(void);
 
+/* power management functions */
+extern void setup_wakeup_events(void);
+extern int wakeup_loongson(void);
+extern void __weak mach_suspend(void);
+extern void __weak mach_resume(void);
+
 /* We need this in some places... */
 #define delay() ({		\
 	int x;				\
-- 
cgit v1.2.3


From a96c7330da0b4cc64c6f79044d03f52532bdfc5b Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Fri, 30 May 2025 21:45:33 +0800
Subject: LoongArch: Add a default install.sh

As specified in scripts/install.sh, the priority order is as follows
(from highest to lowest):
  ~/bin/installkernel
  /sbin/installkernel
  arch/loongarch/boot/install.sh

Fallback to default install.sh if installkernel is not found.

Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/boot/install.sh | 56 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100755 arch/loongarch/boot/install.sh

(limited to 'arch')

diff --git a/arch/loongarch/boot/install.sh b/arch/loongarch/boot/install.sh
new file mode 100755
index 000000000000..daac197d3315
--- /dev/null
+++ b/arch/loongarch/boot/install.sh
@@ -0,0 +1,56 @@
+#!/bin/sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 1995 by Linus Torvalds
+#
+# Adapted from code in arch/i386/boot/Makefile by H. Peter Anvin
+# Adapted from code in arch/i386/boot/install.sh by Russell King
+#
+# "make install" script for the LoongArch Linux port
+#
+# Arguments:
+#   $1 - kernel version
+#   $2 - kernel image file
+#   $3 - kernel map file
+#   $4 - default install path (blank if root directory)
+
+set -e
+
+case "${2##*/}" in
+vmlinux.elf)
+  echo "Installing uncompressed vmlinux.elf kernel"
+  base=vmlinux
+  ;;
+vmlinux.efi)
+  echo "Installing uncompressed vmlinux.efi kernel"
+  base=vmlinux
+  ;;
+vmlinuz.efi)
+  echo "Installing gzip/zstd compressed vmlinuz.efi kernel"
+  base=vmlinuz
+  ;;
+*)
+ echo "Warning: Unexpected kernel type"
+ exit 1
+ ;;
+esac
+
+if [ -f $4/$base-$1 ]; then
+  mv $4/$base-$1 $4/$base-$1.old
+fi
+cat $2 > $4/$base-$1
+
+# Install system map file
+if [ -f $4/System.map-$1 ]; then
+  mv $4/System.map-$1 $4/System.map-$1.old
+fi
+cp $3 $4/System.map-$1
+
+# Install kernel config file
+if [ -f $4/config-$1 ]; then
+  mv $4/config-$1 $4/config-$1.old
+fi
+cp .config $4/config-$1
-- 
cgit v1.2.3


From 75cffd392bfabc30ee88836887ea5439ec3b8089 Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Fri, 30 May 2025 21:45:42 +0800
Subject: LoongArch: Using generic scripts/install.sh in `make install`

Use the generic script/install.sh to perform the make install operation.
This will automatically generate the initrd file and modify the grub.cfg
without manual intervention (The previous kernel image, config file and
System.map will also be generated), similar to other architectures.

Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index 0304eabbe606..64bdb52ddf7c 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -181,9 +181,7 @@ vmlinux.elf vmlinux.efi vmlinuz.efi: vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(bootvars-y) $(boot)/$@
 
 install:
-	$(Q)install -D -m 755 $(KBUILD_IMAGE) $(INSTALL_PATH)/$(image-name-y)-$(KERNELRELEASE)
-	$(Q)install -D -m 644 .config $(INSTALL_PATH)/config-$(KERNELRELEASE)
-	$(Q)install -D -m 644 System.map $(INSTALL_PATH)/System.map-$(KERNELRELEASE)
+	$(call cmd,install)
 
 define archhelp
 	echo '  install              - install kernel into $(INSTALL_PATH)'
-- 
cgit v1.2.3


From 980d4a42d595a00be2cec6a39ae3bfa6011ffcb3 Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Fri, 30 May 2025 21:45:42 +0800
Subject: LoongArch: Add some annotations in archhelp

- Add annotations to the kernel image.
- Modify the annotations of make insatll.

Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Makefile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index 64bdb52ddf7c..b0703a4e02a2 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -184,6 +184,11 @@ install:
 	$(call cmd,install)
 
 define archhelp
-	echo '  install              - install kernel into $(INSTALL_PATH)'
+	echo '  vmlinux.elf    - Uncompressed ELF kernel image (arch/loongarch/boot/vmlinux.elf)'
+	echo '  vmlinux.efi    - Uncompressed EFI kernel image (arch/loongarch/boot/vmlinux.efi)'
+	echo '  vmlinuz.efi    - GZIP/ZSTD-compressed EFI kernel image (arch/loongarch/boot/vmlinuz.efi)'
+	echo '                   Default when CONFIG_EFI_ZBOOT=y'
+	echo '  install        - Install kernel using (your) ~/bin/$(INSTALLKERNEL) or'
+	echo '                   (distribution) /sbin/$(INSTALLKERNEL) or install.sh to $$(INSTALL_PATH)'
 	echo
 endef
-- 
cgit v1.2.3


From 93f437315660219245f99724d7597e5b2ea40df3 Mon Sep 17 00:00:00 2001
From: Tianyang Zhang <zhangtianyang@loongson.cn>
Date: Fri, 30 May 2025 21:45:42 +0800
Subject: LoongArch: Add SCHED_MC (Multi-core scheduler) support

In order to achieve more reasonable load balancing behavior, add
SCHED_MC (Multi-core scheduler) support.

The LLC distribution of LoongArch now is consistent with NUMA node,
the balancing domain of SCHED_MC can effectively reduce the situation
where processes are awakened to smt_sibling.

Co-developed-by: Hongliang Wang <wanghongliang@loongson.cn>
Signed-off-by: Hongliang Wang <wanghongliang@loongson.cn>
Signed-off-by: Tianyang Zhang <zhangtianyang@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig                |  9 +++++++++
 arch/loongarch/include/asm/smp.h      |  1 +
 arch/loongarch/include/asm/topology.h |  8 ++++++++
 arch/loongarch/kernel/smp.c           | 38 +++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+)

(limited to 'arch')

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 1a2cf012b8f2..609b15a26621 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -456,6 +456,15 @@ config SCHED_SMT
 	  Improves scheduler's performance when there are multiple
 	  threads in one physical core.
 
+config SCHED_MC
+	bool "Multi-core scheduler support"
+	depends on SMP
+	default y
+	help
+	  Multi-core scheduler support improves the CPU scheduler's decision
+	  making when dealing with multi-core CPU chips at a cost of slightly
+	  increased overhead in some places.
+
 config SMP
 	bool "Multi-Processing support"
 	help
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index b87d1d5e5890..ad0bd234a0f1 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -25,6 +25,7 @@ extern int smp_num_siblings;
 extern int num_processors;
 extern int disabled_cpus;
 extern cpumask_t cpu_sibling_map[];
+extern cpumask_t cpu_llc_shared_map[];
 extern cpumask_t cpu_core_map[];
 extern cpumask_t cpu_foreign_map[];
 
diff --git a/arch/loongarch/include/asm/topology.h b/arch/loongarch/include/asm/topology.h
index 50273c9187d0..ab206678e47f 100644
--- a/arch/loongarch/include/asm/topology.h
+++ b/arch/loongarch/include/asm/topology.h
@@ -30,6 +30,14 @@ void numa_set_distance(int from, int to, int distance);
 #endif
 
 #ifdef CONFIG_SMP
+/*
+ * Return cpus that shares the last level cache.
+ */
+static inline const struct cpumask *cpu_coregroup_mask(int cpu)
+{
+	return &cpu_llc_shared_map[cpu];
+}
+
 #define topology_physical_package_id(cpu)	(cpu_data[cpu].package)
 #define topology_core_id(cpu)			(cpu_data[cpu].core)
 #define topology_core_cpumask(cpu)		(&cpu_core_map[cpu])
diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c
index 4b24589c0b56..46036d98da75 100644
--- a/arch/loongarch/kernel/smp.c
+++ b/arch/loongarch/kernel/smp.c
@@ -46,6 +46,10 @@ EXPORT_SYMBOL(__cpu_logical_map);
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
 
+/* Representing the last level cache shared map of each logical CPU */
+cpumask_t cpu_llc_shared_map[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(cpu_llc_shared_map);
+
 /* Representing the core map of multi-core chips of each logical CPU */
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
@@ -63,6 +67,9 @@ EXPORT_SYMBOL(cpu_foreign_map);
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
+/* representing cpus for which llc shared maps can be computed */
+static cpumask_t cpu_llc_shared_setup_map;
+
 /* representing cpus for which core maps can be computed */
 static cpumask_t cpu_core_setup_map;
 
@@ -102,6 +109,34 @@ static inline void set_cpu_core_map(int cpu)
 	}
 }
 
+static inline void set_cpu_llc_shared_map(int cpu)
+{
+	int i;
+
+	cpumask_set_cpu(cpu, &cpu_llc_shared_setup_map);
+
+	for_each_cpu(i, &cpu_llc_shared_setup_map) {
+		if (cpu_to_node(cpu) == cpu_to_node(i)) {
+			cpumask_set_cpu(i, &cpu_llc_shared_map[cpu]);
+			cpumask_set_cpu(cpu, &cpu_llc_shared_map[i]);
+		}
+	}
+}
+
+static inline void clear_cpu_llc_shared_map(int cpu)
+{
+	int i;
+
+	for_each_cpu(i, &cpu_llc_shared_setup_map) {
+		if (cpu_to_node(cpu) == cpu_to_node(i)) {
+			cpumask_clear_cpu(i, &cpu_llc_shared_map[cpu]);
+			cpumask_clear_cpu(cpu, &cpu_llc_shared_map[i]);
+		}
+	}
+
+	cpumask_clear_cpu(cpu, &cpu_llc_shared_setup_map);
+}
+
 static inline void set_cpu_sibling_map(int cpu)
 {
 	int i;
@@ -406,6 +441,7 @@ int loongson_cpu_disable(void)
 #endif
 	set_cpu_online(cpu, false);
 	clear_cpu_sibling_map(cpu);
+	clear_cpu_llc_shared_map(cpu);
 	calculate_cpu_foreign_map();
 	local_irq_save(flags);
 	irq_migrate_all_off_this_cpu();
@@ -572,6 +608,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	current_thread_info()->cpu = 0;
 	loongson_prepare_cpus(max_cpus);
 	set_cpu_sibling_map(0);
+	set_cpu_llc_shared_map(0);
 	set_cpu_core_map(0);
 	calculate_cpu_foreign_map();
 #ifndef CONFIG_HOTPLUG_CPU
@@ -613,6 +650,7 @@ asmlinkage void start_secondary(void)
 	loongson_init_secondary();
 
 	set_cpu_sibling_map(cpu);
+	set_cpu_llc_shared_map(cpu);
 	set_cpu_core_map(cpu);
 
 	notify_cpu_starting(cpu);
-- 
cgit v1.2.3


From b37981ce540dffa64a4664ccf0e20dbef6c2c638 Mon Sep 17 00:00:00 2001
From: Yuli Wang <wangyuli@uniontech.com>
Date: Fri, 30 May 2025 21:45:42 +0800
Subject: LoongArch: Enable ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS

Provide support for CONFIG_MSEAL_SYSTEM_MAPPINGS on LoongArch, covering
the vdso.

Link: https://lore.kernel.org/all/25bad37f-273e-4626-999c-e1890be96182@lucifer.local/
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Acked-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Jeff Xu <jeffxu@chromium.org>
Tested-by: Yuli Wang <wangyuli@uniontech.com>
Signed-off-by: Yuli Wang <wangyuli@uniontech.com>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig       | 1 +
 arch/loongarch/kernel/vdso.c | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 609b15a26621..bfc5604c494d 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -69,6 +69,7 @@ config LOONGARCH
 	select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
 	select ARCH_SUPPORTS_LTO_CLANG
 	select ARCH_SUPPORTS_LTO_CLANG_THIN
+	select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_SUPPORTS_RT
 	select ARCH_USE_BUILTIN_BSWAP
diff --git a/arch/loongarch/kernel/vdso.c b/arch/loongarch/kernel/vdso.c
index 10cf1608c7b3..7b888d9085a0 100644
--- a/arch/loongarch/kernel/vdso.c
+++ b/arch/loongarch/kernel/vdso.c
@@ -105,7 +105,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	vdso_addr = data_addr + VVAR_SIZE;
 	vma = _install_special_mapping(mm, vdso_addr, info->size,
-				       VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC,
+				       VM_READ | VM_EXEC |
+				       VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
+				       VM_SEALED_SYSMAP,
 				       &info->code_mapping);
 	if (IS_ERR(vma)) {
 		ret = PTR_ERR(vma);
-- 
cgit v1.2.3


From a45728fd4120011c78af1d056e571b84d47dfcc1 Mon Sep 17 00:00:00 2001
From: Youling Tang <tangyouling@kylinos.cn>
Date: Fri, 30 May 2025 21:45:42 +0800
Subject: LoongArch: Enable HAVE_ARCH_STACKLEAK

Add support for the stackleak feature. It initializes the stack with the
poison value before returning from system calls which improves the kernel
security.

At the same time, disables the plugin in EFI stub code because EFI stub
is out of scope for the protection.

Tested on Loongson-3A5000 (enable GCC_PLUGIN_STACKLEAK and LKDTM):
 # echo STACKLEAK_ERASING > /sys/kernel/debug/provoke-crash/DIRECT
 # dmesg
   lkdtm: Performing direct entry STACKLEAK_ERASING
   lkdtm: stackleak stack usage:
      high offset: 320 bytes
      current:     448 bytes
      lowest:      1264 bytes
      tracked:     1264 bytes
      untracked:   208 bytes
      poisoned:    14528 bytes
      low offset:  64 bytes
   lkdtm: OK: the rest of the thread stack is properly erased

Signed-off-by: Youling Tang <tangyouling@kylinos.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig                    | 1 +
 arch/loongarch/include/asm/entry-common.h | 8 +-------
 arch/loongarch/include/asm/stackframe.h   | 6 ++++++
 arch/loongarch/include/asm/stacktrace.h   | 5 +++++
 arch/loongarch/kernel/entry.S             | 3 +++
 5 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index bfc5604c494d..38706186cf13 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -124,6 +124,7 @@ config LOONGARCH
 	select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
 	select HAVE_ARCH_SECCOMP
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_USERFAULTFD_MINOR if USERFAULTFD
diff --git a/arch/loongarch/include/asm/entry-common.h b/arch/loongarch/include/asm/entry-common.h
index 0fe2a098ded9..099132980dc9 100644
--- a/arch/loongarch/include/asm/entry-common.h
+++ b/arch/loongarch/include/asm/entry-common.h
@@ -2,12 +2,6 @@
 #ifndef ARCH_LOONGARCH_ENTRY_COMMON_H
 #define ARCH_LOONGARCH_ENTRY_COMMON_H
 
-#include <linux/sched.h>
-#include <linux/processor.h>
-
-static inline bool on_thread_stack(void)
-{
-	return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1));
-}
+#include <asm/stacktrace.h> /* For on_thread_stack() */
 
 #endif
diff --git a/arch/loongarch/include/asm/stackframe.h b/arch/loongarch/include/asm/stackframe.h
index 66736837085b..3eda298702b1 100644
--- a/arch/loongarch/include/asm/stackframe.h
+++ b/arch/loongarch/include/asm/stackframe.h
@@ -57,6 +57,12 @@
 	jirl	zero, \temp1, 0xc
 	.endm
 
+	.macro STACKLEAK_ERASE
+#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
+	bl	stackleak_erase_on_task_stack
+#endif
+	.endm
+
 	.macro BACKUP_T0T1
 	csrwr	t0, EXCEPTION_KS0
 	csrwr	t1, EXCEPTION_KS1
diff --git a/arch/loongarch/include/asm/stacktrace.h b/arch/loongarch/include/asm/stacktrace.h
index fc8b64773794..5c8be156567c 100644
--- a/arch/loongarch/include/asm/stacktrace.h
+++ b/arch/loongarch/include/asm/stacktrace.h
@@ -31,6 +31,11 @@ bool in_irq_stack(unsigned long stack, struct stack_info *info);
 bool in_task_stack(unsigned long stack, struct task_struct *task, struct stack_info *info);
 int get_stack_info(unsigned long stack, struct task_struct *task, struct stack_info *info);
 
+static __always_inline bool on_thread_stack(void)
+{
+	return !(((unsigned long)(current->stack) ^ current_stack_pointer) & ~(THREAD_SIZE - 1));
+}
+
 #define STR_LONG_L    __stringify(LONG_L)
 #define STR_LONG_S    __stringify(LONG_S)
 #define STR_LONGSIZE  __stringify(LONGSIZE)
diff --git a/arch/loongarch/kernel/entry.S b/arch/loongarch/kernel/entry.S
index 2abc29e57381..47e1db9a1ce4 100644
--- a/arch/loongarch/kernel/entry.S
+++ b/arch/loongarch/kernel/entry.S
@@ -73,6 +73,7 @@ SYM_CODE_START(handle_syscall)
 	move		a0, sp
 	bl		do_syscall
 
+	STACKLEAK_ERASE
 	RESTORE_ALL_AND_RET
 SYM_CODE_END(handle_syscall)
 _ASM_NOKPROBE(handle_syscall)
@@ -81,6 +82,7 @@ SYM_CODE_START(ret_from_fork_asm)
 	UNWIND_HINT_REGS
 	move		a1, sp
 	bl 		ret_from_fork
+	STACKLEAK_ERASE
 	RESTORE_STATIC
 	RESTORE_SOME
 	RESTORE_SP_AND_RET
@@ -92,6 +94,7 @@ SYM_CODE_START(ret_from_kernel_thread_asm)
 	move		a2, s0
 	move		a3, s1
 	bl		ret_from_kernel_thread
+	STACKLEAK_ERASE
 	RESTORE_STATIC
 	RESTORE_SOME
 	RESTORE_SP_AND_RET
-- 
cgit v1.2.3


From 9559d5806319a9254d9053b22a31324e1929aac6 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 30 May 2025 21:45:43 +0800
Subject: LoongArch: Increase max supported CPUs up to 2048

Increase max supported CPUs up to 2048, including:
1. Increase CSR.CPUID register's effective width;
2. Define MAX_CORE_PIC (a.k.a. max physical ID) to 2048;
3. Allow NR_CPUS (a.k.a. max logical ID) to be as large as 2048;
4. Introduce acpi_numa_x2apic_affinity_init() to handle ACPI SRAT
   for CPUID >= 256.

Note: The reason of increasing to 2048 rather than 4096/8192 is because
      the IPI hardware can only support 2048 as a maximum.

Reviewed-by: Yanteng Si <si.yanteng@linux.dev>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig                 |  6 ++---
 arch/loongarch/include/asm/acpi.h      |  2 +-
 arch/loongarch/include/asm/loongarch.h |  4 ++--
 arch/loongarch/kernel/acpi.c           | 41 +++++++++++++++++++++++++++++-----
 4 files changed, 41 insertions(+), 12 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 38706186cf13..0c356de91bc4 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -496,10 +496,10 @@ config HOTPLUG_CPU
 	  Say N if you want to disable CPU hotplug.
 
 config NR_CPUS
-	int "Maximum number of CPUs (2-256)"
-	range 2 256
+	int "Maximum number of CPUs (2-2048)"
+	range 2 2048
+	default "2048"
 	depends on SMP
-	default "64"
 	help
 	  This allows you to specify the maximum number of CPUs which this
 	  kernel will support.
diff --git a/arch/loongarch/include/asm/acpi.h b/arch/loongarch/include/asm/acpi.h
index 313f66f7913a..7376840fa9f7 100644
--- a/arch/loongarch/include/asm/acpi.h
+++ b/arch/loongarch/include/asm/acpi.h
@@ -33,7 +33,7 @@ static inline bool acpi_has_cpu_in_madt(void)
 	return true;
 }
 
-#define MAX_CORE_PIC 256
+#define MAX_CORE_PIC 2048
 
 extern struct list_head acpi_wakeup_device_list;
 extern struct acpi_madt_core_pic acpi_core_pic[MAX_CORE_PIC];
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index 52651aa0e583..d84dac88a584 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -411,8 +411,8 @@
 
 /* Config CSR registers */
 #define LOONGARCH_CSR_CPUID		0x20	/* CPU core id */
-#define  CSR_CPUID_COREID_WIDTH		9
-#define  CSR_CPUID_COREID		_ULCAST_(0x1ff)
+#define  CSR_CPUID_COREID_WIDTH		11
+#define  CSR_CPUID_COREID		_ULCAST_(0x7ff)
 
 #define LOONGARCH_CSR_PRCFG1		0x21	/* Config1 */
 #define  CSR_CONF1_VSMAX_SHIFT		12
diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c
index 1120ac2824f6..fba4bb93e1bd 100644
--- a/arch/loongarch/kernel/acpi.c
+++ b/arch/loongarch/kernel/acpi.c
@@ -244,11 +244,6 @@ fdt_earlycon:
 
 #ifdef CONFIG_ACPI_NUMA
 
-static __init int setup_node(int pxm)
-{
-	return acpi_map_pxm_to_node(pxm);
-}
-
 void __init numa_set_distance(int from, int to, int distance)
 {
 	if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) {
@@ -280,7 +275,41 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 		pxm |= (pa->proximity_domain_hi[1] << 16);
 		pxm |= (pa->proximity_domain_hi[2] << 24);
 	}
-	node = setup_node(pxm);
+	node = acpi_map_pxm_to_node(pxm);
+	if (node < 0) {
+		pr_err("SRAT: Too many proximity domains %x\n", pxm);
+		bad_srat();
+		return;
+	}
+
+	if (pa->apic_id >= CONFIG_NR_CPUS) {
+		pr_info("SRAT: PXM %u -> CPU 0x%02x -> Node %u skipped apicid that is too big\n",
+				pxm, pa->apic_id, node);
+		return;
+	}
+
+	early_numa_add_cpu(pa->apic_id, node);
+
+	set_cpuid_to_node(pa->apic_id, node);
+	node_set(node, numa_nodes_parsed);
+	pr_info("SRAT: PXM %u -> CPU 0x%02x -> Node %u\n", pxm, pa->apic_id, node);
+}
+
+void __init
+acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
+{
+	int pxm, node;
+
+	if (srat_disabled())
+		return;
+	if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
+		bad_srat();
+		return;
+	}
+	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
+		return;
+	pxm = pa->proximity_domain;
+	node = acpi_map_pxm_to_node(pxm);
 	if (node < 0) {
 		pr_err("SRAT: Too many proximity domains %x\n", pxm);
 		bad_srat();
-- 
cgit v1.2.3


From a24f2fb70cb62180486ad4d74f809ff35ddd1cf9 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 30 May 2025 21:45:43 +0800
Subject: LoongArch: Introduce the numa_memblks conversion

Commit 87482708210ff3333a ("mm: introduce numa_memblks") has moved
numa_memblks from x86 to the generic code, but LoongArch was left out
of this conversion.

This patch introduces the generic numa_memblks for LoongArch.

In detail:
1. Enable NUMA_MEMBLKS (but disable NUMA_EMU) in Kconfig;
2. Use generic definition for numa_memblk and numa_meminfo;
3. Use generic implementation for numa_add_memblk() and its friends;
4. Use generic implementation for numa_set_distance() and its friends;
5. Use generic implementation for memory_add_physaddr_to_nid() and its
   friends.

Note: Disable NUMA_EMU because it needs more efforts and no obvious
demand now.

Tested-by: Binbin Zhou <zhoubinbin@loongson.cn>
Signed-off-by: Yuquan Wang <wangyuquan1236@phytium.com.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/Kconfig                 |   1 +
 arch/loongarch/include/asm/numa.h      |  14 -----
 arch/loongarch/include/asm/sparsemem.h |   5 --
 arch/loongarch/include/asm/topology.h  |   7 +--
 arch/loongarch/kernel/acpi.c           |  11 ----
 arch/loongarch/kernel/numa.c           | 108 ++++-----------------------------
 arch/loongarch/mm/init.c               |   8 ---
 7 files changed, 15 insertions(+), 139 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 0c356de91bc4..4b19f93379a1 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -189,6 +189,7 @@ config LOONGARCH
 	select MODULES_USE_ELF_RELA if MODULES
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK
 	select NEED_PER_CPU_PAGE_FIRST_CHUNK
+	select NUMA_MEMBLKS if NUMA
 	select OF
 	select OF_EARLY_FLATTREE
 	select PCI
diff --git a/arch/loongarch/include/asm/numa.h b/arch/loongarch/include/asm/numa.h
index b5f9de9f102e..bbf9f70bd25f 100644
--- a/arch/loongarch/include/asm/numa.h
+++ b/arch/loongarch/include/asm/numa.h
@@ -22,20 +22,6 @@ extern int numa_off;
 extern s16 __cpuid_to_node[CONFIG_NR_CPUS];
 extern nodemask_t numa_nodes_parsed __initdata;
 
-struct numa_memblk {
-	u64			start;
-	u64			end;
-	int			nid;
-};
-
-#define NR_NODE_MEMBLKS		(MAX_NUMNODES*2)
-struct numa_meminfo {
-	int			nr_blks;
-	struct numa_memblk	blk[NR_NODE_MEMBLKS];
-};
-
-extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
-
 extern void __init early_numa_add_cpu(int cpuid, s16 node);
 extern void numa_add_cpu(unsigned int cpu);
 extern void numa_remove_cpu(unsigned int cpu);
diff --git a/arch/loongarch/include/asm/sparsemem.h b/arch/loongarch/include/asm/sparsemem.h
index 8d4af6aff8a8..4501efac1a87 100644
--- a/arch/loongarch/include/asm/sparsemem.h
+++ b/arch/loongarch/include/asm/sparsemem.h
@@ -21,11 +21,6 @@
 #define VMEMMAP_SIZE	0	/* 1, For FLATMEM; 2, For SPARSEMEM without VMEMMAP. */
 #endif
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-int memory_add_physaddr_to_nid(u64 addr);
-#define memory_add_physaddr_to_nid memory_add_physaddr_to_nid
-#endif
-
 #define INIT_MEMBLOCK_RESERVED_REGIONS	(INIT_MEMBLOCK_REGIONS + NR_CPUS)
 
 #endif /* _LOONGARCH_SPARSEMEM_H */
diff --git a/arch/loongarch/include/asm/topology.h b/arch/loongarch/include/asm/topology.h
index ab206678e47f..f06e7ff25bb7 100644
--- a/arch/loongarch/include/asm/topology.h
+++ b/arch/loongarch/include/asm/topology.h
@@ -19,11 +19,8 @@ extern int pcibus_to_node(struct pci_bus *);
 
 #define cpumask_of_pcibus(bus)	(cpu_online_mask)
 
-extern unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES];
-
-void numa_set_distance(int from, int to, int distance);
-
-#define node_distance(from, to)	(node_distances[(from)][(to)])
+int __node_distance(int from, int to);
+#define node_distance(from, to) __node_distance(from, to)
 
 #else
 #define pcibus_to_node(bus)	0
diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c
index fba4bb93e1bd..a54cd6fd3796 100644
--- a/arch/loongarch/kernel/acpi.c
+++ b/arch/loongarch/kernel/acpi.c
@@ -244,17 +244,6 @@ fdt_earlycon:
 
 #ifdef CONFIG_ACPI_NUMA
 
-void __init numa_set_distance(int from, int to, int distance)
-{
-	if ((u8)distance != distance || (from == to && distance != LOCAL_DISTANCE)) {
-		pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
-				from, to, distance);
-		return;
-	}
-
-	node_distances[from][to] = distance;
-}
-
 /* Callback for Proximity Domain -> CPUID mapping */
 void __init
 acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c
index 30a72fd528c0..d6e73e8f9c0b 100644
--- a/arch/loongarch/kernel/numa.c
+++ b/arch/loongarch/kernel/numa.c
@@ -11,6 +11,7 @@
 #include <linux/mmzone.h>
 #include <linux/export.h>
 #include <linux/nodemask.h>
+#include <linux/numa_memblks.h>
 #include <linux/swap.h>
 #include <linux/memblock.h>
 #include <linux/pfn.h>
@@ -27,10 +28,6 @@
 #include <asm/time.h>
 
 int numa_off;
-unsigned char node_distances[MAX_NUMNODES][MAX_NUMNODES];
-EXPORT_SYMBOL(node_distances);
-
-static struct numa_meminfo numa_meminfo;
 cpumask_t cpus_on_node[MAX_NUMNODES];
 cpumask_t phys_cpus_on_node[MAX_NUMNODES];
 EXPORT_SYMBOL(cpus_on_node);
@@ -43,8 +40,6 @@ s16 __cpuid_to_node[CONFIG_NR_CPUS] = {
 };
 EXPORT_SYMBOL(__cpuid_to_node);
 
-nodemask_t numa_nodes_parsed __initdata;
-
 #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
@@ -145,48 +140,6 @@ void numa_remove_cpu(unsigned int cpu)
 	cpumask_clear_cpu(cpu, &cpus_on_node[nid]);
 }
 
-static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
-				     struct numa_meminfo *mi)
-{
-	/* ignore zero length blks */
-	if (start == end)
-		return 0;
-
-	/* whine about and ignore invalid blks */
-	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
-		pr_warn("NUMA: Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
-			   nid, start, end - 1);
-		return 0;
-	}
-
-	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
-		pr_err("NUMA: too many memblk ranges\n");
-		return -EINVAL;
-	}
-
-	mi->blk[mi->nr_blks].start = PFN_ALIGN(start);
-	mi->blk[mi->nr_blks].end = PFN_ALIGN(end - PAGE_SIZE + 1);
-	mi->blk[mi->nr_blks].nid = nid;
-	mi->nr_blks++;
-	return 0;
-}
-
-/**
- * numa_add_memblk - Add one numa_memblk to numa_meminfo
- * @nid: NUMA node ID of the new memblk
- * @start: Start address of the new memblk
- * @end: End address of the new memblk
- *
- * Add a new memblk to the default numa_meminfo.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int __init numa_add_memblk(int nid, u64 start, u64 end)
-{
-	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
-}
-
 static void __init node_mem_init(unsigned int node)
 {
 	unsigned long start_pfn, end_pfn;
@@ -205,18 +158,6 @@ static void __init node_mem_init(unsigned int node)
 
 #ifdef CONFIG_ACPI_NUMA
 
-static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type)
-{
-	static unsigned long num_physpages;
-
-	num_physpages += (size >> PAGE_SHIFT);
-	pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n",
-		node, type, start, size);
-	pr_info("       start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
-		start >> PAGE_SHIFT, (start + size) >> PAGE_SHIFT, num_physpages);
-	memblock_set_node(start, size, &memblock.memory, node);
-}
-
 /*
  * add_numamem_region
  *
@@ -228,28 +169,21 @@ static void __init add_node_intersection(u32 node, u64 start, u64 size, u32 type
  */
 static void __init add_numamem_region(u64 start, u64 end, u32 type)
 {
-	u32 i;
-	u64 ofs = start;
+	u32 node = pa_to_nid(start);
+	u64 size = end - start;
+	static unsigned long num_physpages;
 
 	if (start >= end) {
 		pr_debug("Invalid region: %016llx-%016llx\n", start, end);
 		return;
 	}
 
-	for (i = 0; i < numa_meminfo.nr_blks; i++) {
-		struct numa_memblk *mb = &numa_meminfo.blk[i];
-
-		if (ofs > mb->end)
-			continue;
-
-		if (end > mb->end) {
-			add_node_intersection(mb->nid, ofs, mb->end - ofs, type);
-			ofs = mb->end;
-		} else {
-			add_node_intersection(mb->nid, ofs, end - ofs, type);
-			break;
-		}
-	}
+	num_physpages += (size >> PAGE_SHIFT);
+	pr_info("Node%d: mem_type:%d, mem_start:0x%llx, mem_size:0x%llx Bytes\n",
+		node, type, start, size);
+	pr_info("       start_pfn:0x%llx, end_pfn:0x%llx, num_physpages:0x%lx\n",
+		start >> PAGE_SHIFT, end >> PAGE_SHIFT, num_physpages);
+	memblock_set_node(start, size, &memblock.memory, node);
 }
 
 static void __init init_node_memblock(void)
@@ -291,24 +225,6 @@ static void __init init_node_memblock(void)
 	}
 }
 
-static void __init numa_default_distance(void)
-{
-	int row, col;
-
-	for (row = 0; row < MAX_NUMNODES; row++)
-		for (col = 0; col < MAX_NUMNODES; col++) {
-			if (col == row)
-				node_distances[row][col] = LOCAL_DISTANCE;
-			else
-				/* We assume that one node per package here!
-				 *
-				 * A SLIT should be used for multiple nodes
-				 * per package to override default setting.
-				 */
-				node_distances[row][col] = REMOTE_DISTANCE;
-	}
-}
-
 /*
  * fake_numa_init() - For Non-ACPI systems
  * Return: 0 on success, -errno on failure.
@@ -333,11 +249,11 @@ int __init init_numa_memory(void)
 	for (i = 0; i < NR_CPUS; i++)
 		set_cpuid_to_node(i, NUMA_NO_NODE);
 
-	numa_default_distance();
+	numa_reset_distance();
 	nodes_clear(numa_nodes_parsed);
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
-	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+	WARN_ON(memblock_clear_hotplug(0, PHYS_ADDR_MAX));
 
 	/* Parse SRAT and SLIT if provided by firmware. */
 	ret = acpi_disabled ? fake_numa_init() : acpi_numa_init();
diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c
index 06f11d9e4ec1..c3e4586a7975 100644
--- a/arch/loongarch/mm/init.c
+++ b/arch/loongarch/mm/init.c
@@ -106,14 +106,6 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 		page += vmem_altmap_offset(altmap);
 	__remove_pages(start_pfn, nr_pages, altmap);
 }
-
-#ifdef CONFIG_NUMA
-int memory_add_physaddr_to_nid(u64 start)
-{
-	return pa_to_nid(start);
-}
-EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-- 
cgit v1.2.3


From 52c22661c79a7b6af7fad9f77200738fc6c51878 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 30 May 2025 21:45:48 +0800
Subject: LoongArch: Avoid using $r0/$r1 as "mask" for csrxchg

When building kernel with LLVM there are occasionally such errors:

In file included from ./include/linux/spinlock.h:59:
In file included from ./include/linux/irqflags.h:17:
arch/loongarch/include/asm/irqflags.h:38:3: error: must not be $r0 or $r1
   38 |                 "csrxchg %[val], %[mask], %[reg]\n\t"
      |                 ^
<inline asm>:1:16: note: instantiated into assembly here
    1 |         csrxchg $a1, $ra, 0
      |                       ^

To prevent the compiler from allocating $r0 or $r1 for the "mask" of the
csrxchg instruction, the 'q' constraint must be used but Clang < 21 does
not support it. So force to use $t0 in the inline asm, in order to avoid
using $r0/$r1 while keeping the backward compatibility.

Cc: stable@vger.kernel.org
Link: https://github.com/llvm/llvm-project/pull/141037
Reviewed-by: Yanteng Si <si.yanteng@linux.dev>
Suggested-by: WANG Rui <wangrui@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/irqflags.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/include/asm/irqflags.h b/arch/loongarch/include/asm/irqflags.h
index 319a8c616f1f..003172b8406b 100644
--- a/arch/loongarch/include/asm/irqflags.h
+++ b/arch/loongarch/include/asm/irqflags.h
@@ -14,40 +14,48 @@
 static inline void arch_local_irq_enable(void)
 {
 	u32 flags = CSR_CRMD_IE;
+	register u32 mask asm("t0") = CSR_CRMD_IE;
+
 	__asm__ __volatile__(
 		"csrxchg %[val], %[mask], %[reg]\n\t"
 		: [val] "+r" (flags)
-		: [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD)
+		: [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD)
 		: "memory");
 }
 
 static inline void arch_local_irq_disable(void)
 {
 	u32 flags = 0;
+	register u32 mask asm("t0") = CSR_CRMD_IE;
+
 	__asm__ __volatile__(
 		"csrxchg %[val], %[mask], %[reg]\n\t"
 		: [val] "+r" (flags)
-		: [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD)
+		: [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD)
 		: "memory");
 }
 
 static inline unsigned long arch_local_irq_save(void)
 {
 	u32 flags = 0;
+	register u32 mask asm("t0") = CSR_CRMD_IE;
+
 	__asm__ __volatile__(
 		"csrxchg %[val], %[mask], %[reg]\n\t"
 		: [val] "+r" (flags)
-		: [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD)
+		: [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD)
 		: "memory");
 	return flags;
 }
 
 static inline void arch_local_irq_restore(unsigned long flags)
 {
+	register u32 mask asm("t0") = CSR_CRMD_IE;
+
 	__asm__ __volatile__(
 		"csrxchg %[val], %[mask], %[reg]\n\t"
 		: [val] "+r" (flags)
-		: [mask] "r" (CSR_CRMD_IE), [reg] "i" (LOONGARCH_CSR_CRMD)
+		: [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD)
 		: "memory");
 }
 
-- 
cgit v1.2.3


From 712e6a914328e9672d534fc77edd596234ec2d7f Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 30 May 2025 21:45:48 +0800
Subject: LoongArch: Preserve firmware configuration when desired

If we must preserve the firmware resource assignments, claim the existing
resources rather than reassigning everything.

According to PCI Firmware Specification: if ACPI DSM#5 function returns
0, the OS must retain the resource allocation for PCI in the firmware; if
ACPI DSM#5 function returns 1, the OS can ignore the resource allocation
for PCI and reallocate it.

Signed-off-by: Qihang Gao <gaoqihang@loongson.cn>
Signed-off-by: Juxin Gao <gaojuxin@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/pci/acpi.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/pci/acpi.c b/arch/loongarch/pci/acpi.c
index 1da4dc46df43..50c9016641a4 100644
--- a/arch/loongarch/pci/acpi.c
+++ b/arch/loongarch/pci/acpi.c
@@ -194,6 +194,7 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 {
 	struct pci_bus *bus;
 	struct pci_root_info *info;
+	struct pci_host_bridge *host;
 	struct acpi_pci_root_ops *root_ops;
 	int domain = root->segment;
 	int busnum = root->secondary.start;
@@ -237,8 +238,17 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
 			return NULL;
 		}
 
-		pci_bus_size_bridges(bus);
-		pci_bus_assign_resources(bus);
+		/* If we must preserve the resource configuration, claim now */
+		host = pci_find_host_bridge(bus);
+		if (host->preserve_config)
+			pci_bus_claim_resources(bus);
+
+		/*
+		 * Assign whatever was left unassigned. If we didn't claim above,
+		 * this will reassign everything.
+		 */
+		pci_assign_unassigned_root_bus_resources(bus);
+
 		list_for_each_entry(child, &bus->children, node)
 			pcie_bus_configure_settings(child);
 	}
-- 
cgit v1.2.3


From ee084fa96123ede8b0563a1b5a9b23adc43cd50d Mon Sep 17 00:00:00 2001
From: Tianyang Zhang <zhangtianyang@loongson.cn>
Date: Fri, 30 May 2025 21:45:57 +0800
Subject: LoongArch: Fix panic caused by NULL-PMD in huge_pte_offset()

ERROR INFO:

CPU 25 Unable to handle kernel paging request at virtual address 0x0
         ...
 Call Trace:
 [<900000000023c30c>] huge_pte_offset+0x3c/0x58
 [<900000000057fd4c>] hugetlb_follow_page_mask+0x74/0x438
 [<900000000051fee8>] __get_user_pages+0xe0/0x4c8
 [<9000000000522414>] faultin_page_range+0x84/0x380
 [<9000000000564e8c>] madvise_vma_behavior+0x534/0xa48
 [<900000000056689c>] do_madvise+0x1bc/0x3e8
 [<9000000000566df4>] sys_madvise+0x24/0x38
 [<90000000015b9e88>] do_syscall+0x78/0x98
 [<9000000000221f18>] handle_syscall+0xb8/0x158

In some cases, pmd may be NULL and rely on NULL as the return value for
processing, so it is necessary to determine this situation here.

Cc: stable@vger.kernel.org
Fixes: bd51834d1cf6 ("LoongArch: Return NULL from huge_pte_offset() for invalid PMD")
Signed-off-by: Tianyang Zhang <zhangtianyang@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/mm/hugetlbpage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c
index cea84d7f2b91..02dad4624fe3 100644
--- a/arch/loongarch/mm/hugetlbpage.c
+++ b/arch/loongarch/mm/hugetlbpage.c
@@ -47,7 +47,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
 				pmd = pmd_offset(pud, addr);
 		}
 	}
-	return pmd_none(pmdp_get(pmd)) ? NULL : (pte_t *) pmd;
+
+	return (!pmd || pmd_none(pmdp_get(pmd))) ? NULL : (pte_t *) pmd;
 }
 
 uint64_t pmd_to_entrylo(unsigned long pmd_val)
-- 
cgit v1.2.3


From 99850a1c93fe7ca40ad9efddc00acec6e85c5e48 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 29 May 2025 13:10:24 -0400
Subject: x86/fpu: Remove unused trace events

The following trace events are not used and defining them just wastes
memory:

  x86_fpu_before_restore
  x86_fpu_after_restore
  x86_fpu_init_state

Simply remove them.

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Linux Trace Kernel <linux-trace-kernel@vger.kernel.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/all/20250529130138.544ffec4@gandalf.local.home  # background
Link: https://lore.kernel.org/r/20250529131024.7c2ef96f@gandalf.local.home    # x86 submission
---
 arch/x86/include/asm/trace/fpu.h | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 0454d5e60e5d..721b408d9a67 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -44,16 +44,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_after_save,
 	TP_ARGS(fpu)
 );
 
-DEFINE_EVENT(x86_fpu, x86_fpu_before_restore,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
-DEFINE_EVENT(x86_fpu, x86_fpu_after_restore,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_regs_activated,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
@@ -64,11 +54,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated,
 	TP_ARGS(fpu)
 );
 
-DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_dropped,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
-- 
cgit v1.2.3


From 86aa94cd50b138be0dd872b0779fa3036e641881 Mon Sep 17 00:00:00 2001
From: Dapeng Mi <dapeng1.mi@linux.intel.com>
Date: Thu, 29 May 2025 08:02:36 +0000
Subject: perf/x86/intel: Fix incorrect MSR index calculations in
 intel_pmu_config_acr()

The MSR offset calculations in intel_pmu_config_acr() are buggy.

To calculate fixed counter MSR addresses in intel_pmu_config_acr(),
the HW counter index "idx" is subtracted by INTEL_PMC_IDX_FIXED.

This leads to the ACR mask value of fixed counters to be incorrectly
saved to the positions of GP counters in acr_cfg_b[], e.g.

For fixed counter 0, its ACR counter mask should be saved to
acr_cfg_b[32], but it's saved to acr_cfg_b[0] incorrectly.

Fix this issue.

[ mingo: Clarified & improved the changelog. ]

Fixes: ec980e4facef ("perf/x86/intel: Support auto counter reload")
Signed-off-by: Dapeng Mi <dapeng1.mi@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250529080236.2552247-2-dapeng1.mi@linux.intel.com
---
 arch/x86/events/intel/core.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 466283326630..741b229f0718 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2900,6 +2900,7 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	int msr_b, msr_c;
+	int msr_offset;
 
 	if (!mask && !cpuc->acr_cfg_b[idx])
 		return;
@@ -2907,19 +2908,20 @@ static void intel_pmu_config_acr(int idx, u64 mask, u32 reload)
 	if (idx < INTEL_PMC_IDX_FIXED) {
 		msr_b = MSR_IA32_PMC_V6_GP0_CFG_B;
 		msr_c = MSR_IA32_PMC_V6_GP0_CFG_C;
+		msr_offset = x86_pmu.addr_offset(idx, false);
 	} else {
 		msr_b = MSR_IA32_PMC_V6_FX0_CFG_B;
 		msr_c = MSR_IA32_PMC_V6_FX0_CFG_C;
-		idx -= INTEL_PMC_IDX_FIXED;
+		msr_offset = x86_pmu.addr_offset(idx - INTEL_PMC_IDX_FIXED, false);
 	}
 
 	if (cpuc->acr_cfg_b[idx] != mask) {
-		wrmsrl(msr_b + x86_pmu.addr_offset(idx, false), mask);
+		wrmsrl(msr_b + msr_offset, mask);
 		cpuc->acr_cfg_b[idx] = mask;
 	}
 	/* Only need to update the reload value when there is a valid config value. */
 	if (mask && cpuc->acr_cfg_c[idx] != reload) {
-		wrmsrl(msr_c + x86_pmu.addr_offset(idx, false), reload);
+		wrmsrl(msr_c + msr_offset, reload);
 		cpuc->acr_cfg_c[idx] = reload;
 	}
 }
-- 
cgit v1.2.3


From b8c9c3b822fe8e033b9802516f6466099d915488 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 9 May 2025 10:40:52 +0200
Subject: um: stop using PCI port I/O

arch/um is one of the last users of CONFIG_GENERIC_IOMAP, but upon
closer look it appears that the PCI host bridge does not register
any port I/O, and the absense of both custom inb/outb functions and
a PCI_IOBASE constant means that actually trying to use port I/O
results on a NULL pointer access.

Build testing with clang confirms this by warning about this exact
problem:

include/asm-generic/io.h:549:31: error: performing pointer arithmetic on a null pointer has undefined behavior [-Werror,-Wnull-pointer-arithmetic]
  549 |         val = __raw_readb(PCI_IOBASE + addr);
      |                           ~~~~~~~~~~ ^

Remove all the Kconfig selects that refer to legacy port I/O
and instead just build the normal MMIO path that is emulated
by the virtio PCI host.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Link: https://patch.msgid.link/20250509084125.1488601-1-arnd@kernel.org
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/Kconfig         |  6 ------
 arch/um/kernel/Makefile |  1 -
 arch/um/kernel/ioport.c | 13 -------------
 3 files changed, 20 deletions(-)
 delete mode 100644 arch/um/kernel/ioport.c

(limited to 'arch')

diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 79509c7f39de..f08e8a7fac93 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -52,13 +52,7 @@ config NO_IOMEM
 config UML_IOMEM_EMULATION
 	bool
 	select INDIRECT_IOMEM
-	select HAS_IOPORT
 	select GENERIC_PCI_IOMAP
-	select GENERIC_IOMAP
-	select NO_GENERIC_PCI_IOPORT_MAP
-
-config NO_IOPORT_MAP
-	def_bool !UML_IOMEM_EMULATION
 
 config ISA
 	bool
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 4df1cd0d2017..4669db2aa9be 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -25,7 +25,6 @@ obj-$(CONFIG_GPROF)	+= gprof_syms.o
 obj-$(CONFIG_OF) += dtb.o
 obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
-obj-$(CONFIG_GENERIC_PCI_IOMAP) += ioport.o
 
 USER_OBJS := config.o
 
diff --git a/arch/um/kernel/ioport.c b/arch/um/kernel/ioport.c
deleted file mode 100644
index 7220615b3beb..000000000000
--- a/arch/um/kernel/ioport.c
+++ /dev/null
@@ -1,13 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2021 Intel Corporation
- * Author: Johannes Berg <johannes@sipsolutions.net>
- */
-#include <asm/iomap.h>
-#include <asm-generic/pci_iomap.h>
-
-void __iomem *__pci_ioport_map(struct pci_dev *dev, unsigned long port,
-			       unsigned int nr)
-{
-	return NULL;
-}
-- 
cgit v1.2.3


From fd054188999ff19746cc09f4e0f196a113964db9 Mon Sep 17 00:00:00 2001
From: Yongting Lin <linyongting@gmail.com>
Date: Tue, 27 May 2025 23:12:22 +0800
Subject: um: Fix tgkill compile error on old host OSes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tgkill is a quite old syscall since kernel 2.5.75, but unfortunately glibc
doesn't support it before 2.30. Thus some systems fail to compile the
latest UserMode Linux.

Here is the compile error I encountered when I tried to compile UML in
my system shipped with glibc-2.28.

    CALL    scripts/checksyscalls.sh
    CC      arch/um/os-Linux/sigio.o
  In file included from arch/um/os-Linux/sigio.c:17:
  arch/um/os-Linux/sigio.c: In function ‘write_sigio_thread’:
  arch/um/os-Linux/sigio.c:49:19: error: implicit declaration of function ‘tgkill’; did you mean ‘kill’? [-Werror=implicit-function-declaration]
     CATCH_EINTR(r = tgkill(pid, pid, SIGIO));
                     ^~~~~~
  ./arch/um/include/shared/os.h:21:48: note: in definition of macro ‘CATCH_EINTR’
  #define CATCH_EINTR(expr) while ((errno = 0, ((expr) < 0)) && (errno == EINTR))
                                                ^~~~
  cc1: some warnings being treated as errors

Fix it by Replacing glibc call with raw syscall.

Fixes: 33c9da5dfb18 ("um: Rewrite the sigio workaround based on epoll and tgkill")
Signed-off-by: Yongting Lin <linyongting@gmail.com>
Link: https://patch.msgid.link/20250527151222.40371-1-linyongting@gmail.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/os-Linux/sigio.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/um/os-Linux/sigio.c b/arch/um/os-Linux/sigio.c
index a05a6ecee756..6de145f8fe3d 100644
--- a/arch/um/os-Linux/sigio.c
+++ b/arch/um/os-Linux/sigio.c
@@ -12,6 +12,7 @@
 #include <signal.h>
 #include <string.h>
 #include <sys/epoll.h>
+#include <asm/unistd.h>
 #include <kern_util.h>
 #include <init.h>
 #include <os.h>
@@ -46,7 +47,7 @@ static void *write_sigio_thread(void *unused)
 			       __func__, errno);
 		}
 
-		CATCH_EINTR(r = tgkill(pid, pid, SIGIO));
+		CATCH_EINTR(r = syscall(__NR_tgkill, pid, pid, SIGIO));
 		if (r < 0)
 			printk(UM_KERN_ERR "%s: tgkill failed, errno = %d\n",
 			       __func__, errno);
-- 
cgit v1.2.3


From 10eabeca45fdfa48e8efd2c7fc0fd97314b65266 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Mon, 5 May 2025 10:33:59 +0200
Subject: um: chan_kern: use raw spinlock for irqs_to_free_lock

Since this is called deep in the ARCH=um IRQ infrastructure
it must use a raw spinlock. It's not really part of the
driver, but rather the core UML IRQ code.

Link: https://patch.msgid.link/20250505103358.ae7dc659f8b4.I64ca7aece30e0b4b0b5b35ad89cdd63db197c0ce@changeid
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/chan_kern.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index e78a99816c86..26442db7d608 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -212,7 +212,7 @@ int enable_chan(struct line *line)
  * be permanently disabled.  This is discovered in IRQ context, but
  * the freeing of the IRQ must be done later.
  */
-static DEFINE_SPINLOCK(irqs_to_free_lock);
+static DEFINE_RAW_SPINLOCK(irqs_to_free_lock);
 static LIST_HEAD(irqs_to_free);
 
 void free_irqs(void)
@@ -222,9 +222,9 @@ void free_irqs(void)
 	struct list_head *ele;
 	unsigned long flags;
 
-	spin_lock_irqsave(&irqs_to_free_lock, flags);
+	raw_spin_lock_irqsave(&irqs_to_free_lock, flags);
 	list_splice_init(&irqs_to_free, &list);
-	spin_unlock_irqrestore(&irqs_to_free_lock, flags);
+	raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags);
 
 	list_for_each(ele, &list) {
 		chan = list_entry(ele, struct chan, free_list);
@@ -246,9 +246,9 @@ static void close_one_chan(struct chan *chan, int delay_free_irq)
 		return;
 
 	if (delay_free_irq) {
-		spin_lock_irqsave(&irqs_to_free_lock, flags);
+		raw_spin_lock_irqsave(&irqs_to_free_lock, flags);
 		list_add(&chan->free_list, &irqs_to_free);
-		spin_unlock_irqrestore(&irqs_to_free_lock, flags);
+		raw_spin_unlock_irqrestore(&irqs_to_free_lock, flags);
 	} else {
 		if (chan->input && chan->enabled)
 			um_free_irq(chan->line->read_irq, chan);
-- 
cgit v1.2.3


From 477c1c21da0d6ffc4dfe924178b90377f2938ba5 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Tue, 6 May 2025 12:51:16 +0800
Subject: um: vector: Clean up and modernize log messages

Use pr_*() and netdev_*() to print log messages. While at it,
join split messages for easier grepping.

Suggested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250506045117.1896661-2-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/vector_kern.c | 41 ++++++++++++++++-------------------------
 1 file changed, 16 insertions(+), 25 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index 0d5c96897fa0..7822421057ea 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -8,6 +8,8 @@
  * Copyright (C) 2001 by various other people who didn't put their name here.
  */
 
+#define pr_fmt(fmt) "uml-vector: " fmt
+
 #include <linux/memblock.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
@@ -1551,41 +1553,33 @@ static void vector_setup_etheraddr(struct net_device *dev, char *str)
 		addr[i] = simple_strtoul(str, &end, 16);
 		if ((end == str) ||
 		   ((*end != ':') && (*end != ',') && (*end != '\0'))) {
-			printk(KERN_ERR
-			       "setup_etheraddr: failed to parse '%s' "
-			       "as an ethernet address\n", str);
+			netdev_err(dev,
+				"Failed to parse '%s' as an ethernet address\n", str);
 			goto random;
 		}
 		str = end + 1;
 	}
 	if (is_multicast_ether_addr(addr)) {
-		printk(KERN_ERR
-		       "Attempt to assign a multicast ethernet address to a "
-		       "device disallowed\n");
+		netdev_err(dev,
+			"Attempt to assign a multicast ethernet address to a device disallowed\n");
 		goto random;
 	}
 	if (!is_valid_ether_addr(addr)) {
-		printk(KERN_ERR
-		       "Attempt to assign an invalid ethernet address to a "
-		       "device disallowed\n");
+		netdev_err(dev,
+			"Attempt to assign an invalid ethernet address to a device disallowed\n");
 		goto random;
 	}
 	if (!is_local_ether_addr(addr)) {
-		printk(KERN_WARNING
-		       "Warning: Assigning a globally valid ethernet "
-		       "address to a device\n");
-		printk(KERN_WARNING "You should set the 2nd rightmost bit in "
-		       "the first byte of the MAC,\n");
-		printk(KERN_WARNING "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n",
-		       addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4],
-		       addr[5]);
+		netdev_warn(dev, "Warning: Assigning a globally valid ethernet address to a device\n");
+		netdev_warn(dev, "You should set the 2nd rightmost bit in the first byte of the MAC,\n");
+		netdev_warn(dev, "i.e. %02x:%02x:%02x:%02x:%02x:%02x\n",
+			addr[0] | 0x02, addr[1], addr[2], addr[3], addr[4], addr[5]);
 	}
 	eth_hw_addr_set(dev, addr);
 	return;
 
 random:
-	printk(KERN_INFO
-	       "Choosing a random ethernet address for device %s\n", dev->name);
+	netdev_info(dev, "Choosing a random ethernet address\n");
 	eth_hw_addr_random(dev);
 }
 
@@ -1601,14 +1595,12 @@ static void vector_eth_configure(
 
 	device = kzalloc(sizeof(*device), GFP_KERNEL);
 	if (device == NULL) {
-		printk(KERN_ERR "eth_configure failed to allocate struct "
-				 "vector_device\n");
+		pr_err("Failed to allocate struct vector_device for vec%d\n", n);
 		return;
 	}
 	dev = alloc_etherdev(sizeof(struct vector_private));
 	if (dev == NULL) {
-		printk(KERN_ERR "eth_configure: failed to allocate struct "
-				 "net_device for vec%d\n", n);
+		pr_err("Failed to allocate struct net_device for vec%d\n", n);
 		goto out_free_device;
 	}
 
@@ -1738,8 +1730,7 @@ static int __init vector_setup(char *str)
 
 	err = vector_parse(str, &n, &str, &error);
 	if (err) {
-		printk(KERN_ERR "vector_setup - Couldn't parse '%s' : %s\n",
-				 str, error);
+		pr_err("Couldn't parse '%s': %s\n", str, error);
 		return 1;
 	}
 	new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES);
-- 
cgit v1.2.3


From b76d18b53a3f52880452a58cc982910abcccd111 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Tue, 6 May 2025 12:51:17 +0800
Subject: um: vector: Use mac_pton() for MAC address parsing

Use mac_pton() instead of custom approach.

Suggested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250506045117.1896661-3-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/vector_kern.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index 7822421057ea..5226d2c52e6a 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -1543,21 +1543,14 @@ static void vector_timer_expire(struct timer_list *t)
 static void vector_setup_etheraddr(struct net_device *dev, char *str)
 {
 	u8 addr[ETH_ALEN];
-	char *end;
-	int i;
 
 	if (str == NULL)
 		goto random;
 
-	for (i = 0; i < 6; i++) {
-		addr[i] = simple_strtoul(str, &end, 16);
-		if ((end == str) ||
-		   ((*end != ':') && (*end != ',') && (*end != '\0'))) {
-			netdev_err(dev,
-				"Failed to parse '%s' as an ethernet address\n", str);
-			goto random;
-		}
-		str = end + 1;
+	if (!mac_pton(str, addr)) {
+		netdev_err(dev,
+			"Failed to parse '%s' as an ethernet address\n", str);
+		goto random;
 	}
 	if (is_multicast_ether_addr(addr)) {
 		netdev_err(dev,
-- 
cgit v1.2.3


From e21560b7d33c4f692cc9cd5b75ff09024f5a69d2 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Thu, 29 May 2025 09:35:08 +0200
Subject: arm64: Disable LLD linker ASSERT()s for the time being

It turns out [1] that the way LLD handles ASSERT()s in the linker script
can result in spurious failures, so disable them for the newly
introduced BSS symbol export checks. Since we're not aware of any issues
with the existing assertions in vmlinux.lds.S, leave those alone for now
so that they can continue to provide useful coverage.

A linker fix [2] is due to land in version 21 of LLD.

Link: https://lore.kernel.org/r/202505261019.OUlitN6m-lkp@intel.com [1]
Link: https://github.com/llvm/llvm-project/commit/5859863bab7f [2]
Link: https://github.com/ClangBuiltLinux/linux/issues/2094
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202505261019.OUlitN6m-lkp@intel.com/
Link: https://lore.kernel.org/r/20250529073507.2984959-2-ardb+git@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/image-vars.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index c5266430284b..86f088a16147 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -10,6 +10,10 @@
 #error This file should only be included in vmlinux.lds.S
 #endif
 
+#if defined(CONFIG_LD_IS_LLD) && CONFIG_LLD_VERSION < 210000
+#define ASSERT(...)
+#endif
+
 #define PI_EXPORT_SYM(sym)		\
 	__PI_EXPORT_SYM(sym, __pi_ ## sym, Cannot export BSS symbol sym to startup code)
 #define __PI_EXPORT_SYM(sym, pisym, msg)\
@@ -142,4 +146,6 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
 _kernel_codesize = ABSOLUTE(__inittext_end - _text);
 #endif
 
+#undef ASSERT
+
 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
-- 
cgit v1.2.3


From dc0a083948040ff364d065da8bb50c29f77a39ad Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Sat, 31 May 2025 14:30:05 +0200
Subject: arm64: Work around convergence issue with LLD linker

LLD will occasionally error out with a '__init_end does not converge'
error if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this
results in a circular dependency.

Counter this by dimensioning the initial IDMAP page tables based on a
new boundary marker 'kimage_limit', and define it such that its value
should not change as a result of the initdata segment being pushed over
a 64k segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided
that its value doesn't change by more than 2M between linker passes.

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20250531123005.3866382-2-ardb+git@google.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kernel-pgtable.h |  2 +-
 arch/arm64/kernel/image-vars.h          | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 9e93733523f6..74a4f738c5f5 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -58,7 +58,7 @@
 #define INIT_DIR_SIZE (PAGE_SIZE * (EARLY_PAGES(SWAPPER_PGTABLE_LEVELS, KIMAGE_VADDR, _end, EXTRA_PAGE) \
 				    + EARLY_SEGMENT_EXTRA_PAGES))
 
-#define INIT_IDMAP_DIR_PAGES	(EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, _end, 1))
+#define INIT_IDMAP_DIR_PAGES	(EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, KIMAGE_VADDR, kimage_limit, 1))
 #define INIT_IDMAP_DIR_SIZE	((INIT_IDMAP_DIR_PAGES + EARLY_IDMAP_EXTRA_PAGES) * PAGE_SIZE)
 
 #define INIT_IDMAP_FDT_PAGES	(EARLY_PAGES(INIT_IDMAP_PGTABLE_LEVELS, 0UL, UL(MAX_FDT_SIZE), 1) - 1)
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h
index 86f088a16147..1cee48feb309 100644
--- a/arch/arm64/kernel/image-vars.h
+++ b/arch/arm64/kernel/image-vars.h
@@ -146,6 +146,17 @@ KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
 _kernel_codesize = ABSOLUTE(__inittext_end - _text);
 #endif
 
+/*
+ * LLD will occasionally error out with a '__init_end does not converge' error
+ * if INIT_IDMAP_DIR_SIZE is defined in terms of _end, as this results in a
+ * circular dependency. Counter this by dimensioning the initial IDMAP page
+ * tables based on kimage_limit, which is defined such that its value should
+ * not change as a result of the initdata segment being pushed over a 64k
+ * segment boundary due to changes in INIT_IDMAP_DIR_SIZE, provided that its
+ * value doesn't change by more than 2M between linker passes.
+ */
+kimage_limit = ALIGN(ABSOLUTE(_end + SZ_64K), SZ_2M);
+
 #undef ASSERT
 
 #endif /* __ARM64_KERNEL_IMAGE_VARS_H */
-- 
cgit v1.2.3


From 4b634918384c0f84c33aeb4dd9fd4c38e7be5ccb Mon Sep 17 00:00:00 2001
From: Ryan Roberts <ryan.roberts@arm.com>
Date: Fri, 30 May 2025 16:23:47 +0100
Subject: arm64/mm: Close theoretical race where stale TLB entry remains valid

Commit 3ea277194daa ("mm, mprotect: flush TLB if potentially racing with
a parallel reclaim leaving stale TLB entries") describes a race that,
prior to the commit, could occur between reclaim and operations such as
mprotect() when using reclaim's tlbbatch mechanism. See that commit for
details but the summary is:

"""
Nadav Amit identified a theoritical race between page reclaim and
mprotect due to TLB flushes being batched outside of the PTL being held.

He described the race as follows:

	CPU0				CPU1
	----				----
					user accesses memory using RW PTE
					[PTE now cached in TLB]
	try_to_unmap_one()
	==> ptep_get_and_clear()
	==> set_tlb_ubc_flush_pending()
					mprotect(addr, PROT_READ)
					==> change_pte_range()
					==> [ PTE non-present - no flush ]

					user writes using cached RW PTE
	...

	try_to_unmap_flush()
"""

The solution was to insert flush_tlb_batched_pending() in mprotect() and
friends to explcitly drain any pending reclaim TLB flushes. In the
modern version of this solution, arch_flush_tlb_batched_pending() is
called to do that synchronisation.

arm64's tlbbatch implementation simply issues TLBIs at queue-time
(arch_tlbbatch_add_pending()), eliding the trailing dsb(ish). The
trailing dsb(ish) is finally issued in arch_tlbbatch_flush() at the end
of the batch to wait for all the issued TLBIs to complete.

Now, the Arm ARM states:

"""
The completion of the TLB maintenance instruction is guaranteed only by
the execution of a DSB by the observer that performed the TLB
maintenance instruction. The execution of a DSB by a different observer
does not have this effect, even if the DSB is known to be executed after
the TLB maintenance instruction is observed by that different observer.
"""

arch_tlbbatch_add_pending() and arch_tlbbatch_flush() conform to this
requirement because they are called from the same task (either kswapd or
caller of madvise(MADV_PAGEOUT)), so either they are on the same CPU or
if the task was migrated, __switch_to() contains an extra dsb(ish).

HOWEVER, arm64's arch_flush_tlb_batched_pending() is also implemented as
a dsb(ish). But this may be running on a CPU remote from the one that
issued the outstanding TLBIs. So there is no architectural gurantee of
synchonization. Therefore we are still vulnerable to the theoretical
race described in Commit 3ea277194daa ("mm, mprotect: flush TLB if
potentially racing with a parallel reclaim leaving stale TLB entries").

Fix this by flushing the entire mm in arch_flush_tlb_batched_pending().
This aligns with what the other arches that implement the tlbbatch
feature do.

Cc: <stable@vger.kernel.org>
Fixes: 43b3dfdd0455 ("arm64: support batched/deferred tlb shootdown during page reclamation/migration")
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Link: https://lore.kernel.org/r/20250530152445.2430295-1-ryan.roberts@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/tlbflush.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index eba1a98657f1..aa9efee17277 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -323,13 +323,14 @@ static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
 }
 
 /*
- * If mprotect/munmap/etc occurs during TLB batched flushing, we need to
- * synchronise all the TLBI issued with a DSB to avoid the race mentioned in
- * flush_tlb_batched_pending().
+ * If mprotect/munmap/etc occurs during TLB batched flushing, we need to ensure
+ * all the previously issued TLBIs targeting mm have completed. But since we
+ * can be executing on a remote CPU, a DSB cannot guarantee this like it can
+ * for arch_tlbbatch_flush(). Our only option is to flush the entire mm.
  */
 static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
 {
-	dsb(ish);
+	flush_tlb_mm(mm);
 }
 
 /*
-- 
cgit v1.2.3


From 10f885d63a0efd50b0d22bf27eb3cf727838e99e Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Mon, 2 Jun 2025 12:33:21 +0800
Subject: arm64: Add override for MPAM

As the message of the commit 09e6b306f3ba ("arm64: cpufeature: discover
CPU support for MPAM") already states, if a buggy firmware fails to
either enable MPAM or emulate the trap as if it were disabled, the
kernel will just fail to boot.  While upgrading the firmware should be
the best solution, we have some hardware of which the vendor have made
no response 2 months after we requested a firmware update.  Allow
overriding it so our devices don't become some e-waste.

Cc: James Morse <james.morse@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Will Deacon <will@kernel.org>
Cc: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
Cc: Mingcong Bai <jeffbai@aosc.io>
Cc: Shaopeng Tan <tan.shaopeng@fujitsu.com>
Cc: Ben Horgan <ben.horgan@arm.com>
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250602043723.216338-1-xry111@xry111.site
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/el2_setup.h    | 24 ++++++++++--------------
 arch/arm64/kernel/cpufeature.c        |  7 +++++--
 arch/arm64/kernel/cpuinfo.c           |  7 +++++--
 arch/arm64/kernel/pi/idreg-override.c |  3 +++
 4 files changed, 23 insertions(+), 18 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h
index 30f57b0334a3..af7807f11df4 100644
--- a/arch/arm64/include/asm/el2_setup.h
+++ b/arch/arm64/include/asm/el2_setup.h
@@ -298,19 +298,6 @@
 .Lskip_gcs_\@:
 .endm
 
-.macro __init_el2_mpam
-	/* Memory Partitioning And Monitoring: disable EL2 traps */
-	mrs	x1, id_aa64pfr0_el1
-	ubfx	x0, x1, #ID_AA64PFR0_EL1_MPAM_SHIFT, #4
-	cbz	x0, .Lskip_mpam_\@		// skip if no MPAM
-	msr_s	SYS_MPAM2_EL2, xzr		// use the default partition
-						// and disable lower traps
-	mrs_s	x0, SYS_MPAMIDR_EL1
-	tbz	x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@	// skip if no MPAMHCR reg
-	msr_s	SYS_MPAMHCR_EL2, xzr		// clear TRAP_MPAMIDR_EL1 -> EL2
-.Lskip_mpam_\@:
-.endm
-
 /**
  * Initialize EL2 registers to sane values. This should be called early on all
  * cores that were booted in EL2. Note that everything gets initialised as
@@ -328,7 +315,6 @@
 	__init_el2_stage2
 	__init_el2_gicv3
 	__init_el2_hstr
-	__init_el2_mpam
 	__init_el2_nvhe_idregs
 	__init_el2_cptr
 	__init_el2_fgt
@@ -375,6 +361,16 @@
 #endif
 
 .macro finalise_el2_state
+	check_override id_aa64pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT, .Linit_mpam_\@, .Lskip_mpam_\@, x1, x2
+
+.Linit_mpam_\@:
+	msr_s	SYS_MPAM2_EL2, xzr		// use the default partition
+						// and disable lower traps
+	mrs_s	x0, SYS_MPAMIDR_EL1
+	tbz	x0, #MPAMIDR_EL1_HAS_HCR_SHIFT, .Lskip_mpam_\@  // skip if no MPAMHCR reg
+	msr_s   SYS_MPAMHCR_EL2, xzr		// clear TRAP_MPAMIDR_EL1 -> EL2
+
+.Lskip_mpam_\@:
 	check_override id_aa64pfr0, ID_AA64PFR0_EL1_SVE_SHIFT, .Linit_sve_\@, .Lskip_sve_\@, x1, x2
 
 .Linit_sve_\@:	/* SVE register access */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 379c82d22c75..fcc20d13e938 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1198,8 +1198,10 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 		cpacr_restore(cpacr);
 	}
 
-	if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0))
+	if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
+		info->reg_mpamidr = read_cpuid(MPAMIDR_EL1);
 		init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr);
+	}
 
 	if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
 		init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid);
@@ -1452,7 +1454,8 @@ void update_cpu_features(int cpu,
 		cpacr_restore(cpacr);
 	}
 
-	if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) {
+	if (id_aa64pfr0_mpam(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
+		info->reg_mpamidr = read_cpuid(MPAMIDR_EL1);
 		taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu,
 					info->reg_mpamidr, boot->reg_mpamidr);
 	}
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 94525abd1c22..c1f2b6b04b41 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -496,8 +496,11 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
 		__cpuinfo_store_cpu_32bit(&info->aarch32);
 
-	if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0))
-		info->reg_mpamidr = read_cpuid(MPAMIDR_EL1);
+	/*
+	 * info->reg_mpamidr deferred to {init,update}_cpu_features because we
+	 * don't want to read it (and trigger a trap on buggy firmware) if
+	 * using an aa64pfr0_el1 override to unconditionally disable MPAM.
+	 */
 
 	if (IS_ENABLED(CONFIG_ARM64_SME) &&
 	    id_aa64pfr1_sme(info->reg_id_aa64pfr1)) {
diff --git a/arch/arm64/kernel/pi/idreg-override.c b/arch/arm64/kernel/pi/idreg-override.c
index c6b185b885f7..bc57b290e5e7 100644
--- a/arch/arm64/kernel/pi/idreg-override.c
+++ b/arch/arm64/kernel/pi/idreg-override.c
@@ -127,6 +127,7 @@ static const struct ftr_set_desc pfr0 __prel64_initconst = {
 	.fields		= {
 	        FIELD("sve", ID_AA64PFR0_EL1_SVE_SHIFT, pfr0_sve_filter),
 		FIELD("el0", ID_AA64PFR0_EL1_EL0_SHIFT, NULL),
+		FIELD("mpam", ID_AA64PFR0_EL1_MPAM_SHIFT, NULL),
 		{}
 	},
 };
@@ -154,6 +155,7 @@ static const struct ftr_set_desc pfr1 __prel64_initconst = {
 		FIELD("gcs", ID_AA64PFR1_EL1_GCS_SHIFT, NULL),
 		FIELD("mte", ID_AA64PFR1_EL1_MTE_SHIFT, NULL),
 		FIELD("sme", ID_AA64PFR1_EL1_SME_SHIFT, pfr1_sme_filter),
+		FIELD("mpam_frac", ID_AA64PFR1_EL1_MPAM_frac_SHIFT, NULL),
 		{}
 	},
 };
@@ -246,6 +248,7 @@ static const struct {
 	{ "rodata=off",			"arm64_sw.rodataoff=1" },
 	{ "arm64.nolva",		"id_aa64mmfr2.varange=0" },
 	{ "arm64.no32bit_el0",		"id_aa64pfr0.el0=1" },
+	{ "arm64.nompam",		"id_aa64pfr0.mpam=0 id_aa64pfr1.mpam_frac=0" },
 };
 
 static int __init parse_hexdigit(const char *p, u64 *v)
-- 
cgit v1.2.3


From 247ed9e4a6869f3bf07bffd277a341a6833abdbc Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin@sipsolutions.net>
Date: Mon, 2 Jun 2025 15:00:46 +0200
Subject: um: Move faultinfo extraction into userspace routine

The segv handler is called slightly differently depending on whether
PTRACE_FULL_FAULTINFO is set or not (32bit vs. 64bit). The only
difference is that we don't try to pass the registers and instruction
pointer to the segv handler.

It would be good to either document or remove the difference, but I do
not know why this difference exists. And, passing NULL can even result
in a crash.

Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Link: https://patch.msgid.link/20250602130052.545733-2-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/os-Linux/skas/process.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index ae2aea062f06..97c2f964f5fc 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -163,12 +163,6 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi)
 	memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
 }
 
-static void handle_segv(int pid, struct uml_pt_regs *regs)
-{
-	get_skas_faultinfo(pid, &regs->faultinfo);
-	segv(regs->faultinfo, 0, 1, NULL, NULL);
-}
-
 static void handle_trap(int pid, struct uml_pt_regs *regs)
 {
 	if ((UPT_IP(regs) >= STUB_START) && (UPT_IP(regs) < STUB_END))
@@ -521,13 +515,14 @@ void userspace(struct uml_pt_regs *regs)
 
 			switch (sig) {
 			case SIGSEGV:
-				if (PTRACE_FULL_FAULTINFO) {
-					get_skas_faultinfo(pid,
-							   &regs->faultinfo);
+				get_skas_faultinfo(pid, &regs->faultinfo);
+
+				if (PTRACE_FULL_FAULTINFO)
 					(*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si,
 							     regs, NULL);
-				}
-				else handle_segv(pid, regs);
+				else
+					segv(regs->faultinfo, 0, 1, NULL, NULL);
+
 				break;
 			case SIGTRAP + 0x80:
 				handle_trap(pid, regs);
-- 
cgit v1.2.3


From dac494bf54f764a114f16621ef04f534dd754ac1 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin@sipsolutions.net>
Date: Mon, 2 Jun 2025 15:00:47 +0200
Subject: um: Add stub side of SECCOMP/futex based process handling

This adds the stub side for the new seccomp process management code. In
this case we do register save/restore through the signal handler
mcontext.

Add special code for handling TLS, which for x86_64 means setting the
FS_BASE/GS_BASE registers while for i386 it means calling the
set_thread_area syscall.

Co-authored-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-3-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/include/shared/common-offsets.h |  2 ++
 arch/um/include/shared/skas/stub-data.h | 14 ++++++++++
 arch/um/kernel/skas/stub.c              | 49 +++++++++++++++++++++++++++++++++
 arch/x86/um/shared/sysdep/stub-data.h   | 23 ++++++++++++++++
 arch/x86/um/shared/sysdep/stub.h        |  2 ++
 arch/x86/um/shared/sysdep/stub_32.h     | 13 +++++++++
 arch/x86/um/shared/sysdep/stub_64.h     | 17 ++++++++++++
 7 files changed, 120 insertions(+)
 create mode 100644 arch/x86/um/shared/sysdep/stub-data.h

(limited to 'arch')

diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h
index 73f3a4792ed8..93e7097a2922 100644
--- a/arch/um/include/shared/common-offsets.h
+++ b/arch/um/include/shared/common-offsets.h
@@ -14,3 +14,5 @@ DEFINE(UM_THREAD_SIZE, THREAD_SIZE);
 
 DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);
+
+DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES);
diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h
index 81a4cace032c..81ac2cd12112 100644
--- a/arch/um/include/shared/skas/stub-data.h
+++ b/arch/um/include/shared/skas/stub-data.h
@@ -11,6 +11,10 @@
 #include <linux/compiler_types.h>
 #include <as-layout.h>
 #include <sysdep/tls.h>
+#include <sysdep/stub-data.h>
+
+#define FUTEX_IN_CHILD 0
+#define FUTEX_IN_KERN 1
 
 struct stub_init_data {
 	unsigned long stub_start;
@@ -52,6 +56,16 @@ struct stub_data {
 	/* 128 leaves enough room for additional fields in the struct */
 	struct stub_syscall syscall_data[(UM_KERN_PAGE_SIZE - 128) / sizeof(struct stub_syscall)] __aligned(16);
 
+	/* data shared with signal handler (only used in seccomp mode) */
+	short restart_wait;
+	unsigned int futex;
+	int signal;
+	unsigned short si_offset;
+	unsigned short mctx_offset;
+
+	/* seccomp architecture specific state restore */
+	struct stub_data_arch arch_data;
+
 	/* Stack for our signal handlers and for calling into . */
 	unsigned char sigstack[UM_KERN_PAGE_SIZE] __aligned(UM_KERN_PAGE_SIZE);
 };
diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c
index 796fc266d3bb..9041f6b6e28b 100644
--- a/arch/um/kernel/skas/stub.c
+++ b/arch/um/kernel/skas/stub.c
@@ -5,6 +5,9 @@
 
 #include <sysdep/stub.h>
 
+#include <linux/futex.h>
+#include <errno.h>
+
 static __always_inline int syscall_handler(struct stub_data *d)
 {
 	int i;
@@ -57,3 +60,49 @@ stub_syscall_handler(void)
 
 	trap_myself();
 }
+
+void __section(".__syscall_stub")
+stub_signal_interrupt(int sig, siginfo_t *info, void *p)
+{
+	struct stub_data *d = get_stub_data();
+	ucontext_t *uc = p;
+	long res;
+
+	d->signal = sig;
+	d->si_offset = (unsigned long)info - (unsigned long)&d->sigstack[0];
+	d->mctx_offset = (unsigned long)&uc->uc_mcontext - (unsigned long)&d->sigstack[0];
+
+restart_wait:
+	d->futex = FUTEX_IN_KERN;
+	do {
+		res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
+				    FUTEX_WAKE, 1);
+	} while (res == -EINTR);
+	do {
+		res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
+				    FUTEX_WAIT, FUTEX_IN_KERN, 0);
+	} while (res == -EINTR || d->futex == FUTEX_IN_KERN);
+
+	if (res < 0 && res != -EAGAIN)
+		stub_syscall1(__NR_exit_group, 1);
+
+	/* Try running queued syscalls. */
+	if (syscall_handler(d) < 0 || d->restart_wait) {
+		/* Report SIGSYS if we restart. */
+		d->signal = SIGSYS;
+		d->restart_wait = 0;
+		goto restart_wait;
+	}
+
+	/* Restore arch dependent state that is not part of the mcontext */
+	stub_seccomp_restore_state(&d->arch_data);
+
+	/* Return so that the host modified mcontext is restored. */
+}
+
+void __section(".__syscall_stub")
+stub_signal_restorer(void)
+{
+	/* We must not have anything on the stack when doing rt_sigreturn */
+	stub_syscall0(__NR_rt_sigreturn);
+}
diff --git a/arch/x86/um/shared/sysdep/stub-data.h b/arch/x86/um/shared/sysdep/stub-data.h
new file mode 100644
index 000000000000..82b1b7f8ac3d
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/stub-data.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ARCH_STUB_DATA_H
+#define __ARCH_STUB_DATA_H
+
+#ifdef __i386__
+#include <generated/asm-offsets.h>
+#include <asm/ldt.h>
+
+struct stub_data_arch {
+	int sync;
+	struct user_desc tls[UM_KERN_GDT_ENTRY_TLS_ENTRIES];
+};
+#else
+#define STUB_SYNC_FS_BASE (1 << 0)
+#define STUB_SYNC_GS_BASE (1 << 1)
+struct stub_data_arch {
+	int sync;
+	unsigned long fs_base;
+	unsigned long gs_base;
+};
+#endif
+
+#endif /* __ARCH_STUB_DATA_H */
diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h
index dc89f4423454..4fa58f5b4fca 100644
--- a/arch/x86/um/shared/sysdep/stub.h
+++ b/arch/x86/um/shared/sysdep/stub.h
@@ -13,3 +13,5 @@
 
 extern void stub_segv_handler(int, siginfo_t *, void *);
 extern void stub_syscall_handler(void);
+extern void stub_signal_interrupt(int, siginfo_t *, void *);
+extern void stub_signal_restorer(void);
diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h
index 390988132c0a..df568fc3ceb4 100644
--- a/arch/x86/um/shared/sysdep/stub_32.h
+++ b/arch/x86/um/shared/sysdep/stub_32.h
@@ -131,4 +131,17 @@ static __always_inline void *get_stub_data(void)
 		"call *%%eax ;"						\
 		:: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE),	\
 		   "i" (&fn))
+
+static __always_inline void
+stub_seccomp_restore_state(struct stub_data_arch *arch)
+{
+	for (int i = 0; i < sizeof(arch->tls) / sizeof(arch->tls[0]); i++) {
+		if (arch->sync & (1 << i))
+			stub_syscall1(__NR_set_thread_area,
+				      (unsigned long) &arch->tls[i]);
+	}
+
+	arch->sync = 0;
+}
+
 #endif
diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h
index 294affbec742..9cfd31afa769 100644
--- a/arch/x86/um/shared/sysdep/stub_64.h
+++ b/arch/x86/um/shared/sysdep/stub_64.h
@@ -10,6 +10,7 @@
 #include <sysdep/ptrace_user.h>
 #include <generated/asm-offsets.h>
 #include <linux/stddef.h>
+#include <asm/prctl.h>
 
 #define STUB_MMAP_NR __NR_mmap
 #define MMAP_OFFSET(o) (o)
@@ -134,4 +135,20 @@ static __always_inline void *get_stub_data(void)
 		"call *%%rax ;"						\
 		:: "i" ((1 + STUB_DATA_PAGES) * UM_KERN_PAGE_SIZE),	\
 		   "i" (&fn))
+
+static __always_inline void
+stub_seccomp_restore_state(struct stub_data_arch *arch)
+{
+	/*
+	 * We could use _writefsbase_u64/_writegsbase_u64 if the host reports
+	 * support in the hwcaps (HWCAP2_FSGSBASE).
+	 */
+	if (arch->sync & STUB_SYNC_FS_BASE)
+		stub_syscall2(__NR_arch_prctl, ARCH_SET_FS, arch->fs_base);
+	if (arch->sync & STUB_SYNC_GS_BASE)
+		stub_syscall2(__NR_arch_prctl, ARCH_SET_GS, arch->gs_base);
+
+	arch->sync = 0;
+}
+
 #endif
-- 
cgit v1.2.3


From b1e1bd2e69430445021394536740352be1b41cd0 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin@sipsolutions.net>
Date: Mon, 2 Jun 2025 15:00:48 +0200
Subject: um: Add helper functions to get/set state for SECCOMP

When not using ptrace, we need to both save and restore registers
through the mcontext as provided by the host kernel to our signal
handlers.

Add corresponding functions to store the state to an mcontext and
helpers to access the mcontext of the subprocess through the stub data.

Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-4-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/x86/um/os-Linux/mcontext.c      | 218 ++++++++++++++++++++++++++++++++++-
 arch/x86/um/ptrace.c                 |  76 +++++++++---
 arch/x86/um/shared/sysdep/mcontext.h |   9 ++
 3 files changed, 283 insertions(+), 20 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
index 37decaa74761..e661fdc44db9 100644
--- a/arch/x86/um/os-Linux/mcontext.c
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -1,7 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <sys/ucontext.h>
 #define __FRAME_OFFSETS
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <sys/ucontext.h>
 #include <asm/ptrace.h>
+#include <asm/sigcontext.h>
 #include <sysdep/ptrace.h>
 #include <sysdep/mcontext.h>
 #include <arch.h>
@@ -18,6 +21,10 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
 	COPY2(UESP, ESP); /* sic */
 	COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX);
 	COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS);
+#undef COPY2
+#undef COPY
+#undef COPY_SEG
+#undef COPY_SEG_CPL3
 #else
 #define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y]
 #define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X]
@@ -29,6 +36,8 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc)
 	COPY2(EFLAGS, EFL);
 	COPY2(CS, CSGSFS);
 	regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48;
+#undef COPY2
+#undef COPY
 #endif
 }
 
@@ -42,3 +51,210 @@ void mc_set_rip(void *_mc, void *target)
 	mc->gregs[REG_RIP] = (unsigned long)target;
 #endif
 }
+
+/* Same thing, but the copy macros are turned around. */
+void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc, int single_stepping)
+{
+#ifdef __i386__
+#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X]
+#define COPY(X) mc->gregs[REG_##X] = regs->gp[X]
+#define COPY_SEG(X) mc->gregs[REG_##X] = regs->gp[X] & 0xffff;
+#define COPY_SEG_CPL3(X) mc->gregs[REG_##X] = (regs->gp[X] & 0xffff) | 3;
+	COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS);
+	COPY(EDI); COPY(ESI); COPY(EBP);
+	COPY2(UESP, ESP); /* sic */
+	COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX);
+	COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS);
+#else
+#define COPY2(X,Y) mc->gregs[REG_##Y] = regs->gp[X/sizeof(unsigned long)]
+#define COPY(X) mc->gregs[REG_##X] = regs->gp[X/sizeof(unsigned long)]
+	COPY(R8); COPY(R9); COPY(R10); COPY(R11);
+	COPY(R12); COPY(R13); COPY(R14); COPY(R15);
+	COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX);
+	COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP);
+	COPY(RIP);
+	COPY2(EFLAGS, EFL);
+	mc->gregs[REG_CSGSFS] = mc->gregs[REG_CSGSFS] & 0xffffffffffffl;
+	mc->gregs[REG_CSGSFS] |= (regs->gp[SS / sizeof(unsigned long)] & 0xffff) << 48;
+#endif
+
+	if (single_stepping)
+		mc->gregs[REG_EFL] |= X86_EFLAGS_TF;
+	else
+		mc->gregs[REG_EFL] &= ~X86_EFLAGS_TF;
+}
+
+#ifdef CONFIG_X86_32
+struct _xstate_64 {
+	struct _fpstate_64		fpstate;
+	struct _header			xstate_hdr;
+	struct _ymmh_state		ymmh;
+	/* New processor state extensions go here: */
+};
+
+/* Not quite the right structures as these contain more information */
+int um_i387_from_fxsr(struct _fpstate_32 *i387,
+		      const struct _fpstate_64 *fxsave);
+int um_fxsr_from_i387(struct _fpstate_64 *fxsave,
+		      const struct _fpstate_32 *from);
+#else
+#define _xstate_64 _xstate
+#endif
+
+static struct _fpstate *get_fpstate(struct stub_data *data,
+				    mcontext_t *mcontext,
+				    int *fp_size)
+{
+	struct _fpstate *res;
+
+	/* Assume floating point registers are on the same page */
+	res = (void *)(((unsigned long)mcontext->fpregs &
+			(UM_KERN_PAGE_SIZE - 1)) +
+		       (unsigned long)&data->sigstack[0]);
+
+	if ((void *)res + sizeof(struct _fpstate) >
+	    (void *)data->sigstack + sizeof(data->sigstack))
+		return NULL;
+
+	if (res->sw_reserved.magic1 != FP_XSTATE_MAGIC1) {
+		*fp_size = sizeof(struct _fpstate);
+	} else {
+		char *magic2_addr;
+
+		magic2_addr = (void *)res;
+		magic2_addr += res->sw_reserved.extended_size;
+		magic2_addr -= FP_XSTATE_MAGIC2_SIZE;
+
+		/* We still need to be within our stack */
+		if ((void *)magic2_addr >
+		    (void *)data->sigstack + sizeof(data->sigstack))
+			return NULL;
+
+		/* If we do not read MAGIC2, then we did something wrong */
+		if (*(__u32 *)magic2_addr != FP_XSTATE_MAGIC2)
+			return NULL;
+
+		/* Remove MAGIC2 from the size, we do not save/restore it */
+		*fp_size = res->sw_reserved.extended_size -
+			   FP_XSTATE_MAGIC2_SIZE;
+	}
+
+	return res;
+}
+
+int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+		   unsigned long *fp_size_out)
+{
+	mcontext_t *mcontext;
+	struct _fpstate *fpstate_stub;
+	struct _xstate_64 *xstate_stub;
+	int fp_size, xstate_size;
+
+	/* mctx_offset is verified by wait_stub_done_seccomp */
+	mcontext = (void *)&data->sigstack[data->mctx_offset];
+
+	get_regs_from_mc(regs, mcontext);
+
+	fpstate_stub = get_fpstate(data, mcontext, &fp_size);
+	if (!fpstate_stub)
+		return -EINVAL;
+
+#ifdef CONFIG_X86_32
+	xstate_stub = (void *)&fpstate_stub->_fxsr_env;
+	xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env);
+#else
+	xstate_stub = (void *)fpstate_stub;
+	xstate_size = fp_size;
+#endif
+
+	if (fp_size_out)
+		*fp_size_out = xstate_size;
+
+	if (xstate_size > host_fp_size)
+		return -ENOSPC;
+
+	memcpy(&regs->fp, xstate_stub, xstate_size);
+
+	/* We do not need to read the x86_64 FS_BASE/GS_BASE registers as
+	 * we do not permit userspace to set them directly.
+	 */
+
+#ifdef CONFIG_X86_32
+	/* Read the i387 legacy FP registers */
+	if (um_fxsr_from_i387((void *)&regs->fp, fpstate_stub))
+		return -EINVAL;
+#endif
+
+	return 0;
+}
+
+/* Copied because we cannot include regset.h here. */
+struct task_struct;
+struct user_regset;
+struct membuf {
+	void *p;
+	size_t left;
+};
+
+int fpregs_legacy_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      struct membuf to);
+
+int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+		   int single_stepping)
+{
+	mcontext_t *mcontext;
+	struct _fpstate *fpstate_stub;
+	struct _xstate_64 *xstate_stub;
+	int fp_size, xstate_size;
+
+	/* mctx_offset is verified by wait_stub_done_seccomp */
+	mcontext = (void *)&data->sigstack[data->mctx_offset];
+
+	if ((unsigned long)mcontext < (unsigned long)data->sigstack ||
+	    (unsigned long)mcontext >
+			(unsigned long) data->sigstack +
+			sizeof(data->sigstack) - sizeof(*mcontext))
+		return -EINVAL;
+
+	get_mc_from_regs(regs, mcontext, single_stepping);
+
+	fpstate_stub = get_fpstate(data, mcontext, &fp_size);
+	if (!fpstate_stub)
+		return -EINVAL;
+
+#ifdef CONFIG_X86_32
+	xstate_stub = (void *)&fpstate_stub->_fxsr_env;
+	xstate_size = fp_size - offsetof(struct _fpstate_32, _fxsr_env);
+#else
+	xstate_stub = (void *)fpstate_stub;
+	xstate_size = fp_size;
+#endif
+
+	memcpy(fpstate_stub, &regs->fp, fp_size);
+
+#ifdef __i386__
+	/*
+	 * On x86, the GDT entries are updated by arch_set_tls.
+	 */
+
+	/* Store the i387 legacy FP registers which the host will use */
+	if (um_i387_from_fxsr(fpstate_stub, (void *)&regs->fp))
+		return -EINVAL;
+#else
+	/*
+	 * On x86_64, we need to sync the FS_BASE/GS_BASE registers using the
+	 * arch specific data.
+	 */
+	if (data->arch_data.fs_base != regs->gp[FS_BASE / sizeof(unsigned long)]) {
+		data->arch_data.fs_base = regs->gp[FS_BASE / sizeof(unsigned long)];
+		data->arch_data.sync |= STUB_SYNC_FS_BASE;
+	}
+	if (data->arch_data.gs_base != regs->gp[GS_BASE / sizeof(unsigned long)]) {
+		data->arch_data.gs_base = regs->gp[GS_BASE / sizeof(unsigned long)];
+		data->arch_data.sync |= STUB_SYNC_GS_BASE;
+	}
+#endif
+
+	return 0;
+}
diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c
index 57c504fd5626..3275870330fe 100644
--- a/arch/x86/um/ptrace.c
+++ b/arch/x86/um/ptrace.c
@@ -25,7 +25,8 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
 	return tmp;
 }
 
-static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave)
+static inline unsigned long
+twd_fxsr_to_i387(const struct user_fxsr_struct *fxsave)
 {
 	struct _fpxreg *st = NULL;
 	unsigned long twd = (unsigned long) fxsave->twd;
@@ -69,12 +70,16 @@ static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave)
 	return ret;
 }
 
-/* Get/set the old 32bit i387 registers (pre-FPX) */
-static int fpregs_legacy_get(struct task_struct *target,
-			     const struct user_regset *regset,
-			     struct membuf to)
+/*
+ * Get/set the old 32bit i387 registers (pre-FPX)
+ *
+ * We provide simple wrappers for mcontext.c, they are only defined locally
+ * because mcontext.c is userspace facing and needs to a different definition
+ * of the structures.
+ */
+static int _um_i387_from_fxsr(struct membuf to,
+			      const struct user_fxsr_struct *fxsave)
 {
-	struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp;
 	int i;
 
 	membuf_store(&to, (unsigned long)fxsave->cwd | 0xffff0000ul);
@@ -91,23 +96,36 @@ static int fpregs_legacy_get(struct task_struct *target,
 	return 0;
 }
 
-static int fpregs_legacy_set(struct task_struct *target,
+int um_i387_from_fxsr(struct user_i387_struct *i387,
+		      const struct user_fxsr_struct *fxsave);
+
+int um_i387_from_fxsr(struct user_i387_struct *i387,
+		      const struct user_fxsr_struct *fxsave)
+{
+	struct membuf to = {
+		.p = i387,
+		.left = sizeof(*i387),
+	};
+
+	return _um_i387_from_fxsr(to, fxsave);
+}
+
+static int fpregs_legacy_get(struct task_struct *target,
 			     const struct user_regset *regset,
-			     unsigned int pos, unsigned int count,
-			     const void *kbuf, const void __user *ubuf)
+			     struct membuf to)
 {
 	struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp;
-	const struct user_i387_struct *from;
-	struct user_i387_struct buf;
-	int i;
 
-	if (ubuf) {
-		if (copy_from_user(&buf, ubuf, sizeof(buf)))
-			return -EFAULT;
-		from = &buf;
-	} else {
-		from = kbuf;
-	}
+	return _um_i387_from_fxsr(to, fxsave);
+}
+
+int um_fxsr_from_i387(struct user_fxsr_struct *fxsave,
+		      const struct user_i387_struct *from);
+
+int um_fxsr_from_i387(struct user_fxsr_struct *fxsave,
+		      const struct user_i387_struct *from)
+{
+	int i;
 
 	fxsave->cwd = (unsigned short)(from->cwd & 0xffff);
 	fxsave->swd = (unsigned short)(from->swd & 0xffff);
@@ -125,6 +143,26 @@ static int fpregs_legacy_set(struct task_struct *target,
 
 	return 0;
 }
+
+static int fpregs_legacy_set(struct task_struct *target,
+			     const struct user_regset *regset,
+			     unsigned int pos, unsigned int count,
+			     const void *kbuf, const void __user *ubuf)
+{
+	struct user_fxsr_struct *fxsave = (void *)target->thread.regs.regs.fp;
+	const struct user_i387_struct *from;
+	struct user_i387_struct buf;
+
+	if (ubuf) {
+		if (copy_from_user(&buf, ubuf, sizeof(buf)))
+			return -EFAULT;
+		from = &buf;
+	} else {
+		from = kbuf;
+	}
+
+	return um_fxsr_from_i387(fxsave, &buf);
+}
 #endif
 
 static int genregs_get(struct task_struct *target,
diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h
index b724c54da316..6fe490cc5b98 100644
--- a/arch/x86/um/shared/sysdep/mcontext.h
+++ b/arch/x86/um/shared/sysdep/mcontext.h
@@ -6,7 +6,16 @@
 #ifndef __SYS_SIGCONTEXT_X86_H
 #define __SYS_SIGCONTEXT_X86_H
 
+#include <stub-data.h>
+
 extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *);
+extern void get_mc_from_regs(struct uml_pt_regs *regs, mcontext_t *mc,
+			     int single_stepping);
+
+extern int get_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+			  unsigned long *fp_size_out);
+extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
+			  int single_stepping);
 
 #ifdef __i386__
 
-- 
cgit v1.2.3


From 8420e08fe3a594b6ffa07705ac270faa2ed452c5 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin@sipsolutions.net>
Date: Mon, 2 Jun 2025 15:00:49 +0200
Subject: um: Track userspace children dying in SECCOMP mode

When in seccomp mode, we would hang forever on the futex if a child has
died unexpectedly. In contrast, ptrace mode will notice it and kill the
corresponding thread when it fails to run it.

Fix this issue using a new IRQ that is fired after a SIGCHLD and keeping
an (internal) list of all MMs. In the IRQ handler, find the affected MM
and set its PID to -1 as well as the futex variable to FUTEX_IN_KERN.

This, together with futex returning -EINTR after the signal is
sufficient to implement a race-free detection of a child dying.

Note that this also enables IRQ handling while starting a userspace
process. This should be safe and SECCOMP requires the IRQ in case the
process does not come up properly.

Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-5-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/include/asm/irq.h           |  5 ++-
 arch/um/include/asm/mmu.h           |  3 ++
 arch/um/include/shared/irq_user.h   |  2 +
 arch/um/include/shared/os.h         |  1 +
 arch/um/include/shared/skas/mm_id.h |  2 +
 arch/um/include/shared/skas/skas.h  |  1 +
 arch/um/kernel/irq.c                |  6 +++
 arch/um/kernel/skas/mmu.c           | 82 ++++++++++++++++++++++++++++++++++---
 arch/um/os-Linux/process.c          | 31 ++++++++++++++
 arch/um/os-Linux/signal.c           | 19 ++++++++-
 arch/um/os-Linux/skas/process.c     |  1 +
 11 files changed, 145 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
index 749dfe8512e8..36dbedd1af48 100644
--- a/arch/um/include/asm/irq.h
+++ b/arch/um/include/asm/irq.h
@@ -13,17 +13,18 @@
 #define TELNETD_IRQ 		8
 #define XTERM_IRQ 		9
 #define RANDOM_IRQ 		10
+#define SIGCHLD_IRQ		11
 
 #ifdef CONFIG_UML_NET_VECTOR
 
-#define VECTOR_BASE_IRQ		(RANDOM_IRQ + 1)
+#define VECTOR_BASE_IRQ		(SIGCHLD_IRQ + 1)
 #define VECTOR_IRQ_SPACE	8
 
 #define UM_FIRST_DYN_IRQ (VECTOR_IRQ_SPACE + VECTOR_BASE_IRQ)
 
 #else
 
-#define UM_FIRST_DYN_IRQ (RANDOM_IRQ + 1)
+#define UM_FIRST_DYN_IRQ (SIGCHLD_IRQ + 1)
 
 #endif
 
diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
index a3eaca41ff61..4d0e4239f3cc 100644
--- a/arch/um/include/asm/mmu.h
+++ b/arch/um/include/asm/mmu.h
@@ -6,11 +6,14 @@
 #ifndef __ARCH_UM_MMU_H
 #define __ARCH_UM_MMU_H
 
+#include "linux/types.h"
 #include <mm_id.h>
 
 typedef struct mm_context {
 	struct mm_id id;
 
+	struct list_head list;
+
 	/* Address range in need of a TLB sync */
 	unsigned long sync_tlb_range_from;
 	unsigned long sync_tlb_range_to;
diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h
index 88835b52ae2b..746abc24a5d5 100644
--- a/arch/um/include/shared/irq_user.h
+++ b/arch/um/include/shared/irq_user.h
@@ -17,6 +17,8 @@ enum um_irq_type {
 struct siginfo;
 extern void sigio_handler(int sig, struct siginfo *unused_si,
 			  struct uml_pt_regs *regs, void *mc);
+extern void sigchld_handler(int sig, struct siginfo *unused_si,
+			   struct uml_pt_regs *regs, void *mc);
 void sigio_run_timetravel_handlers(void);
 extern void free_irq_by_fd(int fd);
 extern void deactivate_fd(int fd, int irqnum);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index e63a74a5ff19..3046728ec42e 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -197,6 +197,7 @@ extern int create_mem_file(unsigned long long len);
 extern void report_enomem(void);
 
 /* process.c */
+pid_t os_reap_child(void);
 extern void os_alarm_process(int pid);
 extern void os_kill_process(int pid, int reap_child);
 extern void os_kill_ptraced_process(int pid, int reap_child);
diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h
index 140388c282f6..0654c57bb28e 100644
--- a/arch/um/include/shared/skas/mm_id.h
+++ b/arch/um/include/shared/skas/mm_id.h
@@ -14,4 +14,6 @@ struct mm_id {
 
 void __switch_mm(struct mm_id *mm_idp);
 
+void notify_mm_kill(int pid);
+
 #endif
diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h
index 85c50122ab98..7d1de4cab551 100644
--- a/arch/um/include/shared/skas/skas.h
+++ b/arch/um/include/shared/skas/skas.h
@@ -8,6 +8,7 @@
 
 #include <sysdep/ptrace.h>
 
+extern int using_seccomp;
 extern int userspace_pid[];
 
 extern void new_thread_handler(void);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index abe8f30a521c..f1787be3983c 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -690,3 +690,9 @@ void __init init_IRQ(void)
 	/* Initialize EPOLL Loop */
 	os_setup_epoll();
 }
+
+extern void sigchld_handler(int sig, struct siginfo *unused_si,
+			    struct uml_pt_regs *regs, void *mc)
+{
+	do_IRQ(SIGCHLD_IRQ, regs);
+}
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 0eb5a1d3ba70..1e146a0f9549 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -8,6 +8,7 @@
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
 
+#include <shared/irq_kern.h>
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
 #include <asm/mmu_context.h>
@@ -19,6 +20,9 @@
 /* Ensure the stub_data struct covers the allocated area */
 static_assert(sizeof(struct stub_data) == STUB_DATA_PAGES * UM_KERN_PAGE_SIZE);
 
+spinlock_t mm_list_lock;
+struct list_head mm_list;
+
 int init_new_context(struct task_struct *task, struct mm_struct *mm)
 {
 	struct mm_id *new_id = &mm->context.id;
@@ -31,10 +35,12 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 
 	new_id->stack = stack;
 
-	block_signals_trace();
-	new_id->pid = start_userspace(stack);
-	unblock_signals_trace();
+	scoped_guard(spinlock_irqsave, &mm_list_lock) {
+		/* Insert into list, used for lookups when the child dies */
+		list_add(&mm->context.list, &mm_list);
+	}
 
+	new_id->pid = start_userspace(stack);
 	if (new_id->pid < 0) {
 		ret = new_id->pid;
 		goto out_free;
@@ -60,13 +66,79 @@ void destroy_context(struct mm_struct *mm)
 	 * zero, resulting in a kill(0), which will result in the
 	 * whole UML suddenly dying.  Also, cover negative and
 	 * 1 cases, since they shouldn't happen either.
+	 *
+	 * Negative cases happen if the child died unexpectedly.
 	 */
-	if (mmu->id.pid < 2) {
+	if (mmu->id.pid >= 0 && mmu->id.pid < 2) {
 		printk(KERN_ERR "corrupt mm_context - pid = %d\n",
 		       mmu->id.pid);
 		return;
 	}
-	os_kill_ptraced_process(mmu->id.pid, 1);
+
+	if (mmu->id.pid > 0) {
+		os_kill_ptraced_process(mmu->id.pid, 1);
+		mmu->id.pid = -1;
+	}
 
 	free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
+
+	guard(spinlock_irqsave)(&mm_list_lock);
+
+	list_del(&mm->context.list);
+}
+
+static irqreturn_t mm_sigchld_irq(int irq, void* dev)
+{
+	struct mm_context *mm_context;
+	pid_t pid;
+
+	guard(spinlock)(&mm_list_lock);
+
+	while ((pid = os_reap_child()) > 0) {
+		/*
+		* A child died, check if we have an MM with the PID. This is
+		* only relevant in SECCOMP mode (as ptrace will fail anyway).
+		*
+		* See wait_stub_done_seccomp for more details.
+		*/
+		list_for_each_entry(mm_context, &mm_list, list) {
+			if (mm_context->id.pid == pid) {
+				struct stub_data *stub_data;
+				printk("Unexpectedly lost MM child! Affected tasks will segfault.");
+
+				/* Marks the MM as dead */
+				mm_context->id.pid = -1;
+
+				/*
+				 * NOTE: If SMP is implemented, a futex_wake
+				 * needs to be added here.
+				 */
+				stub_data = (void *)mm_context->id.stack;
+				stub_data->futex = FUTEX_IN_KERN;
+
+				/*
+				 * NOTE: Currently executing syscalls by
+				 * affected tasks may finish normally.
+				 */
+				break;
+			}
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int __init init_child_tracking(void)
+{
+	int err;
+
+	spin_lock_init(&mm_list_lock);
+	INIT_LIST_HEAD(&mm_list);
+
+	err = request_irq(SIGCHLD_IRQ, mm_sigchld_irq, 0, "SIGCHLD", NULL);
+	if (err < 0)
+		panic("Failed to register SIGCHLD IRQ: %d", err);
+
+	return 0;
 }
+early_initcall(init_child_tracking)
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 184566edeee9..00b49e90d05f 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -18,17 +18,29 @@
 #include <init.h>
 #include <longjmp.h>
 #include <os.h>
+#include <skas/skas.h>
 
 void os_alarm_process(int pid)
 {
+	if (pid <= 0)
+		return;
+
 	kill(pid, SIGALRM);
 }
 
 void os_kill_process(int pid, int reap_child)
 {
+	if (pid <= 0)
+		return;
+
+	/* Block signals until child is reaped */
+	block_signals();
+
 	kill(pid, SIGKILL);
 	if (reap_child)
 		CATCH_EINTR(waitpid(pid, NULL, __WALL));
+
+	unblock_signals();
 }
 
 /* Kill off a ptraced child by all means available.  kill it normally first,
@@ -38,11 +50,27 @@ void os_kill_process(int pid, int reap_child)
 
 void os_kill_ptraced_process(int pid, int reap_child)
 {
+	if (pid <= 0)
+		return;
+
+	/* Block signals until child is reaped */
+	block_signals();
+
 	kill(pid, SIGKILL);
 	ptrace(PTRACE_KILL, pid);
 	ptrace(PTRACE_CONT, pid);
 	if (reap_child)
 		CATCH_EINTR(waitpid(pid, NULL, __WALL));
+
+	unblock_signals();
+}
+
+pid_t os_reap_child(void)
+{
+	int status;
+
+	/* Try to reap a child */
+	return waitpid(-1, &status, WNOHANG);
 }
 
 /* Don't use the glibc version, which caches the result in TLS. It misses some
@@ -151,6 +179,9 @@ void init_new_thread_signals(void)
 	set_handler(SIGBUS);
 	signal(SIGHUP, SIG_IGN);
 	set_handler(SIGIO);
+	/* We (currently) only use the child reaper IRQ in seccomp mode */
+	if (using_seccomp)
+		set_handler(SIGCHLD);
 	signal(SIGWINCH, SIG_IGN);
 }
 
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index e71e5b4878d1..11f07f498270 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -29,6 +29,7 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) =
 	[SIGBUS]	= relay_signal,
 	[SIGSEGV]	= segv_handler,
 	[SIGIO]		= sigio_handler,
+	[SIGCHLD]	= sigchld_handler,
 };
 
 static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
@@ -44,7 +45,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 	}
 
 	/* enable signals if sig isn't IRQ signal */
-	if ((sig != SIGIO) && (sig != SIGWINCH))
+	if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGCHLD))
 		unblock_signals_trace();
 
 	(*sig_info[sig])(sig, si, &r, mc);
@@ -64,6 +65,9 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 #define SIGALRM_BIT 1
 #define SIGALRM_MASK (1 << SIGALRM_BIT)
 
+#define SIGCHLD_BIT 2
+#define SIGCHLD_MASK (1 << SIGCHLD_BIT)
+
 int signals_enabled;
 #if IS_ENABLED(CONFIG_UML_TIME_TRAVEL_SUPPORT)
 static int signals_blocked, signals_blocked_pending;
@@ -102,6 +106,11 @@ static void sig_handler(int sig, struct siginfo *si, mcontext_t *mc)
 		return;
 	}
 
+	if (!enabled && (sig == SIGCHLD)) {
+		signals_pending |= SIGCHLD_MASK;
+		return;
+	}
+
 	block_signals_trace();
 
 	sig_handler_common(sig, si, mc);
@@ -181,6 +190,8 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = {
 
 	[SIGIO] = sig_handler,
 	[SIGWINCH] = sig_handler,
+	/* SIGCHLD is only actually registered in seccomp mode. */
+	[SIGCHLD] = sig_handler,
 	[SIGALRM] = timer_alarm_handler,
 
 	[SIGUSR1] = sigusr1_handler,
@@ -309,6 +320,12 @@ void unblock_signals(void)
 		if (save_pending & SIGIO_MASK)
 			sig_handler_common(SIGIO, NULL, NULL);
 
+		if (save_pending & SIGCHLD_MASK) {
+			struct uml_pt_regs regs = {};
+
+			sigchld_handler(SIGCHLD, NULL, &regs, NULL);
+		}
+
 		/* Do not reenter the handler */
 
 		if ((save_pending & SIGALRM_MASK) && (!(signals_active & SIGALRM_MASK)))
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 97c2f964f5fc..150e4f6ba633 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -309,6 +309,7 @@ static int __init init_stub_exe_fd(void)
 }
 __initcall(init_stub_exe_fd);
 
+int using_seccomp;
 int userspace_pid[NR_CPUS];
 
 /**
-- 
cgit v1.2.3


From 406d17c6c370a33cfb54067d9e205305293d4604 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin@sipsolutions.net>
Date: Mon, 2 Jun 2025 15:00:50 +0200
Subject: um: Implement kernel side of SECCOMP based process handling

This adds the kernel side of the seccomp based process handling.

Co-authored-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-6-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/include/shared/common-offsets.h    |   2 +
 arch/um/include/shared/os.h                |   2 +-
 arch/um/include/shared/skas/stub-data.h    |   5 +-
 arch/um/kernel/skas/mmu.c                  |   6 +-
 arch/um/kernel/skas/stub_exe.c             | 141 ++++++++++-
 arch/um/os-Linux/internal.h                |   5 +-
 arch/um/os-Linux/skas/mem.c                |  37 +--
 arch/um/os-Linux/skas/process.c            | 374 +++++++++++++++++++++--------
 arch/x86/um/shared/sysdep/kernel-offsets.h |   2 +
 arch/x86/um/tls_32.c                       |  23 +-
 10 files changed, 459 insertions(+), 138 deletions(-)

(limited to 'arch')

diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h
index 93e7097a2922..8ca66a1918c3 100644
--- a/arch/um/include/shared/common-offsets.h
+++ b/arch/um/include/shared/common-offsets.h
@@ -16,3 +16,5 @@ DEFINE(UM_NSEC_PER_SEC, NSEC_PER_SEC);
 DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC);
 
 DEFINE(UM_KERN_GDT_ENTRY_TLS_ENTRIES, GDT_ENTRY_TLS_ENTRIES);
+
+DEFINE(UM_SECCOMP_ARCH_NATIVE, SECCOMP_ARCH_NATIVE);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 3046728ec42e..b35cc8ce333b 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -286,7 +286,7 @@ int unmap(struct mm_id *mm_idp, unsigned long addr, unsigned long len);
 
 /* skas/process.c */
 extern int is_skas_winch(int pid, int fd, void *data);
-extern int start_userspace(unsigned long stub_stack);
+extern int start_userspace(struct mm_id *mm_id);
 extern void userspace(struct uml_pt_regs *regs);
 extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void));
 extern void switch_threads(jmp_buf *me, jmp_buf *you);
diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h
index 81ac2cd12112..675f1a0a1390 100644
--- a/arch/um/include/shared/skas/stub-data.h
+++ b/arch/um/include/shared/skas/stub-data.h
@@ -17,6 +17,8 @@
 #define FUTEX_IN_KERN 1
 
 struct stub_init_data {
+	int seccomp;
+
 	unsigned long stub_start;
 
 	int stub_code_fd;
@@ -24,7 +26,8 @@ struct stub_init_data {
 	int stub_data_fd;
 	unsigned long stub_data_offset;
 
-	unsigned long segv_handler;
+	unsigned long signal_handler;
+	unsigned long signal_restorer;
 };
 
 #define STUB_NEXT_SYSCALL(s) \
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 1e146a0f9549..87a18ae4da19 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -40,11 +40,9 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 		list_add(&mm->context.list, &mm_list);
 	}
 
-	new_id->pid = start_userspace(stack);
-	if (new_id->pid < 0) {
-		ret = new_id->pid;
+	ret = start_userspace(new_id);
+	if (ret < 0)
 		goto out_free;
-	}
 
 	/* Ensure the new MM is clean and nothing unwanted is mapped */
 	unmap(new_id, 0, STUB_START);
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index 23c99b285e82..f40f2332b676 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -3,6 +3,9 @@
 #include <asm/unistd.h>
 #include <sysdep/stub.h>
 #include <stub-data.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <generated/asm-offsets.h>
 
 void _start(void);
 
@@ -25,8 +28,6 @@ noinline static void real_init(void)
 	} sa = {
 		/* Need to set SA_RESTORER (but the handler never returns) */
 		.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO | 0x04000000,
-		/* no need to mask any signals */
-		.sa_mask = 0,
 	};
 
 	/* set a nice name */
@@ -35,6 +36,9 @@ noinline static void real_init(void)
 	/* Make sure this process dies if the kernel dies */
 	stub_syscall2(__NR_prctl, PR_SET_PDEATHSIG, SIGKILL);
 
+	/* Needed in SECCOMP mode (and safe to do anyway) */
+	stub_syscall5(__NR_prctl, PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+
 	/* read information from STDIN and close it */
 	res = stub_syscall3(__NR_read, 0,
 			    (unsigned long)&init_data, sizeof(init_data));
@@ -63,18 +67,133 @@ noinline static void real_init(void)
 	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
 	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
 
-	/* register SIGSEGV handler */
-	sa.sa_handler_ = (void *) init_data.segv_handler;
-	res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned long)&sa, 0,
-			    sizeof(sa.sa_mask));
-	if (res != 0)
-		stub_syscall1(__NR_exit, 13);
+	/* register signal handlers */
+	sa.sa_handler_ = (void *) init_data.signal_handler;
+	sa.sa_restorer = (void *) init_data.signal_restorer;
+	if (!init_data.seccomp) {
+		/* In ptrace mode, the SIGSEGV handler never returns */
+		sa.sa_mask = 0;
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 13);
+	} else {
+		/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
+		sa.sa_mask = ~0ULL;
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 14);
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 15);
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 16);
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 17);
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGILL,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 18);
+
+		res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
+				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
+		if (res != 0)
+			stub_syscall1(__NR_exit, 19);
+	}
+
+	/*
+	 * If in seccomp mode, install the SECCOMP filter and trigger a syscall.
+	 * Otherwise set PTRACE_TRACEME and do a SIGSTOP.
+	 */
+	if (init_data.seccomp) {
+		struct sock_filter filter[] = {
+#if __BITS_PER_LONG > 32
+			/* [0] Load upper 32bit of instruction pointer from seccomp_data */
+			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+				 (offsetof(struct seccomp_data, instruction_pointer) + 4)),
+
+			/* [1] Jump forward 3 instructions if the upper address is not identical */
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) >> 32, 0, 3),
+#endif
+			/* [2] Load lower 32bit of instruction pointer from seccomp_data */
+			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+				 (offsetof(struct seccomp_data, instruction_pointer))),
+
+			/* [3] Mask out lower bits */
+			BPF_STMT(BPF_ALU | BPF_AND | BPF_K, 0xfffff000),
+
+			/* [4] Jump to [6] if the lower bits are not on the expected page */
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, (init_data.stub_start) & 0xfffff000, 1, 0),
+
+			/* [5] Trap call, allow */
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
+
+			/* [6,7] Check architecture */
+			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+				 offsetof(struct seccomp_data, arch)),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,
+				 UM_SECCOMP_ARCH_NATIVE, 1, 0),
+
+			/* [8] Kill (for architecture check) */
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+
+			/* [9] Load syscall number */
+			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+				 offsetof(struct seccomp_data, nr)),
+
+			/* [10-14] Check against permitted syscalls */
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
+				 5, 0),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
+				 4, 0),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
+				 3, 0),
+#ifdef __i386__
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
+				 2, 0),
+#else
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_arch_prctl,
+				 2, 0),
+#endif
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
+				 1, 0),
+
+			/* [15] Not one of the permitted syscalls */
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+
+			/* [16] Permitted call for the stub */
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+		};
+		struct sock_fprog prog = {
+			.len = sizeof(filter) / sizeof(filter[0]),
+			.filter = filter,
+		};
+
+		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
+				  SECCOMP_FILTER_FLAG_TSYNC,
+				  (unsigned long)&prog) != 0)
+			stub_syscall1(__NR_exit, 20);
 
-	stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
+		/* Fall through, the exit syscall will cause SIGSYS */
+	} else {
+		stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
 
-	stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
+		stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid), SIGSTOP);
+	}
 
-	stub_syscall1(__NR_exit, 14);
+	stub_syscall1(__NR_exit, 30);
 
 	__builtin_unreachable();
 }
diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h
index 317fca190c2b..5d8d3b0817a9 100644
--- a/arch/um/os-Linux/internal.h
+++ b/arch/um/os-Linux/internal.h
@@ -2,6 +2,9 @@
 #ifndef __UM_OS_LINUX_INTERNAL_H
 #define __UM_OS_LINUX_INTERNAL_H
 
+#include <mm_id.h>
+#include <stub-data.h>
+
 /*
  * elf_aux.c
  */
@@ -16,5 +19,5 @@ void check_tmpexec(void);
  * skas/process.c
  */
 void wait_stub_done(int pid);
-
+void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys);
 #endif /* __UM_OS_LINUX_INTERNAL_H */
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index d7f1814b0e5a..31bf3a52047a 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -80,27 +80,32 @@ static inline long do_syscall_stub(struct mm_id *mm_idp)
 	int n, i;
 	int err, pid = mm_idp->pid;
 
-	n = ptrace_setregs(pid, syscall_regs);
-	if (n < 0) {
-		printk(UM_KERN_ERR "Registers - \n");
-		for (i = 0; i < MAX_REG_NR; i++)
-			printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]);
-		panic("%s : PTRACE_SETREGS failed, errno = %d\n",
-		      __func__, -n);
-	}
-
 	/* Inform process how much we have filled in. */
 	proc_data->syscall_data_len = mm_idp->syscall_data_len;
 
-	err = ptrace(PTRACE_CONT, pid, 0, 0);
-	if (err)
-		panic("Failed to continue stub, pid = %d, errno = %d\n", pid,
-		      errno);
-
-	wait_stub_done(pid);
+	if (using_seccomp) {
+		proc_data->restart_wait = 1;
+		wait_stub_done_seccomp(mm_idp, 0, 1);
+	} else {
+		n = ptrace_setregs(pid, syscall_regs);
+		if (n < 0) {
+			printk(UM_KERN_ERR "Registers -\n");
+			for (i = 0; i < MAX_REG_NR; i++)
+				printk(UM_KERN_ERR "\t%d\t0x%lx\n", i, syscall_regs[i]);
+			panic("%s : PTRACE_SETREGS failed, errno = %d\n",
+			      __func__, -n);
+		}
+
+		err = ptrace(PTRACE_CONT, pid, 0, 0);
+		if (err)
+			panic("Failed to continue stub, pid = %d, errno = %d\n",
+			      pid, errno);
+
+		wait_stub_done(pid);
+	}
 
 	/*
-	 * proc_data->err will be non-zero if there was an (unexpected) error.
+	 * proc_data->err will be negative if there was an (unexpected) error.
 	 * In that case, syscall_data_len points to the last executed syscall,
 	 * otherwise it will be zero (but we do not need to rely on that).
 	 */
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 150e4f6ba633..e1aaa144e273 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
  * Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
  * Copyright (C) 2002- 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  */
@@ -25,8 +26,11 @@
 #include <registers.h>
 #include <skas.h>
 #include <sysdep/stub.h>
+#include <sysdep/mcontext.h>
+#include <linux/futex.h>
 #include <linux/threads.h>
 #include <timetravel.h>
+#include <asm-generic/rwonce.h>
 #include "../internal.h"
 
 int is_skas_winch(int pid, int fd, void *data)
@@ -142,6 +146,73 @@ bad_wait:
 	fatal_sigsegv();
 }
 
+void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
+{
+	struct stub_data *data = (void *)mm_idp->stack;
+	int ret;
+
+	do {
+		if (!running) {
+			data->signal = 0;
+			data->futex = FUTEX_IN_CHILD;
+			CATCH_EINTR(syscall(__NR_futex, &data->futex,
+					    FUTEX_WAKE, 1, NULL, NULL, 0));
+		}
+
+		do {
+			/*
+			 * We need to check whether the child is still alive
+			 * before and after the FUTEX_WAIT call. Before, in
+			 * case it just died but we still updated data->futex
+			 * to FUTEX_IN_CHILD. And after, in case it died while
+			 * we were waiting (and SIGCHLD woke us up, see the
+			 * IRQ handler in mmu.c).
+			 *
+			 * Either way, if PID is negative, then we have no
+			 * choice but to kill the task.
+			 */
+			if (__READ_ONCE(mm_idp->pid) < 0)
+				goto out_kill;
+
+			ret = syscall(__NR_futex, &data->futex,
+				      FUTEX_WAIT, FUTEX_IN_CHILD,
+				      NULL, NULL, 0);
+			if (ret < 0 && errno != EINTR && errno != EAGAIN) {
+				printk(UM_KERN_ERR "%s : FUTEX_WAIT failed, errno = %d\n",
+				       __func__, errno);
+				goto out_kill;
+			}
+		} while (data->futex == FUTEX_IN_CHILD);
+
+		if (__READ_ONCE(mm_idp->pid) < 0)
+			goto out_kill;
+
+		running = 0;
+
+		/* We may receive a SIGALRM before SIGSYS, iterate again. */
+	} while (wait_sigsys && data->signal == SIGALRM);
+
+	if (data->mctx_offset > sizeof(data->sigstack) - sizeof(mcontext_t)) {
+		printk(UM_KERN_ERR "%s : invalid mcontext offset", __func__);
+		goto out_kill;
+	}
+
+	if (wait_sigsys && data->signal != SIGSYS) {
+		printk(UM_KERN_ERR "%s : expected SIGSYS but got %d",
+		       __func__, data->signal);
+		goto out_kill;
+	}
+
+	return;
+
+out_kill:
+	printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n",
+	       __func__, mm_idp->pid, errno);
+	/* This is not true inside start_userspace */
+	if (current_mm_id() == mm_idp)
+		fatal_sigsegv();
+}
+
 extern unsigned long current_stub_stack(void);
 
 static void get_skas_faultinfo(int pid, struct faultinfo *fi)
@@ -185,14 +256,26 @@ static int userspace_tramp(void *stack)
 	int pipe_fds[2];
 	unsigned long long offset;
 	struct stub_init_data init_data = {
+		.seccomp = using_seccomp,
 		.stub_start = STUB_START,
-		.segv_handler = STUB_CODE +
-				(unsigned long) stub_segv_handler -
-				(unsigned long) __syscall_stub_start,
 	};
 	struct iomem_region *iomem;
 	int ret;
 
+	if (using_seccomp) {
+		init_data.signal_handler = STUB_CODE +
+					   (unsigned long) stub_signal_interrupt -
+					   (unsigned long) __syscall_stub_start;
+		init_data.signal_restorer = STUB_CODE +
+					   (unsigned long) stub_signal_restorer -
+					   (unsigned long) __syscall_stub_start;
+	} else {
+		init_data.signal_handler = STUB_CODE +
+					   (unsigned long) stub_segv_handler -
+					   (unsigned long) __syscall_stub_start;
+		init_data.signal_restorer = 0;
+	}
+
 	init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
 					      &offset);
 	init_data.stub_code_offset = MMAP_OFFSET(offset);
@@ -323,8 +406,9 @@ int userspace_pid[NR_CPUS];
  *         when negative: an error number.
  * FIXME: can PIDs become negative?!
  */
-int start_userspace(unsigned long stub_stack)
+int start_userspace(struct mm_id *mm_id)
 {
+	struct stub_data *proc_data = (void *)mm_id->stack;
 	void *stack;
 	unsigned long sp;
 	int pid, status, n, err;
@@ -343,10 +427,13 @@ int start_userspace(unsigned long stub_stack)
 	/* set stack pointer to the end of the stack page, so it can grow downwards */
 	sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
 
+	if (using_seccomp)
+		proc_data->futex = FUTEX_IN_CHILD;
+
 	/* clone into new userspace process */
 	pid = clone(userspace_tramp, (void *) sp,
 		    CLONE_VFORK | CLONE_VM | SIGCHLD,
-		    (void *)stub_stack);
+		    (void *)mm_id->stack);
 	if (pid < 0) {
 		err = -errno;
 		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
@@ -354,29 +441,34 @@ int start_userspace(unsigned long stub_stack)
 		return err;
 	}
 
-	do {
-		CATCH_EINTR(n = waitpid(pid, &status, WUNTRACED | __WALL));
-		if (n < 0) {
+	if (using_seccomp) {
+		wait_stub_done_seccomp(mm_id, 1, 1);
+	} else {
+		do {
+			CATCH_EINTR(n = waitpid(pid, &status,
+						WUNTRACED | __WALL));
+			if (n < 0) {
+				err = -errno;
+				printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
+				       __func__, errno);
+				goto out_kill;
+			}
+		} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
+
+		if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
+			err = -EINVAL;
+			printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
+			       __func__, status);
+			goto out_kill;
+		}
+
+		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
+			   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
 			err = -errno;
-			printk(UM_KERN_ERR "%s : wait failed, errno = %d\n",
+			printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
 			       __func__, errno);
 			goto out_kill;
 		}
-	} while (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGALRM));
-
-	if (!WIFSTOPPED(status) || (WSTOPSIG(status) != SIGSTOP)) {
-		err = -EINVAL;
-		printk(UM_KERN_ERR "%s : expected SIGSTOP, got status = %d\n",
-		       __func__, status);
-		goto out_kill;
-	}
-
-	if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
-		   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
-		       __func__, errno);
-		goto out_kill;
 	}
 
 	if (munmap(stack, UM_KERN_PAGE_SIZE) < 0) {
@@ -386,6 +478,8 @@ int start_userspace(unsigned long stub_stack)
 		goto out_kill;
 	}
 
+	mm_id->pid = pid;
+
 	return pid;
 
  out_kill:
@@ -399,7 +493,9 @@ extern unsigned long tt_extra_sched_jiffies;
 void userspace(struct uml_pt_regs *regs)
 {
 	int err, status, op, pid = userspace_pid[0];
-	siginfo_t si;
+	siginfo_t si_ptrace;
+	siginfo_t *si;
+	int sig;
 
 	/* Handle any immediate reschedules or signals */
 	interrupt_end();
@@ -432,104 +528,182 @@ void userspace(struct uml_pt_regs *regs)
 
 		current_mm_sync();
 
-		/* Flush out any pending syscalls */
-		err = syscall_stub_flush(current_mm_id());
-		if (err) {
-			if (err == -ENOMEM)
-				report_enomem();
+		if (using_seccomp) {
+			struct mm_id *mm_id = current_mm_id();
+			struct stub_data *proc_data = (void *) mm_id->stack;
+			int ret;
 
-			printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d",
-				__func__, -err);
-			fatal_sigsegv();
-		}
+			ret = set_stub_state(regs, proc_data, singlestepping());
+			if (ret) {
+				printk(UM_KERN_ERR "%s - failed to set regs: %d",
+				       __func__, ret);
+				fatal_sigsegv();
+			}
 
-		/*
-		 * This can legitimately fail if the process loads a
-		 * bogus value into a segment register.  It will
-		 * segfault and PTRACE_GETREGS will read that value
-		 * out of the process.  However, PTRACE_SETREGS will
-		 * fail.  In this case, there is nothing to do but
-		 * just kill the process.
-		 */
-		if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
-			printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
-			       __func__, errno);
-			fatal_sigsegv();
-		}
+			/* Must have been reset by the syscall caller */
+			if (proc_data->restart_wait != 0)
+				panic("Programming error: Flag to only run syscalls in child was not cleared!");
+
+			/* Mark pending syscalls for flushing */
+			proc_data->syscall_data_len = mm_id->syscall_data_len;
+			mm_id->syscall_data_len = 0;
+
+			proc_data->signal = 0;
+			proc_data->futex = FUTEX_IN_CHILD;
+			CATCH_EINTR(syscall(__NR_futex, &proc_data->futex,
+					    FUTEX_WAKE, 1, NULL, NULL, 0));
+			do {
+				ret = syscall(__NR_futex, &proc_data->futex,
+					      FUTEX_WAIT, FUTEX_IN_CHILD, NULL, NULL, 0);
+			} while ((ret == -1 && errno == EINTR) ||
+				 proc_data->futex == FUTEX_IN_CHILD);
+
+			sig = proc_data->signal;
+
+			if (sig == SIGTRAP && proc_data->err != 0) {
+				printk(UM_KERN_ERR "%s - Error flushing stub syscalls",
+				       __func__);
+				syscall_stub_dump_error(mm_id);
+				fatal_sigsegv();
+			}
 
-		if (put_fp_registers(pid, regs->fp)) {
-			printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
-			       __func__, errno);
-			fatal_sigsegv();
-		}
+			ret = get_stub_state(regs, proc_data, NULL);
+			if (ret) {
+				printk(UM_KERN_ERR "%s - failed to get regs: %d",
+				       __func__, ret);
+				fatal_sigsegv();
+			}
 
-		if (singlestepping())
-			op = PTRACE_SYSEMU_SINGLESTEP;
-		else
-			op = PTRACE_SYSEMU;
+			if (proc_data->si_offset > sizeof(proc_data->sigstack) - sizeof(*si))
+				panic("%s - Invalid siginfo offset from child",
+				      __func__);
+			si = (void *)&proc_data->sigstack[proc_data->si_offset];
 
-		if (ptrace(op, pid, 0, 0)) {
-			printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
-			       __func__, op, errno);
-			fatal_sigsegv();
-		}
+			regs->is_user = 1;
 
-		CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
-		if (err < 0) {
-			printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
-			       __func__, errno);
-			fatal_sigsegv();
-		}
+			/* Fill in ORIG_RAX and extract fault information */
+			PT_SYSCALL_NR(regs->gp) = si->si_syscall;
+			if (sig == SIGSEGV) {
+				mcontext_t *mcontext = (void *)&proc_data->sigstack[proc_data->mctx_offset];
 
-		regs->is_user = 1;
-		if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
-			printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
-			       __func__, errno);
-			fatal_sigsegv();
-		}
+				GET_FAULTINFO_FROM_MC(regs->faultinfo, mcontext);
+			}
+		} else {
+			/* Flush out any pending syscalls */
+			err = syscall_stub_flush(current_mm_id());
+			if (err) {
+				if (err == -ENOMEM)
+					report_enomem();
+
+				printk(UM_KERN_ERR "%s - Error flushing stub syscalls: %d",
+					__func__, -err);
+				fatal_sigsegv();
+			}
 
-		if (get_fp_registers(pid, regs->fp)) {
-			printk(UM_KERN_ERR "%s -  get_fp_registers failed, errno = %d\n",
-			       __func__, errno);
-			fatal_sigsegv();
-		}
+			/*
+			 * This can legitimately fail if the process loads a
+			 * bogus value into a segment register.  It will
+			 * segfault and PTRACE_GETREGS will read that value
+			 * out of the process.  However, PTRACE_SETREGS will
+			 * fail.  In this case, there is nothing to do but
+			 * just kill the process.
+			 */
+			if (ptrace(PTRACE_SETREGS, pid, 0, regs->gp)) {
+				printk(UM_KERN_ERR "%s - ptrace set regs failed, errno = %d\n",
+				       __func__, errno);
+				fatal_sigsegv();
+			}
 
-		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+			if (put_fp_registers(pid, regs->fp)) {
+				printk(UM_KERN_ERR "%s - ptrace set fp regs failed, errno = %d\n",
+				       __func__, errno);
+				fatal_sigsegv();
+			}
 
-		if (WIFSTOPPED(status)) {
-			int sig = WSTOPSIG(status);
+			if (singlestepping())
+				op = PTRACE_SYSEMU_SINGLESTEP;
+			else
+				op = PTRACE_SYSEMU;
 
-			/* These signal handlers need the si argument.
-			 * The SIGIO and SIGALARM handlers which constitute the
-			 * majority of invocations, do not use it.
-			 */
-			switch (sig) {
-			case SIGSEGV:
-			case SIGTRAP:
-			case SIGILL:
-			case SIGBUS:
-			case SIGFPE:
-			case SIGWINCH:
-				ptrace(PTRACE_GETSIGINFO, pid, 0, (struct siginfo *)&si);
-				break;
+			if (ptrace(op, pid, 0, 0)) {
+				printk(UM_KERN_ERR "%s - ptrace continue failed, op = %d, errno = %d\n",
+				       __func__, op, errno);
+				fatal_sigsegv();
+			}
+
+			CATCH_EINTR(err = waitpid(pid, &status, WUNTRACED | __WALL));
+			if (err < 0) {
+				printk(UM_KERN_ERR "%s - wait failed, errno = %d\n",
+				       __func__, errno);
+				fatal_sigsegv();
 			}
 
+			regs->is_user = 1;
+			if (ptrace(PTRACE_GETREGS, pid, 0, regs->gp)) {
+				printk(UM_KERN_ERR "%s - PTRACE_GETREGS failed, errno = %d\n",
+				       __func__, errno);
+				fatal_sigsegv();
+			}
+
+			if (get_fp_registers(pid, regs->fp)) {
+				printk(UM_KERN_ERR "%s -  get_fp_registers failed, errno = %d\n",
+				       __func__, errno);
+				fatal_sigsegv();
+			}
+
+			if (WIFSTOPPED(status)) {
+				sig = WSTOPSIG(status);
+
+				/*
+				 * These signal handlers need the si argument
+				 * and SIGSEGV needs the faultinfo.
+				 * The SIGIO and SIGALARM handlers which constitute
+				 * the majority of invocations, do not use it.
+				 */
+				switch (sig) {
+				case SIGSEGV:
+					get_skas_faultinfo(pid,
+							   &regs->faultinfo);
+					fallthrough;
+				case SIGTRAP:
+				case SIGILL:
+				case SIGBUS:
+				case SIGFPE:
+				case SIGWINCH:
+					ptrace(PTRACE_GETSIGINFO, pid, 0,
+					       (struct siginfo *)&si_ptrace);
+					si = &si_ptrace;
+					break;
+				default:
+					si = NULL;
+					break;
+				}
+			} else {
+				sig = 0;
+			}
+		}
+
+		UPT_SYSCALL_NR(regs) = -1; /* Assume: It's not a syscall */
+
+		if (sig) {
 			switch (sig) {
 			case SIGSEGV:
-				get_skas_faultinfo(pid, &regs->faultinfo);
-
-				if (PTRACE_FULL_FAULTINFO)
-					(*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si,
+				if (using_seccomp || PTRACE_FULL_FAULTINFO)
+					(*sig_info[SIGSEGV])(SIGSEGV,
+							     (struct siginfo *)si,
 							     regs, NULL);
 				else
 					segv(regs->faultinfo, 0, 1, NULL, NULL);
 
+				break;
+			case SIGSYS:
+				handle_syscall(regs);
 				break;
 			case SIGTRAP + 0x80:
 				handle_trap(pid, regs);
 				break;
 			case SIGTRAP:
-				relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL);
+				relay_signal(SIGTRAP, (struct siginfo *)si, regs, NULL);
 				break;
 			case SIGALRM:
 				break;
@@ -539,7 +713,7 @@ void userspace(struct uml_pt_regs *regs)
 			case SIGFPE:
 			case SIGWINCH:
 				block_signals_trace();
-				(*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL);
+				(*sig_info[sig])(sig, (struct siginfo *)si, regs, NULL);
 				unblock_signals_trace();
 				break;
 			default:
diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h
index 48de3a71f845..6fd1ed400399 100644
--- a/arch/x86/um/shared/sysdep/kernel-offsets.h
+++ b/arch/x86/um/shared/sysdep/kernel-offsets.h
@@ -4,7 +4,9 @@
 #include <linux/elf.h>
 #include <linux/crypto.h>
 #include <linux/kbuild.h>
+#include <linux/audit.h>
 #include <asm/mman.h>
+#include <asm/seccomp.h>
 
 /* workaround for a warning with -Wmissing-prototypes */
 void foo(void);
diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index fbb129023080..21cbb70cf771 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -12,6 +12,7 @@
 #include <skas.h>
 #include <sysdep/tls.h>
 #include <asm/desc.h>
+#include <stub-data.h>
 
 /*
  * If needed we can detect when it's uninitialized.
@@ -21,13 +22,27 @@
 static int host_supports_tls = -1;
 int host_gdt_entry_tls_min;
 
-static int do_set_thread_area(struct user_desc *info)
+static int do_set_thread_area(struct task_struct* task, struct user_desc *info)
 {
 	int ret;
 	u32 cpu;
 
+	if (info->entry_number < host_gdt_entry_tls_min ||
+	    info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES)
+		return -EINVAL;
+
+	if (using_seccomp) {
+		int idx = info->entry_number - host_gdt_entry_tls_min;
+		struct stub_data *data = (void *)task->mm->context.id.stack;
+
+		data->arch_data.tls[idx] = *info;
+		data->arch_data.sync |= BIT(idx);
+
+		return 0;
+	}
+
 	cpu = get_cpu();
-	ret = os_set_thread_area(info, userspace_pid[cpu]);
+	ret = os_set_thread_area(info, task->mm->context.id.pid);
 	put_cpu();
 
 	if (ret)
@@ -97,7 +112,7 @@ static int load_TLS(int flags, struct task_struct *to)
 		if (!(flags & O_FORCE) && curr->flushed)
 			continue;
 
-		ret = do_set_thread_area(&curr->tls);
+		ret = do_set_thread_area(current, &curr->tls);
 		if (ret)
 			goto out;
 
@@ -275,7 +290,7 @@ SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc)
 			return -EFAULT;
 	}
 
-	ret = do_set_thread_area(&info);
+	ret = do_set_thread_area(current, &info);
 	if (ret)
 		return ret;
 	return set_tls_entry(current, &info, idx, 1);
-- 
cgit v1.2.3


From beddc9fb1cb161e1bf779b180750b648ff9690c7 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin@sipsolutions.net>
Date: Mon, 2 Jun 2025 15:00:51 +0200
Subject: um: Add SECCOMP support detection and initialization

This detects seccomp support, sets the global using_seccomp variable and
initilizes the exec registers. The support is only enabled if the
seccomp= kernel parameter is set to either "on" or "auto". With "auto" a
fallback to ptrace mode will happen if initialization failed.

Signed-off-by: Benjamin Berg <benjamin@sipsolutions.net>
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-7-benjamin@sipsolutions.net
[extend help with Kconfig text from v2, use exit syscall instead of libc,
 remove unneeded mctx_offset assignment, disable on 32-bit for now]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/os-Linux/registers.c |   4 +-
 arch/um/os-Linux/start_up.c  | 195 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 195 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/um/os-Linux/registers.c b/arch/um/os-Linux/registers.c
index d7ca148807b2..bfba2cbc9478 100644
--- a/arch/um/os-Linux/registers.c
+++ b/arch/um/os-Linux/registers.c
@@ -14,8 +14,8 @@
 
 /* This is set once at boot time and not changed thereafter */
 
-static unsigned long exec_regs[MAX_REG_NR];
-static unsigned long *exec_fp_regs;
+unsigned long exec_regs[MAX_REG_NR];
+unsigned long *exec_fp_regs;
 
 int init_pid_registers(int pid)
 {
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 93fc82c01aba..d95dc9034b26 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
+ * Copyright (C) 2021 Benjamin Berg <benjamin@sipsolutions.net>
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  */
 
@@ -24,6 +25,13 @@
 #include <kern_util.h>
 #include <mem_user.h>
 #include <ptrace_user.h>
+#include <stdbool.h>
+#include <stub-data.h>
+#include <sys/prctl.h>
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+#include <sysdep/mcontext.h>
+#include <sysdep/stub.h>
 #include <registers.h>
 #include <skas.h>
 #include "internal.h"
@@ -224,6 +232,140 @@ static void __init check_ptrace(void)
 	check_sysemu();
 }
 
+extern unsigned long host_fp_size;
+extern unsigned long exec_regs[MAX_REG_NR];
+extern unsigned long *exec_fp_regs;
+
+__initdata static struct stub_data *seccomp_test_stub_data;
+
+static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
+{
+	ucontext_t *uc = p;
+
+	/* Stow away the location of the mcontext in the stack */
+	seccomp_test_stub_data->mctx_offset = (unsigned long)&uc->uc_mcontext -
+					      (unsigned long)&seccomp_test_stub_data->sigstack[0];
+
+	/* Prevent libc from clearing memory (mctx_offset in particular) */
+	syscall(__NR_exit, 0);
+}
+
+static int __init seccomp_helper(void *data)
+{
+	static struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+			 offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
+	};
+	static struct sock_fprog prog = {
+		.len = ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	struct sigaction sa;
+
+	set_sigstack(seccomp_test_stub_data->sigstack,
+			sizeof(seccomp_test_stub_data->sigstack));
+
+	sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
+	sa.sa_sigaction = (void *) sigsys_handler;
+	sa.sa_restorer = NULL;
+	if (sigaction(SIGSYS, &sa, NULL) < 0)
+		exit(1);
+
+	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
+			SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
+		exit(2);
+
+	sleep(0);
+
+	/* Never reached. */
+	_exit(3);
+}
+
+static bool __init init_seccomp(void)
+{
+	int pid;
+	int status;
+	int n;
+	unsigned long sp;
+
+	/* doesn't work on 32-bit right now */
+	if (!IS_ENABLED(CONFIG_64BIT))
+		return false;
+
+	/*
+	 * We check that we can install a seccomp filter and then exit(0)
+	 * from a trapped syscall.
+	 *
+	 * Note that we cannot verify that no seccomp filter already exists
+	 * for a syscall that results in the process/thread to be killed.
+	 */
+
+	os_info("Checking that seccomp filters can be installed...");
+
+	seccomp_test_stub_data = mmap(0, sizeof(*seccomp_test_stub_data),
+				      PROT_READ | PROT_WRITE,
+				      MAP_SHARED | MAP_ANON, 0, 0);
+
+	/* Use the syscall data area as stack, we just need something */
+	sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
+	     sizeof(seccomp_test_stub_data->syscall_data) -
+	     sizeof(void *);
+	pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);
+
+	if (pid < 0)
+		fatal_perror("check_seccomp : clone failed");
+
+	CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
+	if (n < 0)
+		fatal_perror("check_seccomp : waitpid failed");
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+		struct uml_pt_regs *regs;
+		unsigned long fp_size;
+		int r;
+
+		/* Fill in the host_fp_size from the mcontext. */
+		regs = calloc(1, sizeof(struct uml_pt_regs));
+		get_stub_state(regs, seccomp_test_stub_data, &fp_size);
+		host_fp_size = fp_size;
+		free(regs);
+
+		/* Repeat with the correct size */
+		regs = calloc(1, sizeof(struct uml_pt_regs) + host_fp_size);
+		r = get_stub_state(regs, seccomp_test_stub_data, NULL);
+
+		/* Store as the default startup registers */
+		exec_fp_regs = malloc(host_fp_size);
+		memcpy(exec_regs, regs->gp, sizeof(exec_regs));
+		memcpy(exec_fp_regs, regs->fp, host_fp_size);
+
+		munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
+
+		free(regs);
+
+		if (r) {
+			os_info("failed to fetch registers: %d\n", r);
+			return false;
+		}
+
+		os_info("OK\n");
+		return true;
+	}
+
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
+		os_info("missing\n");
+	else
+		os_info("error\n");
+
+	munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
+	return false;
+}
+
+
 static void __init check_coredump_limit(void)
 {
 	struct rlimit lim;
@@ -278,6 +420,44 @@ void  __init get_host_cpu_features(
 	}
 }
 
+static int seccomp_config __initdata;
+
+static int __init uml_seccomp_config(char *line, int *add)
+{
+	*add = 0;
+
+	if (strcmp(line, "off") == 0)
+		seccomp_config = 0;
+	else if (strcmp(line, "auto") == 0)
+		seccomp_config = 1;
+	else if (strcmp(line, "on") == 0)
+		seccomp_config = 2;
+	else
+		fatal("Invalid seccomp option '%s', expected on/auto/off\n",
+		      line);
+
+	return 0;
+}
+
+__uml_setup("seccomp=", uml_seccomp_config,
+"seccomp=<on/auto/off>\n"
+"    Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
+"    processes work collaboratively with the kernel instead of being\n"
+"    traced using ptrace. All syscalls from the application are caught and\n"
+"    redirected using a signal. This signal handler in turn is permitted to\n"
+"    do the selected set of syscalls to communicate with the UML kernel and\n"
+"    do the required memory management.\n"
+"\n"
+"    This method is overall faster than the ptrace based userspace, primarily\n"
+"    because it reduces the number of context switches for (minor) page faults.\n"
+"\n"
+"    However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
+"    userspace from reading and writing all physical memory. Userspace\n"
+"    processes could also trick the stub into disabling SIGALRM which\n"
+"    prevents it from being interrupted for scheduling purposes.\n"
+"\n"
+"    This is insecure and should only be used with a trusted userspace\n\n"
+);
 
 void __init os_early_checks(void)
 {
@@ -286,13 +466,24 @@ void __init os_early_checks(void)
 	/* Print out the core dump limits early */
 	check_coredump_limit();
 
-	check_ptrace();
-
 	/* Need to check this early because mmapping happens before the
 	 * kernel is running.
 	 */
 	check_tmpexec();
 
+	if (seccomp_config) {
+		if (init_seccomp()) {
+			using_seccomp = 1;
+			return;
+		}
+
+		if (seccomp_config == 2)
+			fatal("SECCOMP userspace requested but not functional!\n");
+	}
+
+	using_seccomp = 0;
+	check_ptrace();
+
 	pid = start_ptraced_child();
 	if (init_pid_registers(pid))
 		fatal("Failed to initialize default registers");
-- 
cgit v1.2.3


From e92e2552858142b60238b9828d802f128e4acccd Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Mon, 2 Jun 2025 15:00:52 +0200
Subject: um: pass FD for memory operations when needed

Instead of always sharing the FDs with the userspace process, only hand
over the FDs needed for mmap when required. The idea is that userspace
might be able to force the stub into executing an mmap syscall, however,
it will not be able to manipulate the control flow sufficiently to have
access to an FD that would allow mapping arbitrary memory.

Security wise, we need to be sure that only the expected syscalls are
executed after the kernel sends FDs through the socket. This is
currently not the case, as userspace can trivially jump to the
rt_sigreturn syscall instruction to execute any syscall that the stub is
permitted to do. With this, it can trick the kernel to send the FD,
which in turn allows userspace to freely map any physical memory.

As such, this is currently *not* secure. However, in principle the
approach should be fine with a more strict SECCOMP filter and a careful
review of the stub control flow (as userspace can prepare a stack). With
some care, it is likely possible to extend the security model to SMP if
desired.

Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250602130052.545733-8-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/include/shared/skas/mm_id.h     |   7 ++
 arch/um/include/shared/skas/stub-data.h |   1 +
 arch/um/kernel/skas/mmu.c               |   3 +
 arch/um/kernel/skas/stub.c              |  87 ++++++++++++++++++++--
 arch/um/kernel/skas/stub_exe.c          |  40 +++++++---
 arch/um/os-Linux/skas/mem.c             |  66 ++++++++++++++++-
 arch/um/os-Linux/skas/process.c         | 126 +++++++++++++++++++++++---------
 arch/um/os-Linux/start_up.c             |  10 ++-
 8 files changed, 280 insertions(+), 60 deletions(-)

(limited to 'arch')

diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h
index 0654c57bb28e..89df9a55fbea 100644
--- a/arch/um/include/shared/skas/mm_id.h
+++ b/arch/um/include/shared/skas/mm_id.h
@@ -6,10 +6,17 @@
 #ifndef __MM_ID_H
 #define __MM_ID_H
 
+#define STUB_MAX_FDS 4
+
 struct mm_id {
 	int pid;
 	unsigned long stack;
 	int syscall_data_len;
+
+	/* Only used with SECCOMP mode */
+	int sock;
+	int syscall_fd_num;
+	int syscall_fd_map[STUB_MAX_FDS];
 };
 
 void __switch_mm(struct mm_id *mm_idp);
diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h
index 675f1a0a1390..c261a77a32f6 100644
--- a/arch/um/include/shared/skas/stub-data.h
+++ b/arch/um/include/shared/skas/stub-data.h
@@ -12,6 +12,7 @@
 #include <as-layout.h>
 #include <sysdep/tls.h>
 #include <sysdep/stub-data.h>
+#include <mm_id.h>
 
 #define FUTEX_IN_CHILD 0
 #define FUTEX_IN_KERN 1
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 87a18ae4da19..849fafa4b54f 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -78,6 +78,9 @@ void destroy_context(struct mm_struct *mm)
 		mmu->id.pid = -1;
 	}
 
+	if (using_seccomp && mmu->id.sock)
+		os_close_file(mmu->id.sock);
+
 	free_pages(mmu->id.stack, ilog2(STUB_DATA_PAGES));
 
 	guard(spinlock_irqsave)(&mm_list_lock);
diff --git a/arch/um/kernel/skas/stub.c b/arch/um/kernel/skas/stub.c
index 9041f6b6e28b..67cab46a602c 100644
--- a/arch/um/kernel/skas/stub.c
+++ b/arch/um/kernel/skas/stub.c
@@ -6,23 +6,53 @@
 #include <sysdep/stub.h>
 
 #include <linux/futex.h>
+#include <sys/socket.h>
 #include <errno.h>
 
-static __always_inline int syscall_handler(struct stub_data *d)
+/*
+ * Known security issues
+ *
+ * Userspace can jump to this address to execute *any* syscall that is
+ * permitted by the stub. As we will return afterwards, it can do
+ * whatever it likes, including:
+ * - Tricking the kernel into handing out the memory FD
+ * - Using this memory FD to read/write all physical memory
+ * - Running in parallel to the kernel processing a syscall
+ *   (possibly creating data races?)
+ * - Blocking e.g. SIGALRM to avoid time based scheduling
+ *
+ * To avoid this, the permitted location for each syscall needs to be
+ * checked for in the SECCOMP filter (which is reasonably simple). Also,
+ * more care will need to go into considerations how the code might be
+ * tricked by using a prepared stack (or even modifying the stack from
+ * another thread in case SMP support is added).
+ *
+ * As for the SIGALRM, the best counter measure will be to check in the
+ * kernel that the process is reporting back the SIGALRM in a timely
+ * fashion.
+ */
+static __always_inline int syscall_handler(int fd_map[STUB_MAX_FDS])
 {
+	struct stub_data *d = get_stub_data();
 	int i;
 	unsigned long res;
+	int fd;
 
 	for (i = 0; i < d->syscall_data_len; i++) {
 		struct stub_syscall *sc = &d->syscall_data[i];
 
 		switch (sc->syscall) {
 		case STUB_SYSCALL_MMAP:
+			if (fd_map)
+				fd = fd_map[sc->mem.fd];
+			else
+				fd = sc->mem.fd;
+
 			res = stub_syscall6(STUB_MMAP_NR,
 					    sc->mem.addr, sc->mem.length,
 					    sc->mem.prot,
 					    MAP_SHARED | MAP_FIXED,
-					    sc->mem.fd, sc->mem.offset);
+					    fd, sc->mem.offset);
 			if (res != sc->mem.addr) {
 				d->err = res;
 				d->syscall_data_len = i;
@@ -54,9 +84,7 @@ static __always_inline int syscall_handler(struct stub_data *d)
 void __section(".__syscall_stub")
 stub_syscall_handler(void)
 {
-	struct stub_data *d = get_stub_data();
-
-	syscall_handler(d);
+	syscall_handler(NULL);
 
 	trap_myself();
 }
@@ -65,7 +93,25 @@ void __section(".__syscall_stub")
 stub_signal_interrupt(int sig, siginfo_t *info, void *p)
 {
 	struct stub_data *d = get_stub_data();
+	char rcv_data;
+	union {
+		char data[CMSG_SPACE(sizeof(int) * STUB_MAX_FDS)];
+		struct cmsghdr align;
+	} ctrl = {};
+	struct iovec iov = {
+		.iov_base = &rcv_data,
+		.iov_len = 1,
+	};
+	struct msghdr msghdr = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+		.msg_control = &ctrl,
+		.msg_controllen = sizeof(ctrl),
+	};
 	ucontext_t *uc = p;
+	struct cmsghdr *fd_msg;
+	int *fd_map;
+	int num_fds;
 	long res;
 
 	d->signal = sig;
@@ -78,6 +124,7 @@ restart_wait:
 		res = stub_syscall3(__NR_futex, (unsigned long)&d->futex,
 				    FUTEX_WAKE, 1);
 	} while (res == -EINTR);
+
 	do {
 		res = stub_syscall4(__NR_futex, (unsigned long)&d->futex,
 				    FUTEX_WAIT, FUTEX_IN_KERN, 0);
@@ -86,11 +133,37 @@ restart_wait:
 	if (res < 0 && res != -EAGAIN)
 		stub_syscall1(__NR_exit_group, 1);
 
-	/* Try running queued syscalls. */
-	if (syscall_handler(d) < 0 || d->restart_wait) {
+	if (d->syscall_data_len) {
+		/* Read passed FDs (if any) */
+		do {
+			res = stub_syscall3(__NR_recvmsg, 0, (unsigned long)&msghdr, 0);
+		} while (res == -EINTR);
+
+		/* We should never have a receive error (other than -EAGAIN) */
+		if (res < 0 && res != -EAGAIN)
+			stub_syscall1(__NR_exit_group, 1);
+
+		/* Receive the FDs */
+		num_fds = 0;
+		fd_msg = msghdr.msg_control;
+		fd_map = (void *)&CMSG_DATA(fd_msg);
+		if (res == iov.iov_len && msghdr.msg_controllen > sizeof(struct cmsghdr))
+			num_fds = (fd_msg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
+
+		/* Try running queued syscalls. */
+		res = syscall_handler(fd_map);
+
+		while (num_fds)
+			stub_syscall2(__NR_close, fd_map[--num_fds], 0);
+	} else {
+		res = 0;
+	}
+
+	if (res < 0 || d->restart_wait) {
 		/* Report SIGSYS if we restart. */
 		d->signal = SIGSYS;
 		d->restart_wait = 0;
+
 		goto restart_wait;
 	}
 
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index f40f2332b676..cbafaa684e66 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -1,5 +1,6 @@
 #include <sys/ptrace.h>
 #include <sys/prctl.h>
+#include <sys/fcntl.h>
 #include <asm/unistd.h>
 #include <sysdep/stub.h>
 #include <stub-data.h>
@@ -45,7 +46,11 @@ noinline static void real_init(void)
 	if (res != sizeof(init_data))
 		stub_syscall1(__NR_exit, 10);
 
-	stub_syscall1(__NR_close, 0);
+	/* In SECCOMP mode, FD 0 is a socket and is later used for FD passing */
+	if (!init_data.seccomp)
+		stub_syscall1(__NR_close, 0);
+	else
+		stub_syscall3(__NR_fcntl, 0, F_SETFL, O_NONBLOCK);
 
 	/* map stub code + data */
 	res = stub_syscall6(STUB_MMAP_NR,
@@ -63,6 +68,13 @@ noinline static void real_init(void)
 	if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
 		stub_syscall1(__NR_exit, 12);
 
+	/* In SECCOMP mode, we only need the signalling FD from now on */
+	if (init_data.seccomp) {
+		res = stub_syscall3(__NR_close_range, 1, ~0U, 0);
+		if (res != 0)
+			stub_syscall1(__NR_exit, 13);
+	}
+
 	/* setup signal stack inside stub data */
 	stack.ss_sp = (void *)init_data.stub_start + UM_KERN_PAGE_SIZE;
 	stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
@@ -77,7 +89,7 @@ noinline static void real_init(void)
 		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 13);
+			stub_syscall1(__NR_exit, 14);
 	} else {
 		/* SECCOMP mode uses rt_sigreturn, need to mask all signals */
 		sa.sa_mask = ~0ULL;
@@ -85,32 +97,32 @@ noinline static void real_init(void)
 		res = stub_syscall4(__NR_rt_sigaction, SIGSEGV,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 14);
+			stub_syscall1(__NR_exit, 15);
 
 		res = stub_syscall4(__NR_rt_sigaction, SIGSYS,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 15);
+			stub_syscall1(__NR_exit, 16);
 
 		res = stub_syscall4(__NR_rt_sigaction, SIGALRM,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 16);
+			stub_syscall1(__NR_exit, 17);
 
 		res = stub_syscall4(__NR_rt_sigaction, SIGTRAP,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 17);
+			stub_syscall1(__NR_exit, 18);
 
 		res = stub_syscall4(__NR_rt_sigaction, SIGILL,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 18);
+			stub_syscall1(__NR_exit, 19);
 
 		res = stub_syscall4(__NR_rt_sigaction, SIGFPE,
 				    (unsigned long)&sa, 0, sizeof(sa.sa_mask));
 		if (res != 0)
-			stub_syscall1(__NR_exit, 19);
+			stub_syscall1(__NR_exit, 20);
 	}
 
 	/*
@@ -153,8 +165,12 @@ noinline static void real_init(void)
 			BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 				 offsetof(struct seccomp_data, nr)),
 
-			/* [10-14] Check against permitted syscalls */
+			/* [10-16] Check against permitted syscalls */
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_futex,
+				 7, 0),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_recvmsg,
+				 6, 0),
+			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close,
 				 5, 0),
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
 				 4, 0),
@@ -170,10 +186,10 @@ noinline static void real_init(void)
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_rt_sigreturn,
 				 1, 0),
 
-			/* [15] Not one of the permitted syscalls */
+			/* [17] Not one of the permitted syscalls */
 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
 
-			/* [16] Permitted call for the stub */
+			/* [18] Permitted call for the stub */
 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 		};
 		struct sock_fprog prog = {
@@ -184,7 +200,7 @@ noinline static void real_init(void)
 		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 				  SECCOMP_FILTER_FLAG_TSYNC,
 				  (unsigned long)&prog) != 0)
-			stub_syscall1(__NR_exit, 20);
+			stub_syscall1(__NR_exit, 21);
 
 		/* Fall through, the exit syscall will cause SIGSYS */
 	} else {
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index 31bf3a52047a..8b9921ac3ef8 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -43,6 +43,16 @@ void syscall_stub_dump_error(struct mm_id *mm_idp)
 
 	print_hex_dump(UM_KERN_ERR, "    syscall data: ", 0,
 		       16, 4, sc, sizeof(*sc), 0);
+
+	if (using_seccomp) {
+		printk(UM_KERN_ERR "%s: FD map num: %d", __func__,
+		       mm_idp->syscall_fd_num);
+		print_hex_dump(UM_KERN_ERR,
+				"    FD map: ", 0, 16,
+				sizeof(mm_idp->syscall_fd_map[0]),
+				mm_idp->syscall_fd_map,
+				sizeof(mm_idp->syscall_fd_map), 0);
+	}
 }
 
 static inline unsigned long *check_init_stack(struct mm_id * mm_idp,
@@ -118,6 +128,9 @@ static inline long do_syscall_stub(struct mm_id *mm_idp)
 		mm_idp->syscall_data_len = 0;
 	}
 
+	if (using_seccomp)
+		mm_idp->syscall_fd_num = 0;
+
 	return mm_idp->syscall_data_len;
 }
 
@@ -180,6 +193,44 @@ static struct stub_syscall *syscall_stub_get_previous(struct mm_id *mm_idp,
 	return NULL;
 }
 
+static int get_stub_fd(struct mm_id *mm_idp, int fd)
+{
+	int i;
+
+	/* Find an FD slot (or flush and use first) */
+	if (!using_seccomp)
+		return fd;
+
+	/* Already crashed, value does not matter */
+	if (mm_idp->syscall_data_len < 0)
+		return 0;
+
+	/* Find existing FD in map if we can allocate another syscall */
+	if (mm_idp->syscall_data_len <
+	    ARRAY_SIZE(((struct stub_data *)NULL)->syscall_data)) {
+		for (i = 0; i < mm_idp->syscall_fd_num; i++) {
+			if (mm_idp->syscall_fd_map[i] == fd)
+				return i;
+		}
+
+		if (mm_idp->syscall_fd_num < STUB_MAX_FDS) {
+			i = mm_idp->syscall_fd_num;
+			mm_idp->syscall_fd_map[i] = fd;
+
+			mm_idp->syscall_fd_num++;
+
+			return i;
+		}
+	}
+
+	/* FD map full or no syscall space available, continue after flush */
+	do_syscall_stub(mm_idp);
+	mm_idp->syscall_fd_map[0] = fd;
+	mm_idp->syscall_fd_num = 1;
+
+	return 0;
+}
+
 int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot,
 	int phys_fd, unsigned long long offset)
 {
@@ -187,12 +238,21 @@ int map(struct mm_id *mm_idp, unsigned long virt, unsigned long len, int prot,
 
 	/* Compress with previous syscall if that is possible */
 	sc = syscall_stub_get_previous(mm_idp, STUB_SYSCALL_MMAP, virt);
-	if (sc && sc->mem.prot == prot && sc->mem.fd == phys_fd &&
+	if (sc && sc->mem.prot == prot &&
 	    sc->mem.offset == MMAP_OFFSET(offset - sc->mem.length)) {
-		sc->mem.length += len;
-		return 0;
+		int prev_fd = sc->mem.fd;
+
+		if (using_seccomp)
+			prev_fd = mm_idp->syscall_fd_map[sc->mem.fd];
+
+		if (phys_fd == prev_fd) {
+			sc->mem.length += len;
+			return 0;
+		}
 	}
 
+	phys_fd = get_stub_fd(mm_idp, phys_fd);
+
 	sc = syscall_stub_alloc(mm_idp);
 	sc->syscall = STUB_SYSCALL_MMAP;
 	sc->mem.addr = virt;
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index e1aaa144e273..e42ffac23e3c 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -16,6 +16,7 @@
 #include <sys/mman.h>
 #include <sys/wait.h>
 #include <sys/stat.h>
+#include <sys/socket.h>
 #include <asm/unistd.h>
 #include <as-layout.h>
 #include <init.h>
@@ -152,7 +153,39 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 	int ret;
 
 	do {
+		const char byte = 0;
+		struct iovec iov = {
+			.iov_base = (void *)&byte,
+			.iov_len = sizeof(byte),
+		};
+		union {
+			char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))];
+			struct cmsghdr align;
+		} ctrl;
+		struct msghdr msgh = {
+			.msg_iov = &iov,
+			.msg_iovlen = 1,
+		};
+
 		if (!running) {
+			if (mm_idp->syscall_fd_num) {
+				unsigned int fds_size =
+					sizeof(int) * mm_idp->syscall_fd_num;
+				struct cmsghdr *cmsg;
+
+				msgh.msg_control = ctrl.data;
+				msgh.msg_controllen = CMSG_SPACE(fds_size);
+				cmsg = CMSG_FIRSTHDR(&msgh);
+				cmsg->cmsg_level = SOL_SOCKET;
+				cmsg->cmsg_type = SCM_RIGHTS;
+				cmsg->cmsg_len = CMSG_LEN(fds_size);
+				memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map,
+				       fds_size);
+
+				CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock,
+						&msgh, 0));
+			}
+
 			data->signal = 0;
 			data->futex = FUTEX_IN_CHILD;
 			CATCH_EINTR(syscall(__NR_futex, &data->futex,
@@ -246,14 +279,20 @@ extern char __syscall_stub_start[];
 
 static int stub_exe_fd;
 
+struct tramp_data {
+	struct stub_data *stub_data;
+	/* 0 is inherited, 1 is the kernel side */
+	int sockpair[2];
+};
+
 #ifndef CLOSE_RANGE_CLOEXEC
 #define CLOSE_RANGE_CLOEXEC	(1U << 2)
 #endif
 
-static int userspace_tramp(void *stack)
+static int userspace_tramp(void *data)
 {
+	struct tramp_data *tramp_data = data;
 	char *const argv[] = { "uml-userspace", NULL };
-	int pipe_fds[2];
 	unsigned long long offset;
 	struct stub_init_data init_data = {
 		.seccomp = using_seccomp,
@@ -280,7 +319,8 @@ static int userspace_tramp(void *stack)
 					      &offset);
 	init_data.stub_code_offset = MMAP_OFFSET(offset);
 
-	init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
+	init_data.stub_data_fd = phys_mapping(uml_to_phys(tramp_data->stub_data),
+					      &offset);
 	init_data.stub_data_offset = MMAP_OFFSET(offset);
 
 	/*
@@ -291,20 +331,21 @@ static int userspace_tramp(void *stack)
 	syscall(__NR_close_range, 0, ~0U, CLOSE_RANGE_CLOEXEC);
 
 	fcntl(init_data.stub_data_fd, F_SETFD, 0);
-	for (iomem = iomem_regions; iomem; iomem = iomem->next)
-		fcntl(iomem->fd, F_SETFD, 0);
 
-	/* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
-	if (pipe(pipe_fds))
-		exit(2);
+	/* In SECCOMP mode, these FDs are passed when needed */
+	if (!using_seccomp) {
+		for (iomem = iomem_regions; iomem; iomem = iomem->next)
+			fcntl(iomem->fd, F_SETFD, 0);
+	}
 
-	if (dup2(pipe_fds[0], 0) < 0)
+	/* dup2 signaling FD/socket to STDIN */
+	if (dup2(tramp_data->sockpair[0], 0) < 0)
 		exit(3);
-	close(pipe_fds[0]);
+	close(tramp_data->sockpair[0]);
 
 	/* Write init_data and close write side */
-	ret = write(pipe_fds[1], &init_data, sizeof(init_data));
-	close(pipe_fds[1]);
+	ret = write(tramp_data->sockpair[1], &init_data, sizeof(init_data));
+	close(tramp_data->sockpair[1]);
 
 	if (ret != sizeof(init_data))
 		exit(4);
@@ -397,7 +438,7 @@ int userspace_pid[NR_CPUS];
 
 /**
  * start_userspace() - prepare a new userspace process
- * @stub_stack:	pointer to the stub stack.
+ * @mm_id: The corresponding struct mm_id
  *
  * Setups a new temporary stack page that is used while userspace_tramp() runs
  * Clones the kernel process into a new userspace process, with FDs only.
@@ -409,9 +450,12 @@ int userspace_pid[NR_CPUS];
 int start_userspace(struct mm_id *mm_id)
 {
 	struct stub_data *proc_data = (void *)mm_id->stack;
+	struct tramp_data tramp_data = {
+		.stub_data = proc_data,
+	};
 	void *stack;
 	unsigned long sp;
-	int pid, status, n, err;
+	int status, n, err;
 
 	/* setup a temporary stack page */
 	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
@@ -427,25 +471,32 @@ int start_userspace(struct mm_id *mm_id)
 	/* set stack pointer to the end of the stack page, so it can grow downwards */
 	sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
 
+	/* socket pair for init data and SECCOMP FD passing (no CLOEXEC here) */
+	if (socketpair(AF_UNIX, SOCK_STREAM, 0, tramp_data.sockpair)) {
+		err = -errno;
+		printk(UM_KERN_ERR "%s : socketpair failed, errno = %d\n",
+		       __func__, errno);
+		return err;
+	}
+
 	if (using_seccomp)
 		proc_data->futex = FUTEX_IN_CHILD;
 
-	/* clone into new userspace process */
-	pid = clone(userspace_tramp, (void *) sp,
+	mm_id->pid = clone(userspace_tramp, (void *) sp,
 		    CLONE_VFORK | CLONE_VM | SIGCHLD,
-		    (void *)mm_id->stack);
-	if (pid < 0) {
+		    (void *)&tramp_data);
+	if (mm_id->pid < 0) {
 		err = -errno;
 		printk(UM_KERN_ERR "%s : clone failed, errno = %d\n",
 		       __func__, errno);
-		return err;
+		goto out_close;
 	}
 
 	if (using_seccomp) {
 		wait_stub_done_seccomp(mm_id, 1, 1);
 	} else {
 		do {
-			CATCH_EINTR(n = waitpid(pid, &status,
+			CATCH_EINTR(n = waitpid(mm_id->pid, &status,
 						WUNTRACED | __WALL));
 			if (n < 0) {
 				err = -errno;
@@ -462,7 +513,7 @@ int start_userspace(struct mm_id *mm_id)
 			goto out_kill;
 		}
 
-		if (ptrace(PTRACE_SETOPTIONS, pid, NULL,
+		if (ptrace(PTRACE_SETOPTIONS, mm_id->pid, NULL,
 			   (void *) PTRACE_O_TRACESYSGOOD) < 0) {
 			err = -errno;
 			printk(UM_KERN_ERR "%s : PTRACE_SETOPTIONS failed, errno = %d\n",
@@ -478,12 +529,22 @@ int start_userspace(struct mm_id *mm_id)
 		goto out_kill;
 	}
 
-	mm_id->pid = pid;
+	close(tramp_data.sockpair[0]);
+	if (using_seccomp)
+		mm_id->sock = tramp_data.sockpair[1];
+	else
+		close(tramp_data.sockpair[1]);
+
+	return 0;
 
-	return pid;
+out_kill:
+	os_kill_ptraced_process(mm_id->pid, 1);
+out_close:
+	close(tramp_data.sockpair[0]);
+	close(tramp_data.sockpair[1]);
+
+	mm_id->pid = -1;
 
- out_kill:
-	os_kill_ptraced_process(pid, 1);
 	return err;
 }
 
@@ -546,17 +607,8 @@ void userspace(struct uml_pt_regs *regs)
 
 			/* Mark pending syscalls for flushing */
 			proc_data->syscall_data_len = mm_id->syscall_data_len;
-			mm_id->syscall_data_len = 0;
 
-			proc_data->signal = 0;
-			proc_data->futex = FUTEX_IN_CHILD;
-			CATCH_EINTR(syscall(__NR_futex, &proc_data->futex,
-					    FUTEX_WAKE, 1, NULL, NULL, 0));
-			do {
-				ret = syscall(__NR_futex, &proc_data->futex,
-					      FUTEX_WAIT, FUTEX_IN_CHILD, NULL, NULL, 0);
-			} while ((ret == -1 && errno == EINTR) ||
-				 proc_data->futex == FUTEX_IN_CHILD);
+			wait_stub_done_seccomp(mm_id, 0, 0);
 
 			sig = proc_data->signal;
 
@@ -564,9 +616,13 @@ void userspace(struct uml_pt_regs *regs)
 				printk(UM_KERN_ERR "%s - Error flushing stub syscalls",
 				       __func__);
 				syscall_stub_dump_error(mm_id);
+				mm_id->syscall_data_len = proc_data->err;
 				fatal_sigsegv();
 			}
 
+			mm_id->syscall_data_len = 0;
+			mm_id->syscall_fd_num = 0;
+
 			ret = get_stub_state(regs, proc_data, NULL);
 			if (ret) {
 				printk(UM_KERN_ERR "%s - failed to get regs: %d",
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index d95dc9034b26..49015be1aaaf 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -265,6 +265,10 @@ static int __init seccomp_helper(void *data)
 	};
 	struct sigaction sa;
 
+	/* close_range is needed for the stub */
+	if (stub_syscall3(__NR_close_range, 1, ~0U, 0))
+		exit(1);
+
 	set_sigstack(seccomp_test_stub_data->sigstack,
 			sizeof(seccomp_test_stub_data->sigstack));
 
@@ -272,17 +276,17 @@ static int __init seccomp_helper(void *data)
 	sa.sa_sigaction = (void *) sigsys_handler;
 	sa.sa_restorer = NULL;
 	if (sigaction(SIGSYS, &sa, NULL) < 0)
-		exit(1);
+		exit(2);
 
 	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 			SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
-		exit(2);
+		exit(3);
 
 	sleep(0);
 
 	/* Never reached. */
-	_exit(3);
+	_exit(4);
 }
 
 static bool __init init_seccomp(void)
-- 
cgit v1.2.3


From 2f956db8b3b02256b21da4d1f26fedc63782adff Mon Sep 17 00:00:00 2001
From: Palmer Dabbelt <palmer@dabbelt.com>
Date: Mon, 2 Jun 2025 10:33:15 -0700
Subject: Revert "RISC-V: vDSO: Wire up getrandom() vDSO implementation"

This has been on -next for a bit, but it's broken and there's already a
v2.  So I'm reverting it to avoid more rebasing.

This reverts commit 89079520cef65d6da1e864eab4464effe5396e23.

Link: https://lore.kernel.org/r/20250602173315.20228-1-palmer@dabbelt.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig                         |   1 -
 arch/riscv/include/asm/vdso/getrandom.h    |  30 ----
 arch/riscv/kernel/vdso/Makefile            |  12 --
 arch/riscv/kernel/vdso/getrandom.c         |  10 --
 arch/riscv/kernel/vdso/vdso.lds.S          |   1 -
 arch/riscv/kernel/vdso/vgetrandom-chacha.S | 244 -----------------------------
 6 files changed, 298 deletions(-)
 delete mode 100644 arch/riscv/include/asm/vdso/getrandom.h
 delete mode 100644 arch/riscv/kernel/vdso/getrandom.c
 delete mode 100644 arch/riscv/kernel/vdso/vgetrandom-chacha.S

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index fbca724302ab..bbec87b79309 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -218,7 +218,6 @@ config RISCV
 	select THREAD_INFO_IN_TASK
 	select TRACE_IRQFLAGS_SUPPORT
 	select UACCESS_MEMCPY if !MMU
-	select VDSO_GETRANDOM if HAVE_GENERIC_VDSO
 	select USER_STACKTRACE_SUPPORT
 	select ZONE_DMA32 if 64BIT
 
diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h
deleted file mode 100644
index 8dc92441702a..000000000000
--- a/arch/riscv/include/asm/vdso/getrandom.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
- */
-#ifndef __ASM_VDSO_GETRANDOM_H
-#define __ASM_VDSO_GETRANDOM_H
-
-#ifndef __ASSEMBLY__
-
-#include <asm/unistd.h>
-
-static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
-{
-	register long ret asm("a0");
-	register long nr asm("a7") = __NR_getrandom;
-	register void *buffer asm("a0") = _buffer;
-	register size_t len asm("a1") = _len;
-	register unsigned int flags asm("a2") = _flags;
-
-	asm volatile ("ecall\n"
-		      : "+r" (ret)
-		      : "r" (nr), "r" (buffer), "r" (len), "r" (flags)
-		      : "memory");
-
-	return ret;
-}
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index 8d12f5646eb5..4a5d131506fc 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -13,17 +13,9 @@ vdso-syms += flush_icache
 vdso-syms += hwprobe
 vdso-syms += sys_hwprobe
 
-ifdef CONFIG_VDSO_GETRANDOM
-vdso-syms += getrandom
-endif
-
 # Files to link into the vdso
 obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o
 
-ifdef CONFIG_VDSO_GETRANDOM
-obj-vdso += vgetrandom-chacha.o
-endif
-
 ccflags-y := -fno-stack-protector
 ccflags-y += -DDISABLE_BRANCH_PROFILING
 ccflags-y += -fno-builtin
@@ -32,10 +24,6 @@ ifneq ($(c-gettimeofday-y),)
   CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y)
 endif
 
-ifneq ($(c-getrandom-y),)
-  CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y)
-endif
-
 CFLAGS_hwprobe.o += -fPIC
 
 # Build rules
diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c
deleted file mode 100644
index f21922e8cebd..000000000000
--- a/arch/riscv/kernel/vdso/getrandom.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
- */
-#include <linux/types.h>
-
-ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
-{
-	return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
-}
diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S
index abc69cda0445..8e86965a8aae 100644
--- a/arch/riscv/kernel/vdso/vdso.lds.S
+++ b/arch/riscv/kernel/vdso/vdso.lds.S
@@ -80,7 +80,6 @@ VERSION
 #ifndef COMPAT_VDSO
 		__vdso_riscv_hwprobe;
 #endif
-		__vdso_getrandom;
 	local: *;
 	};
 }
diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S
deleted file mode 100644
index d793cadc78a6..000000000000
--- a/arch/riscv/kernel/vdso/vgetrandom-chacha.S
+++ /dev/null
@@ -1,244 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
- *
- * Based on arch/loongarch/vdso/vgetrandom-chacha.S.
- */
-
-#include <asm/asm.h>
-#include <linux/linkage.h>
-
-.text
-
-.macro	ROTRI	rd rs imm
-	slliw	t0, \rs, 32 - \imm
-	srliw	\rd, \rs, \imm
-	or	\rd, \rd, t0
-.endm
-
-.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
-	\op	\d0, \d0, \s0
-	\op	\d1, \d1, \s1
-	\op	\d2, \d2, \s2
-	\op	\d3, \d3, \s3
-.endm
-
-/*
- *	a0: output bytes
- * 	a1: 32-byte key input
- *	a2: 8-byte counter input/output
- *	a3: number of 64-byte blocks to write to output
- */
-SYM_FUNC_START(__arch_chacha20_blocks_nostack)
-
-#define output		a0
-#define key		a1
-#define counter		a2
-#define nblocks		a3
-#define i		a4
-#define state0		s0
-#define state1		s1
-#define state2		s2
-#define state3		s3
-#define state4		s4
-#define state5		s5
-#define state6		s6
-#define state7		s7
-#define state8		s8
-#define state9		s9
-#define state10		s10
-#define state11		s11
-#define state12		a5
-#define state13		a6
-#define state14		a7
-#define state15		t1
-#define cnt		t2
-#define copy0		t3
-#define copy1		t4
-#define copy2		t5
-#define copy3		t6
-
-/* Packs to be used with OP_4REG */
-#define line0		state0, state1, state2, state3
-#define line1		state4, state5, state6, state7
-#define line2		state8, state9, state10, state11
-#define line3		state12, state13, state14, state15
-
-#define line1_perm	state5, state6, state7, state4
-#define line2_perm	state10, state11, state8, state9
-#define line3_perm	state15, state12, state13, state14
-
-#define copy		copy0, copy1, copy2, copy3
-
-#define _16		16, 16, 16, 16
-#define _20		20, 20, 20, 20
-#define _24		24, 24, 24, 24
-#define _25		25, 25, 25, 25
-
-	addi		sp, sp, -12*SZREG
-	REG_S		s0,         (sp)
-	REG_S		s1,    SZREG(sp)
-	REG_S		s2,  2*SZREG(sp)
-	REG_S		s3,  3*SZREG(sp)
-	REG_S		s4,  4*SZREG(sp)
-	REG_S		s5,  5*SZREG(sp)
-	REG_S		s6,  6*SZREG(sp)
-	REG_S		s7,  7*SZREG(sp)
-	REG_S		s8,  8*SZREG(sp)
-	REG_S		s9,  9*SZREG(sp)
-	REG_S		s10, 10*SZREG(sp)
-	REG_S		s11, 11*SZREG(sp)
-
-	ld		cnt, (counter)
-
-	li		copy0, 0x61707865
-	li		copy1, 0x3320646e
-	li		copy2, 0x79622d32
-	li		copy3, 0x6b206574
-
-.Lblock:
-	/* state[0,1,2,3] = "expand 32-byte k" */
-	mv		state0, copy0
-	mv		state1, copy1
-	mv		state2, copy2
-	mv		state3, copy3
-
-	/* state[4,5,..,11] = key */
-	lw		state4,   (key)
-	lw		state5,  4(key)
-	lw		state6,  8(key)
-	lw		state7,  12(key)
-	lw		state8,  16(key)
-	lw		state9,  20(key)
-	lw		state10, 24(key)
-	lw		state11, 28(key)
-
-	/* state[12,13] = counter */
-	mv		state12, cnt
-	srli		state13, cnt, 32
-
-	/* state[14,15] = 0 */
-	mv		state14, zero
-	mv		state15, zero
-
-	li		i, 10
-.Lpermute:
-	/* odd round */
-	OP_4REG	addw	line0, line1
-	OP_4REG	xor	line3, line0
-	OP_4REG	ROTRI	line3, _16
-
-	OP_4REG	addw	line2, line3
-	OP_4REG	xor	line1, line2
-	OP_4REG	ROTRI	line1, _20
-
-	OP_4REG	addw	line0, line1
-	OP_4REG	xor	line3, line0
-	OP_4REG	ROTRI	line3, _24
-
-	OP_4REG	addw	line2, line3
-	OP_4REG	xor	line1, line2
-	OP_4REG	ROTRI	line1, _25
-
-	/* even round */
-	OP_4REG	addw	line0, line1_perm
-	OP_4REG	xor	line3_perm, line0
-	OP_4REG	ROTRI	line3_perm, _16
-
-	OP_4REG	addw	line2_perm, line3_perm
-	OP_4REG	xor	line1_perm, line2_perm
-	OP_4REG	ROTRI	line1_perm, _20
-
-	OP_4REG	addw	line0, line1_perm
-	OP_4REG	xor	line3_perm, line0
-	OP_4REG	ROTRI	line3_perm, _24
-
-	OP_4REG	addw	line2_perm, line3_perm
-	OP_4REG	xor	line1_perm, line2_perm
-	OP_4REG	ROTRI	line1_perm, _25
-
-	addi		i, i, -1
-	bnez		i, .Lpermute
-
-	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
-	OP_4REG	addw	line0, copy
-	sw		state0,   (output)
-	sw		state1,  4(output)
-	sw		state2,  8(output)
-	sw		state3, 12(output)
-
-	/* from now on state[0,1,2,3] are scratch registers  */
-
-	/* state[0,1,2,3] = lo(key) */
-	lw		state0,   (key)
-	lw		state1,  4(key)
-	lw		state2,  8(key)
-	lw		state3, 12(key)
-
-	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
-	OP_4REG	addw	line1, line0
-	sw		state4, 16(output)
-	sw		state5, 20(output)
-	sw		state6, 24(output)
-	sw		state7, 28(output)
-
-	/* state[0,1,2,3] = hi(key) */
-	lw		state0, 16(key)
-	lw		state1, 20(key)
-	lw		state2, 24(key)
-	lw		state3, 28(key)
-
-	/* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */
-	OP_4REG	addw	line2, line0
-	sw		state8,  32(output)
-	sw		state9,  36(output)
-	sw		state10, 40(output)
-	sw		state11, 44(output)
-
-	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
-	addw		state12, state12, cnt
-	srli		state0, cnt, 32
-	addw		state13, state13, state0
-	sw		state12, 48(output)
-	sw		state13, 52(output)
-	sw		state14, 56(output)
-	sw		state15, 60(output)
-
-	/* ++counter */
-	addi		cnt, cnt, 1
-
-	/* output += 64 */
-	addi		output, output, 64
-	/* --nblocks */
-	addi		nblocks, nblocks, -1
-	bnez		nblocks, .Lblock
-
-	/* counter = [cnt_lo, cnt_hi] */
-	sd		cnt, (counter)
-
-	/* Zero out the potentially sensitive regs, in case nothing uses these
-	 * again.  As at now copy[0,1,2,3] just contains "expand 32-byte k" and
-	 * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we
-	 * only need to zero state[12,...,15].
-	 */
-	mv		state12, zero
-	mv		state13, zero
-	mv		state14, zero
-	mv		state15, zero
-
-	REG_L		s0,         (sp)
-	REG_L		s1,    SZREG(sp)
-	REG_L		s2,  2*SZREG(sp)
-	REG_L		s3,  3*SZREG(sp)
-	REG_L		s4,  4*SZREG(sp)
-	REG_L		s5,  5*SZREG(sp)
-	REG_L		s6,  6*SZREG(sp)
-	REG_L		s7,  7*SZREG(sp)
-	REG_L		s8,  8*SZREG(sp)
-	REG_L		s9,  9*SZREG(sp)
-	REG_L		s10, 10*SZREG(sp)
-	REG_L		s11, 11*SZREG(sp)
-	addi		sp, sp, 12*SZREG
-
-	ret
-SYM_FUNC_END(__arch_chacha20_blocks_nostack)
-- 
cgit v1.2.3


From 8b68e978718f14fdcb080c2a7791c52a0d09bc6d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 26 Feb 2025 16:01:57 +0100
Subject: x86/iopl: Cure TIF_IO_BITMAP inconsistencies

io_bitmap_exit() is invoked from exit_thread() when a task exists or
when a fork fails. In the latter case the exit_thread() cleans up
resources which were allocated during fork().

io_bitmap_exit() invokes task_update_io_bitmap(), which in turn ends up
in tss_update_io_bitmap(). tss_update_io_bitmap() operates on the
current task. If current has TIF_IO_BITMAP set, but no bitmap installed,
tss_update_io_bitmap() crashes with a NULL pointer dereference.

There are two issues, which lead to that problem:

  1) io_bitmap_exit() should not invoke task_update_io_bitmap() when
     the task, which is cleaned up, is not the current task. That's a
     clear indicator for a cleanup after a failed fork().

  2) A task should not have TIF_IO_BITMAP set and neither a bitmap
     installed nor IOPL emulation level 3 activated.

     This happens when a kernel thread is created in the context of
     a user space thread, which has TIF_IO_BITMAP set as the thread
     flags are copied and the IO bitmap pointer is cleared.

     Other than in the failed fork() case this has no impact because
     kernel threads including IO workers never return to user space and
     therefore never invoke tss_update_io_bitmap().

Cure this by adding the missing cleanups and checks:

  1) Prevent io_bitmap_exit() to invoke task_update_io_bitmap() if
     the to be cleaned up task is not the current task.

  2) Clear TIF_IO_BITMAP in copy_thread() unconditionally. For user
     space forks it is set later, when the IO bitmap is inherited in
     io_bitmap_share().

For paranoia sake, add a warning into tss_update_io_bitmap() to catch
the case, when that code is invoked with inconsistent state.

Fixes: ea5f1cd7ab49 ("x86/ioperm: Remove bitmap if all permissions dropped")
Reported-by: syzbot+e2b1803445d236442e54@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/87wmdceom2.ffs@tglx
---
 arch/x86/kernel/ioport.c  | 13 +++++++++----
 arch/x86/kernel/process.c |  6 ++++++
 2 files changed, 15 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 6290dd120f5e..ff40f09ad911 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -33,8 +33,9 @@ void io_bitmap_share(struct task_struct *tsk)
 	set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
 }
 
-static void task_update_io_bitmap(struct task_struct *tsk)
+static void task_update_io_bitmap(void)
 {
+	struct task_struct *tsk = current;
 	struct thread_struct *t = &tsk->thread;
 
 	if (t->iopl_emul == 3 || t->io_bitmap) {
@@ -54,7 +55,12 @@ void io_bitmap_exit(struct task_struct *tsk)
 	struct io_bitmap *iobm = tsk->thread.io_bitmap;
 
 	tsk->thread.io_bitmap = NULL;
-	task_update_io_bitmap(tsk);
+	/*
+	 * Don't touch the TSS when invoked on a failed fork(). TSS
+	 * reflects the state of @current and not the state of @tsk.
+	 */
+	if (tsk == current)
+		task_update_io_bitmap();
 	if (iobm && refcount_dec_and_test(&iobm->refcnt))
 		kfree(iobm);
 }
@@ -192,8 +198,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
 	}
 
 	t->iopl_emul = level;
-	task_update_io_bitmap(current);
-
+	task_update_io_bitmap();
 	return 0;
 }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c1d2dac72b9c..704883c21f3a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -176,6 +176,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
 	frame->ret_addr = (unsigned long) ret_from_fork_asm;
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap = NULL;
+	clear_tsk_thread_flag(p, TIF_IO_BITMAP);
 	p->thread.iopl_warn = 0;
 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 
@@ -464,6 +465,11 @@ void native_tss_update_io_bitmap(void)
 	} else {
 		struct io_bitmap *iobm = t->io_bitmap;
 
+		if (WARN_ON_ONCE(!iobm)) {
+			clear_thread_flag(TIF_IO_BITMAP);
+			native_tss_invalidate_io_bitmap();
+		}
+
 		/*
 		 * Only copy bitmap data when the sequence number differs. The
 		 * update time is accounted to the incoming task.
-- 
cgit v1.2.3


From 942349413a49670e8bed246e2185fd3a053227be Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Wed, 4 Jun 2025 10:17:05 +0200
Subject: um: fix SECCOMP 32bit xstate register restore

There was a typo that caused the extended FP state to be copied into the
wrong location on 32 bit. On 32 bit we only store the xstate internally
as that already contains everything. However, for compatibility, the
mcontext on 32 bit first contains the legacy FP state and then the
xstate.

The code copied the xstate on top of the legacy FP state instead of
using the correct offset. This offset was already calculated in the
xstate_* variables, so simply switch to those to fix the problem.

With this SECCOMP mode works on 32 bit, so lift the restriction.

Fixes: b1e1bd2e6943 ("um: Add helper functions to get/set state for SECCOMP")
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250604081705.934112-1-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/os-Linux/start_up.c     | 4 ----
 arch/x86/um/os-Linux/mcontext.c | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 49015be1aaaf..a827c2e01aa5 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -296,10 +296,6 @@ static bool __init init_seccomp(void)
 	int n;
 	unsigned long sp;
 
-	/* doesn't work on 32-bit right now */
-	if (!IS_ENABLED(CONFIG_64BIT))
-		return false;
-
 	/*
 	 * We check that we can install a seccomp filter and then exit(0)
 	 * from a trapped syscall.
diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c
index e661fdc44db9..a21403df6663 100644
--- a/arch/x86/um/os-Linux/mcontext.c
+++ b/arch/x86/um/os-Linux/mcontext.c
@@ -231,7 +231,7 @@ int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
 	xstate_size = fp_size;
 #endif
 
-	memcpy(fpstate_stub, &regs->fp, fp_size);
+	memcpy(xstate_stub, &regs->fp, xstate_size);
 
 #ifdef __i386__
 	/*
-- 
cgit v1.2.3


From cf8651f7319d12a6fd81e1d001afb0958ee2bf28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:18 +0200
Subject: riscv: sbi: add Firmware Feature (FWFT) SBI extensions definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Firmware Features extension (FWFT) was added as part of the SBI 3.0
specification. Add SBI definitions to use this extension.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Samuel Holland <samuel.holland@sifive.com>
Tested-by: Samuel Holland <samuel.holland@sifive.com>
Reviewed-by: Deepak Gupta <debug@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-2-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/sbi.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 3d250824178b..bb077d0c912f 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -35,6 +35,7 @@ enum sbi_ext_id {
 	SBI_EXT_DBCN = 0x4442434E,
 	SBI_EXT_STA = 0x535441,
 	SBI_EXT_NACL = 0x4E41434C,
+	SBI_EXT_FWFT = 0x46574654,
 
 	/* Experimentals extensions must lie within this range */
 	SBI_EXT_EXPERIMENTAL_START = 0x08000000,
@@ -402,6 +403,33 @@ enum sbi_ext_nacl_feature {
 #define SBI_NACL_SHMEM_SRET_X(__i)		((__riscv_xlen / 8) * (__i))
 #define SBI_NACL_SHMEM_SRET_X_LAST		31
 
+/* SBI function IDs for FW feature extension */
+#define SBI_EXT_FWFT_SET		0x0
+#define SBI_EXT_FWFT_GET		0x1
+
+enum sbi_fwft_feature_t {
+	SBI_FWFT_MISALIGNED_EXC_DELEG		= 0x0,
+	SBI_FWFT_LANDING_PAD			= 0x1,
+	SBI_FWFT_SHADOW_STACK			= 0x2,
+	SBI_FWFT_DOUBLE_TRAP			= 0x3,
+	SBI_FWFT_PTE_AD_HW_UPDATING		= 0x4,
+	SBI_FWFT_POINTER_MASKING_PMLEN		= 0x5,
+	SBI_FWFT_LOCAL_RESERVED_START		= 0x6,
+	SBI_FWFT_LOCAL_RESERVED_END		= 0x3fffffff,
+	SBI_FWFT_LOCAL_PLATFORM_START		= 0x40000000,
+	SBI_FWFT_LOCAL_PLATFORM_END		= 0x7fffffff,
+
+	SBI_FWFT_GLOBAL_RESERVED_START		= 0x80000000,
+	SBI_FWFT_GLOBAL_RESERVED_END		= 0xbfffffff,
+	SBI_FWFT_GLOBAL_PLATFORM_START		= 0xc0000000,
+	SBI_FWFT_GLOBAL_PLATFORM_END		= 0xffffffff,
+};
+
+#define SBI_FWFT_PLATFORM_FEATURE_BIT		BIT(30)
+#define SBI_FWFT_GLOBAL_FEATURE_BIT		BIT(31)
+
+#define SBI_FWFT_SET_FLAG_LOCK			BIT(0)
+
 /* SBI spec version fields */
 #define SBI_SPEC_VERSION_DEFAULT	0x1
 #define SBI_SPEC_VERSION_MAJOR_SHIFT	24
@@ -419,6 +447,11 @@ enum sbi_ext_nacl_feature {
 #define SBI_ERR_ALREADY_STARTED -7
 #define SBI_ERR_ALREADY_STOPPED -8
 #define SBI_ERR_NO_SHMEM	-9
+#define SBI_ERR_INVALID_STATE	-10
+#define SBI_ERR_BAD_RANGE	-11
+#define SBI_ERR_TIMEOUT		-12
+#define SBI_ERR_IO		-13
+#define SBI_ERR_DENIED_LOCKED	-14
 
 extern unsigned long sbi_spec_version;
 struct sbiret {
-- 
cgit v1.2.3


From a7cd450f0e06b4118b2ca8b4e1ef707d5c2c4506 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:19 +0200
Subject: riscv: sbi: remove useless parenthesis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few parenthesis in check for SBI version/extension were useless,
remove them.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-3-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/sbi.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 1989b8cade1b..1d44c35305a9 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -609,7 +609,7 @@ void __init sbi_init(void)
 		} else {
 			__sbi_rfence	= __sbi_rfence_v01;
 		}
-		if ((sbi_spec_version >= sbi_mk_version(0, 3)) &&
+		if (sbi_spec_version >= sbi_mk_version(0, 3) &&
 		    sbi_probe_extension(SBI_EXT_SRST)) {
 			pr_info("SBI SRST extension detected\n");
 			pm_power_off = sbi_srst_power_off;
@@ -617,8 +617,8 @@ void __init sbi_init(void)
 			sbi_srst_reboot_nb.priority = 192;
 			register_restart_handler(&sbi_srst_reboot_nb);
 		}
-		if ((sbi_spec_version >= sbi_mk_version(2, 0)) &&
-		    (sbi_probe_extension(SBI_EXT_DBCN) > 0)) {
+		if (sbi_spec_version >= sbi_mk_version(2, 0) &&
+		    sbi_probe_extension(SBI_EXT_DBCN) > 0) {
 			pr_info("SBI DBCN extension detected\n");
 			sbi_debug_console_available = true;
 		}
-- 
cgit v1.2.3


From 99cf5b7c738733032af9a265a6a5a6bc34b91900 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:20 +0200
Subject: riscv: sbi: add new SBI error mappings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A few new errors have been added with SBI V3.0, maps them as close as
possible to errno values.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-4-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/sbi.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index bb077d0c912f..0938f2a8d01b 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -536,11 +536,21 @@ static inline int sbi_err_map_linux_errno(int err)
 	case SBI_SUCCESS:
 		return 0;
 	case SBI_ERR_DENIED:
+	case SBI_ERR_DENIED_LOCKED:
 		return -EPERM;
 	case SBI_ERR_INVALID_PARAM:
+	case SBI_ERR_INVALID_STATE:
 		return -EINVAL;
+	case SBI_ERR_BAD_RANGE:
+		return -ERANGE;
 	case SBI_ERR_INVALID_ADDRESS:
 		return -EFAULT;
+	case SBI_ERR_NO_SHMEM:
+		return -ENOMEM;
+	case SBI_ERR_TIMEOUT:
+		return -ETIMEDOUT;
+	case SBI_ERR_IO:
+		return -EIO;
 	case SBI_ERR_NOT_SUPPORTED:
 	case SBI_ERR_FAILURE:
 	default:
-- 
cgit v1.2.3


From 6d6d0641dcfa9d1e398d75791283bf6d129135de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:21 +0200
Subject: riscv: sbi: add FWFT extension interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This SBI extensions enables supervisor mode to control feature that are
under M-mode control (For instance, Svadu menvcfg ADUE bit, Ssdbltrp
DTE, etc). Add an interface to set local features for a specific cpu
mask as well as for the online cpu mask.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-5-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/sbi.h | 17 +++++++++++++
 arch/riscv/kernel/sbi.c      | 57 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 0938f2a8d01b..341e74238aa0 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -503,6 +503,23 @@ int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask,
 				unsigned long asid);
 long sbi_probe_extension(int ext);
 
+int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags);
+int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature,
+			 unsigned long value, unsigned long flags);
+/**
+ * sbi_fwft_set_online_cpus() - Set a feature on all online cpus
+ * @feature: The feature to be set
+ * @value: The feature value to be set
+ * @flags: FWFT feature set flags
+ *
+ * Return: 0 on success, appropriate linux error code otherwise.
+ */
+static inline int sbi_fwft_set_online_cpus(u32 feature, unsigned long value,
+					   unsigned long flags)
+{
+	return sbi_fwft_set_cpumask(cpu_online_mask, feature, value, flags);
+}
+
 /* Check if current SBI specification version is 0.1 or not */
 static inline int sbi_spec_is_0_1(void)
 {
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 1d44c35305a9..818efafdc8e9 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -299,6 +299,63 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask,
 	return 0;
 }
 
+struct fwft_set_req {
+	u32 feature;
+	unsigned long value;
+	unsigned long flags;
+	atomic_t error;
+};
+
+static void cpu_sbi_fwft_set(void *arg)
+{
+	struct fwft_set_req *req = arg;
+	int ret;
+
+	ret = sbi_fwft_set(req->feature, req->value, req->flags);
+	if (ret)
+		atomic_set(&req->error, ret);
+}
+
+/**
+ * sbi_fwft_set() - Set a feature on the local hart
+ * @feature: The feature ID to be set
+ * @value: The feature value to be set
+ * @flags: FWFT feature set flags
+ *
+ * Return: 0 on success, appropriate linux error code otherwise.
+ */
+int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags)
+{
+	return -EOPNOTSUPP;
+}
+
+/**
+ * sbi_fwft_set_cpumask() - Set a feature for the specified cpumask
+ * @mask: CPU mask of cpus that need the feature to be set
+ * @feature: The feature ID to be set
+ * @value: The feature value to be set
+ * @flags: FWFT feature set flags
+ *
+ * Return: 0 on success, appropriate linux error code otherwise.
+ */
+int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature,
+			       unsigned long value, unsigned long flags)
+{
+	struct fwft_set_req req = {
+		.feature = feature,
+		.value = value,
+		.flags = flags,
+		.error = ATOMIC_INIT(0),
+	};
+
+	if (feature & SBI_FWFT_GLOBAL_FEATURE_BIT)
+		return -EINVAL;
+
+	on_each_cpu_mask(mask, cpu_sbi_fwft_set, &req, 1);
+
+	return atomic_read(&req.error);
+}
+
 /**
  * sbi_set_timer() - Program the timer for next timer event.
  * @stime_value: The value after which next timer event should fire.
-- 
cgit v1.2.3


From c4a50db1e1739a5d4698dee7cd4c6f6430bff7b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:22 +0200
Subject: riscv: sbi: add SBI FWFT extension calls
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add FWFT extension calls. This will be ratified in SBI V3.0 hence, it is
provided as a separate commit that can be left out if needed.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-6-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/sbi.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 818efafdc8e9..53836a9235e3 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -299,6 +299,8 @@ static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask,
 	return 0;
 }
 
+static bool sbi_fwft_supported;
+
 struct fwft_set_req {
 	u32 feature;
 	unsigned long value;
@@ -326,7 +328,15 @@ static void cpu_sbi_fwft_set(void *arg)
  */
 int sbi_fwft_set(u32 feature, unsigned long value, unsigned long flags)
 {
-	return -EOPNOTSUPP;
+	struct sbiret ret;
+
+	if (!sbi_fwft_supported)
+		return -EOPNOTSUPP;
+
+	ret = sbi_ecall(SBI_EXT_FWFT, SBI_EXT_FWFT_SET,
+			feature, value, flags, 0, 0, 0);
+
+	return sbi_err_map_linux_errno(ret.error);
 }
 
 /**
@@ -348,6 +358,9 @@ int sbi_fwft_set_cpumask(const cpumask_t *mask, u32 feature,
 		.error = ATOMIC_INIT(0),
 	};
 
+	if (!sbi_fwft_supported)
+		return -EOPNOTSUPP;
+
 	if (feature & SBI_FWFT_GLOBAL_FEATURE_BIT)
 		return -EINVAL;
 
@@ -679,6 +692,11 @@ void __init sbi_init(void)
 			pr_info("SBI DBCN extension detected\n");
 			sbi_debug_console_available = true;
 		}
+		if (sbi_spec_version >= sbi_mk_version(3, 0) &&
+		    sbi_probe_extension(SBI_EXT_FWFT)) {
+			pr_info("SBI FWFT extension detected\n");
+			sbi_fwft_supported = true;
+		}
 	} else {
 		__sbi_set_timer = __sbi_set_timer_v01;
 		__sbi_send_ipi	= __sbi_send_ipi_v01;
-- 
cgit v1.2.3


From cf5a8abc6560f989a880bec3647c614e638a0c9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:23 +0200
Subject: riscv: misaligned: request misaligned exception from SBI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now that the kernel can handle misaligned accesses in S-mode, request
misaligned access exception delegation from SBI. This uses the FWFT SBI
extension defined in SBI version 3.0.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-7-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/cpufeature.h        |  3 +-
 arch/riscv/kernel/traps_misaligned.c       | 71 ++++++++++++++++++++++++++++--
 arch/riscv/kernel/unaligned_access_speed.c |  8 +++-
 3 files changed, 77 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index f56b409361fb..dbe5970d4fe6 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -67,8 +67,9 @@ void __init riscv_user_isa_enable(void);
 	_RISCV_ISA_EXT_DATA(_name, _id, _sub_exts, ARRAY_SIZE(_sub_exts), _validate)
 
 bool __init check_unaligned_access_emulated_all_cpus(void);
+void unaligned_access_init(void);
+int cpu_online_unaligned_access_init(unsigned int cpu);
 #if defined(CONFIG_RISCV_SCALAR_MISALIGNED)
-void check_unaligned_access_emulated(struct work_struct *work __always_unused);
 void unaligned_emulation_finish(void);
 bool unaligned_ctl_available(void);
 DECLARE_PER_CPU(long, misaligned_access_speed);
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 77c788660223..592b1a28e897 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -16,6 +16,7 @@
 #include <asm/entry-common.h>
 #include <asm/hwprobe.h>
 #include <asm/cpufeature.h>
+#include <asm/sbi.h>
 #include <asm/vector.h>
 
 #define INSN_MATCH_LB			0x3
@@ -646,7 +647,7 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void)
 
 static bool unaligned_ctl __read_mostly;
 
-void check_unaligned_access_emulated(struct work_struct *work __always_unused)
+static void check_unaligned_access_emulated(struct work_struct *work __always_unused)
 {
 	int cpu = smp_processor_id();
 	long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
@@ -657,6 +658,13 @@ void check_unaligned_access_emulated(struct work_struct *work __always_unused)
 	__asm__ __volatile__ (
 		"       "REG_L" %[tmp], 1(%[ptr])\n"
 		: [tmp] "=r" (tmp_val) : [ptr] "r" (&tmp_var) : "memory");
+}
+
+static int cpu_online_check_unaligned_access_emulated(unsigned int cpu)
+{
+	long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
+
+	check_unaligned_access_emulated(NULL);
 
 	/*
 	 * If unaligned_ctl is already set, this means that we detected that all
@@ -665,9 +673,10 @@ void check_unaligned_access_emulated(struct work_struct *work __always_unused)
 	 */
 	if (unlikely(unaligned_ctl && (*mas_ptr != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED))) {
 		pr_crit("CPU misaligned accesses non homogeneous (expected all emulated)\n");
-		while (true)
-			cpu_relax();
+		return -EINVAL;
 	}
+
+	return 0;
 }
 
 bool __init check_unaligned_access_emulated_all_cpus(void)
@@ -699,4 +708,60 @@ bool __init check_unaligned_access_emulated_all_cpus(void)
 {
 	return false;
 }
+static int cpu_online_check_unaligned_access_emulated(unsigned int cpu)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_RISCV_SBI
+
+static bool misaligned_traps_delegated;
+
+static int cpu_online_sbi_unaligned_setup(unsigned int cpu)
+{
+	if (sbi_fwft_set(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0) &&
+	    misaligned_traps_delegated) {
+		pr_crit("Misaligned trap delegation non homogeneous (expected delegated)");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+void __init unaligned_access_init(void)
+{
+	int ret;
+
+	ret = sbi_fwft_set_online_cpus(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0);
+	if (ret)
+		return;
+
+	misaligned_traps_delegated = true;
+	pr_info("SBI misaligned access exception delegation ok\n");
+	/*
+	 * Note that we don't have to take any specific action here, if
+	 * the delegation is successful, then
+	 * check_unaligned_access_emulated() will verify that indeed the
+	 * platform traps on misaligned accesses.
+	 */
+}
+#else
+void __init unaligned_access_init(void) {}
+
+static int cpu_online_sbi_unaligned_setup(unsigned int cpu __always_unused)
+{
+	return 0;
+}
 #endif
+
+int cpu_online_unaligned_access_init(unsigned int cpu)
+{
+	int ret;
+
+	ret = cpu_online_sbi_unaligned_setup(cpu);
+	if (ret)
+		return ret;
+
+	return cpu_online_check_unaligned_access_emulated(cpu);
+}
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index b8ba13819d05..ae2068425fbc 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -236,6 +236,11 @@ arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
 
 static int riscv_online_cpu(unsigned int cpu)
 {
+	int ret = cpu_online_unaligned_access_init(cpu);
+
+	if (ret)
+		return ret;
+
 	/* We are already set since the last check */
 	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
 		goto exit;
@@ -248,7 +253,6 @@ static int riscv_online_cpu(unsigned int cpu)
 	{
 		static struct page *buf;
 
-		check_unaligned_access_emulated(NULL);
 		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
 		if (!buf) {
 			pr_warn("Allocation failure, not measuring misaligned performance\n");
@@ -439,6 +443,8 @@ static int __init check_unaligned_access_all_cpus(void)
 {
 	int cpu;
 
+	unaligned_access_init();
+
 	if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
 		pr_info("scalar unaligned access speed set to '%s' (%lu) by command line\n",
 			speed_str[unaligned_scalar_speed_param], unaligned_scalar_speed_param);
-- 
cgit v1.2.3


From 9f9f6fdd1dc6791bcfe251160a96a446199f85ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:24 +0200
Subject: riscv: misaligned: use on_each_cpu() for scalar misaligned access
 probing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

schedule_on_each_cpu() was used without any good reason while documented
as very slow. This call was in the boot path, so better use
on_each_cpu() for scalar misaligned checking. Vector misaligned check
still needs to use schedule_on_each_cpu() since it requires irqs to be
enabled but that's less of a problem since this code is ran in a kthread.
Add a comment to explicit that.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-8-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/traps_misaligned.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 592b1a28e897..34b4a4e9dfca 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -627,6 +627,10 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void)
 {
 	int cpu;
 
+	/*
+	 * While being documented as very slow, schedule_on_each_cpu() is used since
+	 * kernel_vector_begin() expects irqs to be enabled or it will panic()
+	 */
 	schedule_on_each_cpu(check_vector_unaligned_access_emulated);
 
 	for_each_online_cpu(cpu)
@@ -647,7 +651,7 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void)
 
 static bool unaligned_ctl __read_mostly;
 
-static void check_unaligned_access_emulated(struct work_struct *work __always_unused)
+static void check_unaligned_access_emulated(void *arg __always_unused)
 {
 	int cpu = smp_processor_id();
 	long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
@@ -688,7 +692,7 @@ bool __init check_unaligned_access_emulated_all_cpus(void)
 	 * accesses emulated since tasks requesting such control can run on any
 	 * CPU.
 	 */
-	schedule_on_each_cpu(check_unaligned_access_emulated);
+	on_each_cpu(check_unaligned_access_emulated, NULL, 1);
 
 	for_each_online_cpu(cpu)
 		if (per_cpu(misaligned_access_speed, cpu)
-- 
cgit v1.2.3


From 1317045a7d6f397904d105f6d40dc9787876a34b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:25 +0200
Subject: riscv: misaligned: declare misaligned_access_speed under
 CONFIG_RISCV_MISALIGNED
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While misaligned_access_speed was defined in a file compile with
CONFIG_RISCV_MISALIGNED, its definition was under
CONFIG_RISCV_SCALAR_MISALIGNED. This resulted in compilation problems
when using it in a file compiled with CONFIG_RISCV_MISALIGNED.

Move the declaration under CONFIG_RISCV_MISALIGNED so that it can be
used unconditionnally when compiled with that config and remove the check
for that variable in traps_misaligned.c.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-9-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/cpufeature.h  | 5 ++++-
 arch/riscv/kernel/traps_misaligned.c | 2 --
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index dbe5970d4fe6..2bfa4ef383ed 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -72,7 +72,6 @@ int cpu_online_unaligned_access_init(unsigned int cpu);
 #if defined(CONFIG_RISCV_SCALAR_MISALIGNED)
 void unaligned_emulation_finish(void);
 bool unaligned_ctl_available(void);
-DECLARE_PER_CPU(long, misaligned_access_speed);
 #else
 static inline bool unaligned_ctl_available(void)
 {
@@ -80,6 +79,10 @@ static inline bool unaligned_ctl_available(void)
 }
 #endif
 
+#if defined(CONFIG_RISCV_MISALIGNED)
+DECLARE_PER_CPU(long, misaligned_access_speed);
+#endif
+
 bool __init check_vector_unaligned_access_emulated_all_cpus(void);
 #if defined(CONFIG_RISCV_VECTOR_MISALIGNED)
 void check_vector_unaligned_access_emulated(struct work_struct *work __always_unused);
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 34b4a4e9dfca..f1b2af515592 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -369,9 +369,7 @@ static int handle_scalar_misaligned_load(struct pt_regs *regs)
 
 	perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
 
-#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
 	*this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED;
-#endif
 
 	if (!unaligned_enabled)
 		return -1;
-- 
cgit v1.2.3


From 4eaaa65e301208d6ff612ad2244c6174c9d852b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:26 +0200
Subject: riscv: misaligned: move emulated access uniformity check in a
 function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Split the code that check for the uniformity of misaligned accesses
performance on all cpus from check_unaligned_access_emulated_all_cpus()
to its own function which will be used for delegation check. No
functional changes intended.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-10-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/traps_misaligned.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index f1b2af515592..7ecaa8103fe7 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -645,6 +645,18 @@ bool __init check_vector_unaligned_access_emulated_all_cpus(void)
 }
 #endif
 
+static bool all_cpus_unaligned_scalar_access_emulated(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		if (per_cpu(misaligned_access_speed, cpu) !=
+		    RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED)
+			return false;
+
+	return true;
+}
+
 #ifdef CONFIG_RISCV_SCALAR_MISALIGNED
 
 static bool unaligned_ctl __read_mostly;
@@ -683,8 +695,6 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu)
 
 bool __init check_unaligned_access_emulated_all_cpus(void)
 {
-	int cpu;
-
 	/*
 	 * We can only support PR_UNALIGN controls if all CPUs have misaligned
 	 * accesses emulated since tasks requesting such control can run on any
@@ -692,10 +702,8 @@ bool __init check_unaligned_access_emulated_all_cpus(void)
 	 */
 	on_each_cpu(check_unaligned_access_emulated, NULL, 1);
 
-	for_each_online_cpu(cpu)
-		if (per_cpu(misaligned_access_speed, cpu)
-		    != RISCV_HWPROBE_MISALIGNED_SCALAR_EMULATED)
-			return false;
+	if (!all_cpus_unaligned_scalar_access_emulated())
+		return false;
 
 	unaligned_ctl = true;
 	return true;
-- 
cgit v1.2.3


From 7977448bf374f6e9592153838f072a89bd3b5c45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Fri, 23 May 2025 12:19:27 +0200
Subject: riscv: misaligned: add a function to check misalign trap delegability
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Checking for the delegability of the misaligned access trap is needed
for the KVM FWFT extension implementation. Add a function to get the
delegability of the misaligned trap exception.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
Tested-by: Charlie Jenkins <charlie@rivosinc.com>
Link: https://lore.kernel.org/r/20250523101932.1594077-11-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/cpufeature.h  |  6 ++++++
 arch/riscv/kernel/traps_misaligned.c | 17 +++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/cpufeature.h b/arch/riscv/include/asm/cpufeature.h
index 2bfa4ef383ed..fbd0e4306c93 100644
--- a/arch/riscv/include/asm/cpufeature.h
+++ b/arch/riscv/include/asm/cpufeature.h
@@ -81,6 +81,12 @@ static inline bool unaligned_ctl_available(void)
 
 #if defined(CONFIG_RISCV_MISALIGNED)
 DECLARE_PER_CPU(long, misaligned_access_speed);
+bool misaligned_traps_can_delegate(void);
+#else
+static inline bool misaligned_traps_can_delegate(void)
+{
+	return false;
+}
 #endif
 
 bool __init check_vector_unaligned_access_emulated_all_cpus(void);
diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
index 7ecaa8103fe7..93043924fe6c 100644
--- a/arch/riscv/kernel/traps_misaligned.c
+++ b/arch/riscv/kernel/traps_misaligned.c
@@ -724,10 +724,10 @@ static int cpu_online_check_unaligned_access_emulated(unsigned int cpu)
 }
 #endif
 
-#ifdef CONFIG_RISCV_SBI
-
 static bool misaligned_traps_delegated;
 
+#ifdef CONFIG_RISCV_SBI
+
 static int cpu_online_sbi_unaligned_setup(unsigned int cpu)
 {
 	if (sbi_fwft_set(SBI_FWFT_MISALIGNED_EXC_DELEG, 1, 0) &&
@@ -763,6 +763,7 @@ static int cpu_online_sbi_unaligned_setup(unsigned int cpu __always_unused)
 {
 	return 0;
 }
+
 #endif
 
 int cpu_online_unaligned_access_init(unsigned int cpu)
@@ -775,3 +776,15 @@ int cpu_online_unaligned_access_init(unsigned int cpu)
 
 	return cpu_online_check_unaligned_access_emulated(cpu);
 }
+
+bool misaligned_traps_can_delegate(void)
+{
+	/*
+	 * Either we successfully requested misaligned traps delegation for all
+	 * CPUs, or the SBI does not implement the FWFT extension but delegated
+	 * the exception by default.
+	 */
+	return misaligned_traps_delegated ||
+	       all_cpus_unaligned_scalar_access_emulated();
+}
+EXPORT_SYMBOL_GPL(misaligned_traps_can_delegate);
-- 
cgit v1.2.3


From 27a041040f2c6cfbbc24053bce9acb2e801e4e71 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Thu, 5 Jun 2025 07:03:24 +0200
Subject: um: fix unused variable warning

The code was updated to access the PID of the userspace stub process in
a different way, making the local cpu variable obsolete. Remove it.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202506050008.AwXLNxQX-lkp@intel.com/
Fixes: 406d17c6c370 ("um: Implement kernel side of SECCOMP based process handling")
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250605050325.1077208-1-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/x86/um/tls_32.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c
index 21cbb70cf771..cb3f17627d16 100644
--- a/arch/x86/um/tls_32.c
+++ b/arch/x86/um/tls_32.c
@@ -25,7 +25,6 @@ int host_gdt_entry_tls_min;
 static int do_set_thread_area(struct task_struct* task, struct user_desc *info)
 {
 	int ret;
-	u32 cpu;
 
 	if (info->entry_number < host_gdt_entry_tls_min ||
 	    info->entry_number >= host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES)
@@ -41,9 +40,7 @@ static int do_set_thread_area(struct task_struct* task, struct user_desc *info)
 		return 0;
 	}
 
-	cpu = get_cpu();
 	ret = os_set_thread_area(info, task->mm->context.id.pid);
-	put_cpu();
 
 	if (ret)
 		printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, "
-- 
cgit v1.2.3


From e56a50ff7c12983aba710bd02a2c2ad401379e91 Mon Sep 17 00:00:00 2001
From: Benjamin Berg <benjamin.berg@intel.com>
Date: Thu, 5 Jun 2025 07:03:25 +0200
Subject: um: remove "extern" from implementation of sigchld_handler

There is no need to mark the function as extern in the implementation.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202506051226.X8r7X5aa-lkp@intel.com/
Fixes: 8420e08fe3a5 ("um: Track userspace children dying in SECCOMP mode")
Signed-off-by: Benjamin Berg <benjamin.berg@intel.com>
Link: https://patch.msgid.link/20250605050325.1077208-2-benjamin@sipsolutions.net
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/kernel/irq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index f1787be3983c..0dfaf96bb7da 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -691,8 +691,8 @@ void __init init_IRQ(void)
 	os_setup_epoll();
 }
 
-extern void sigchld_handler(int sig, struct siginfo *unused_si,
-			    struct uml_pt_regs *regs, void *mc)
+void sigchld_handler(int sig, struct siginfo *unused_si,
+		     struct uml_pt_regs *regs, void *mc)
 {
 	do_IRQ(SIGCHLD_IRQ, regs);
 }
-- 
cgit v1.2.3


From 11709abccf93b08adde95ef313c300b0d4bc28f1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 3 Jun 2025 15:49:36 +0200
Subject: s390/mm: Fix in_atomic() handling in do_secure_storage_access()

Kernel user spaces accesses to not exported pages in atomic context
incorrectly try to resolve the page fault.
With debug options enabled call traces like this can be seen:

BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:1523
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 419074, name: qemu-system-s39
preempt_count: 1, expected: 0
RCU nest depth: 0, expected: 0
INFO: lockdep is turned off.
Preemption disabled at:
[<00000383ea47cfa2>] copy_page_from_iter_atomic+0xa2/0x8a0
CPU: 12 UID: 0 PID: 419074 Comm: qemu-system-s39
Tainted: G        W           6.16.0-20250531.rc0.git0.69b3a602feac.63.fc42.s390x+debug #1 PREEMPT
Tainted: [W]=WARN
Hardware name: IBM 3931 A01 703 (LPAR)
Call Trace:
 [<00000383e990d282>] dump_stack_lvl+0xa2/0xe8
 [<00000383e99bf152>] __might_resched+0x292/0x2d0
 [<00000383eaa7c374>] down_read+0x34/0x2d0
 [<00000383e99432f8>] do_secure_storage_access+0x108/0x360
 [<00000383eaa724b0>] __do_pgm_check+0x130/0x220
 [<00000383eaa842e4>] pgm_check_handler+0x114/0x160
 [<00000383ea47d028>] copy_page_from_iter_atomic+0x128/0x8a0
([<00000383ea47d016>] copy_page_from_iter_atomic+0x116/0x8a0)
 [<00000383e9c45eae>] generic_perform_write+0x16e/0x310
 [<00000383e9eb87f4>] ext4_buffered_write_iter+0x84/0x160
 [<00000383e9da0de4>] vfs_write+0x1c4/0x460
 [<00000383e9da123c>] ksys_write+0x7c/0x100
 [<00000383eaa7284e>] __do_syscall+0x15e/0x280
 [<00000383eaa8417e>] system_call+0x6e/0x90
INFO: lockdep is turned off.

It is not allowed to take the mmap_lock while in atomic context. Therefore
handle such a secure storage access fault as if the accessed page is not
mapped: the uaccess function will return -EFAULT, and the caller has to
deal with this. Usually this means that the access is retried in process
context, which allows to resolve the page fault (or in this case export the
page).

Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Acked-by: Alexander Gordeev <agordeev@linux.ibm.com>
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Link: https://lore.kernel.org/r/20250603134936.1314139-1-hca@linux.ibm.com
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/fault.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index da84ff6770de..8b3f6dd00eab 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -442,6 +442,8 @@ void do_secure_storage_access(struct pt_regs *regs)
 		if (rc)
 			BUG();
 	} else {
+		if (faulthandler_disabled())
+			return handle_fault_error_nolock(regs, 0);
 		mm = current->mm;
 		mmap_read_lock(mm);
 		vma = find_vma(mm, addr);
-- 
cgit v1.2.3


From 6678791ee3da0b78c28fe7d77814097f53cbb8df Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 3 Jun 2025 08:08:21 +0100
Subject: KVM: arm64: Add assignment-specific sysreg accessor

Assigning a value to a system register doesn't do what it is
supposed to be doing if that register is one that has RESx bits.

The main problem is that we use __vcpu_sys_reg(), which can be used
both as a lvalue and rvalue. When used as a lvalue, the bit masking
occurs *before* the new value is assigned, meaning that we (1) do
pointless work on the old cvalue, and (2) potentially assign an
invalid value as we fail to apply the masks to it.

Fix this by providing a new __vcpu_assign_sys_reg() that does
what it says on the tin, and sanitises the *new* value instead of
the old one. This comes with a significant amount of churn.

Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250603070824.1192795-2-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h          | 11 ++++++++
 arch/arm64/kvm/arch_timer.c                | 16 +++++------
 arch/arm64/kvm/hyp/exception.c             |  4 +--
 arch/arm64/kvm/hyp/include/hyp/switch.h    |  4 +--
 arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h |  6 ++--
 arch/arm64/kvm/hyp/nvhe/hyp-main.c         |  4 +--
 arch/arm64/kvm/hyp/vhe/switch.c            |  4 +--
 arch/arm64/kvm/hyp/vhe/sysreg-sr.c         | 44 +++++++++++++++---------------
 arch/arm64/kvm/pmu-emul.c                  | 14 +++++-----
 arch/arm64/kvm/sys_regs.c                  | 38 ++++++++++++++------------
 arch/arm64/kvm/sys_regs.h                  |  4 +--
 arch/arm64/kvm/vgic/vgic-v3-nested.c       | 10 +++----
 12 files changed, 86 insertions(+), 73 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index d941abc6b5ee..3b84ad91116b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1107,6 +1107,17 @@ static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r)
 #define ctxt_sys_reg(c,r)	(*__ctxt_sys_reg(c,r))
 
 u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64);
+
+#define __vcpu_assign_sys_reg(v, r, val)				\
+	do {								\
+		const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;	\
+		u64 __v = (val);					\
+		if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__)	\
+			__v = kvm_vcpu_apply_reg_masks((v), (r), __v);	\
+									\
+		ctxt_sys_reg(ctxt, (r)) = __v;				\
+	} while (0)
+
 #define __vcpu_sys_reg(v,r)						\
 	(*({								\
 		const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;	\
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 5133dcbfe9f7..5a67a6d4d95b 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -108,16 +108,16 @@ static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
 
 	switch(arch_timer_ctx_index(ctxt)) {
 	case TIMER_VTIMER:
-		__vcpu_sys_reg(vcpu, CNTV_CTL_EL0) = ctl;
+		__vcpu_assign_sys_reg(vcpu, CNTV_CTL_EL0, ctl);
 		break;
 	case TIMER_PTIMER:
-		__vcpu_sys_reg(vcpu, CNTP_CTL_EL0) = ctl;
+		__vcpu_assign_sys_reg(vcpu, CNTP_CTL_EL0, ctl);
 		break;
 	case TIMER_HVTIMER:
-		__vcpu_sys_reg(vcpu, CNTHV_CTL_EL2) = ctl;
+		__vcpu_assign_sys_reg(vcpu, CNTHV_CTL_EL2, ctl);
 		break;
 	case TIMER_HPTIMER:
-		__vcpu_sys_reg(vcpu, CNTHP_CTL_EL2) = ctl;
+		__vcpu_assign_sys_reg(vcpu, CNTHP_CTL_EL2, ctl);
 		break;
 	default:
 		WARN_ON(1);
@@ -130,16 +130,16 @@ static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
 
 	switch(arch_timer_ctx_index(ctxt)) {
 	case TIMER_VTIMER:
-		__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0) = cval;
+		__vcpu_assign_sys_reg(vcpu, CNTV_CVAL_EL0, cval);
 		break;
 	case TIMER_PTIMER:
-		__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = cval;
+		__vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, cval);
 		break;
 	case TIMER_HVTIMER:
-		__vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2) = cval;
+		__vcpu_assign_sys_reg(vcpu, CNTHV_CVAL_EL2, cval);
 		break;
 	case TIMER_HPTIMER:
-		__vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = cval;
+		__vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, cval);
 		break;
 	default:
 		WARN_ON(1);
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 424a5107cddb..6a2a899a344e 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -37,7 +37,7 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 	if (unlikely(vcpu_has_nv(vcpu)))
 		vcpu_write_sys_reg(vcpu, val, reg);
 	else if (!__vcpu_write_sys_reg_to_cpu(val, reg))
-		__vcpu_sys_reg(vcpu, reg) = val;
+		__vcpu_assign_sys_reg(vcpu, reg, val);
 }
 
 static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
@@ -51,7 +51,7 @@ static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
 	} else if (has_vhe()) {
 		write_sysreg_el1(val, SYS_SPSR);
 	} else {
-		__vcpu_sys_reg(vcpu, SPSR_EL1) = val;
+		__vcpu_assign_sys_reg(vcpu, SPSR_EL1, val);
 	}
 }
 
diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index eef310cdbdbd..aa5b561b9218 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -45,7 +45,7 @@ static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
 	if (!vcpu_el1_is_32bit(vcpu))
 		return;
 
-	__vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2);
+	__vcpu_assign_sys_reg(vcpu, FPEXC32_EL2, read_sysreg(fpexc32_el2));
 }
 
 static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
@@ -457,7 +457,7 @@ static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu)
 	 */
 	if (vcpu_has_sve(vcpu)) {
 		zcr_el1 = read_sysreg_el1(SYS_ZCR);
-		__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = zcr_el1;
+		__vcpu_assign_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu), zcr_el1);
 
 		/*
 		 * The guest's state is always saved using the guest's max VL.
diff --git a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
index b9cff893bbe0..4d0dbea4c56f 100644
--- a/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/sysreg-sr.h
@@ -307,11 +307,11 @@ static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
 	vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq);
 	vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq);
 
-	__vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2);
-	__vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2);
+	__vcpu_assign_sys_reg(vcpu, DACR32_EL2, read_sysreg(dacr32_el2));
+	__vcpu_assign_sys_reg(vcpu, IFSR32_EL2, read_sysreg(ifsr32_el2));
 
 	if (has_vhe() || kvm_debug_regs_in_use(vcpu))
-		__vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2);
+		__vcpu_assign_sys_reg(vcpu, DBGVCR32_EL2, read_sysreg(dbgvcr32_el2));
 }
 
 static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 8e8848de4d47..e9198e56e784 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -26,7 +26,7 @@ void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);
 
 static void __hyp_sve_save_guest(struct kvm_vcpu *vcpu)
 {
-	__vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR);
+	__vcpu_assign_sys_reg(vcpu, ZCR_EL1, read_sysreg_el1(SYS_ZCR));
 	/*
 	 * On saving/restoring guest sve state, always use the maximum VL for
 	 * the guest. The layout of the data when saving the sve state depends
@@ -79,7 +79,7 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
 
 	has_fpmr = kvm_has_fpmr(kern_hyp_va(vcpu->kvm));
 	if (has_fpmr)
-		__vcpu_sys_reg(vcpu, FPMR) = read_sysreg_s(SYS_FPMR);
+		__vcpu_assign_sys_reg(vcpu, FPMR, read_sysreg_s(SYS_FPMR));
 
 	if (system_supports_sve())
 		__hyp_sve_restore_host();
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index c9b330dc2066..09df2b42bc1b 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -223,9 +223,9 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu)
 		 */
 		val = read_sysreg_el0(SYS_CNTP_CVAL);
 		if (map.direct_ptimer == vcpu_ptimer(vcpu))
-			__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val;
+			__vcpu_assign_sys_reg(vcpu, CNTP_CVAL_EL0, val);
 		if (map.direct_ptimer == vcpu_hptimer(vcpu))
-			__vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val;
+			__vcpu_assign_sys_reg(vcpu, CNTHP_CVAL_EL2, val);
 
 		offset = read_sysreg_s(SYS_CNTPOFF_EL2);
 
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 3814b0b2c937..34c7bf7fe9de 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -18,17 +18,17 @@
 static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
 {
 	/* These registers are common with EL1 */
-	__vcpu_sys_reg(vcpu, PAR_EL1)	= read_sysreg(par_el1);
-	__vcpu_sys_reg(vcpu, TPIDR_EL1)	= read_sysreg(tpidr_el1);
-
-	__vcpu_sys_reg(vcpu, ESR_EL2)	= read_sysreg_el1(SYS_ESR);
-	__vcpu_sys_reg(vcpu, AFSR0_EL2)	= read_sysreg_el1(SYS_AFSR0);
-	__vcpu_sys_reg(vcpu, AFSR1_EL2)	= read_sysreg_el1(SYS_AFSR1);
-	__vcpu_sys_reg(vcpu, FAR_EL2)	= read_sysreg_el1(SYS_FAR);
-	__vcpu_sys_reg(vcpu, MAIR_EL2)	= read_sysreg_el1(SYS_MAIR);
-	__vcpu_sys_reg(vcpu, VBAR_EL2)	= read_sysreg_el1(SYS_VBAR);
-	__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR);
-	__vcpu_sys_reg(vcpu, AMAIR_EL2)	= read_sysreg_el1(SYS_AMAIR);
+	__vcpu_assign_sys_reg(vcpu, PAR_EL1,	 read_sysreg(par_el1));
+	__vcpu_assign_sys_reg(vcpu, TPIDR_EL1,	 read_sysreg(tpidr_el1));
+
+	__vcpu_assign_sys_reg(vcpu, ESR_EL2,	 read_sysreg_el1(SYS_ESR));
+	__vcpu_assign_sys_reg(vcpu, AFSR0_EL2,	 read_sysreg_el1(SYS_AFSR0));
+	__vcpu_assign_sys_reg(vcpu, AFSR1_EL2,	 read_sysreg_el1(SYS_AFSR1));
+	__vcpu_assign_sys_reg(vcpu, FAR_EL2,	 read_sysreg_el1(SYS_FAR));
+	__vcpu_assign_sys_reg(vcpu, MAIR_EL2,	 read_sysreg_el1(SYS_MAIR));
+	__vcpu_assign_sys_reg(vcpu, VBAR_EL2,	 read_sysreg_el1(SYS_VBAR));
+	__vcpu_assign_sys_reg(vcpu, CONTEXTIDR_EL2, read_sysreg_el1(SYS_CONTEXTIDR));
+	__vcpu_assign_sys_reg(vcpu, AMAIR_EL2,	 read_sysreg_el1(SYS_AMAIR));
 
 	/*
 	 * In VHE mode those registers are compatible between EL1 and EL2,
@@ -46,21 +46,21 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
 		 * are always trapped, ensuring that the in-memory
 		 * copy is always up-to-date. A small blessing...
 		 */
-		__vcpu_sys_reg(vcpu, SCTLR_EL2)	= read_sysreg_el1(SYS_SCTLR);
-		__vcpu_sys_reg(vcpu, TTBR0_EL2)	= read_sysreg_el1(SYS_TTBR0);
-		__vcpu_sys_reg(vcpu, TTBR1_EL2)	= read_sysreg_el1(SYS_TTBR1);
-		__vcpu_sys_reg(vcpu, TCR_EL2)	= read_sysreg_el1(SYS_TCR);
+		__vcpu_assign_sys_reg(vcpu, SCTLR_EL2,	 read_sysreg_el1(SYS_SCTLR));
+		__vcpu_assign_sys_reg(vcpu, TTBR0_EL2,	 read_sysreg_el1(SYS_TTBR0));
+		__vcpu_assign_sys_reg(vcpu, TTBR1_EL2,	 read_sysreg_el1(SYS_TTBR1));
+		__vcpu_assign_sys_reg(vcpu, TCR_EL2,	 read_sysreg_el1(SYS_TCR));
 
 		if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
-			__vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2);
+			__vcpu_assign_sys_reg(vcpu, TCR2_EL2, read_sysreg_el1(SYS_TCR2));
 
 			if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
-				__vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0);
-				__vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR);
+				__vcpu_assign_sys_reg(vcpu, PIRE0_EL2, read_sysreg_el1(SYS_PIRE0));
+				__vcpu_assign_sys_reg(vcpu, PIR_EL2, read_sysreg_el1(SYS_PIR));
 			}
 
 			if (ctxt_has_s1poe(&vcpu->arch.ctxt))
-				__vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR);
+				__vcpu_assign_sys_reg(vcpu, POR_EL2, read_sysreg_el1(SYS_POR));
 		}
 
 		/*
@@ -74,9 +74,9 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
 		__vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val;
 	}
 
-	__vcpu_sys_reg(vcpu, SP_EL2)	= read_sysreg(sp_el1);
-	__vcpu_sys_reg(vcpu, ELR_EL2)	= read_sysreg_el1(SYS_ELR);
-	__vcpu_sys_reg(vcpu, SPSR_EL2)	= read_sysreg_el1(SYS_SPSR);
+	__vcpu_assign_sys_reg(vcpu, SP_EL2,	 read_sysreg(sp_el1));
+	__vcpu_assign_sys_reg(vcpu, ELR_EL2,	 read_sysreg_el1(SYS_ELR));
+	__vcpu_assign_sys_reg(vcpu, SPSR_EL2,	 read_sysreg_el1(SYS_SPSR));
 }
 
 static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 25c29107f13f..4f0ae8073f78 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -178,7 +178,7 @@ static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
 		val |= lower_32_bits(val);
 	}
 
-	__vcpu_sys_reg(vcpu, reg) = val;
+	__vcpu_assign_sys_reg(vcpu, reg, val);
 
 	/* Recreate the perf event to reflect the updated sample_period */
 	kvm_pmu_create_perf_event(pmc);
@@ -204,7 +204,7 @@ void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
 void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
 {
 	kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, select_idx));
-	__vcpu_sys_reg(vcpu, counter_index_to_reg(select_idx)) = val;
+	__vcpu_assign_sys_reg(vcpu, counter_index_to_reg(select_idx), val);
 	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
 }
 
@@ -239,7 +239,7 @@ static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
 
 	reg = counter_index_to_reg(pmc->idx);
 
-	__vcpu_sys_reg(vcpu, reg) = val;
+	__vcpu_assign_sys_reg(vcpu, reg, val);
 
 	kvm_pmu_release_perf_event(pmc);
 }
@@ -503,7 +503,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
 		reg = __vcpu_sys_reg(vcpu, counter_index_to_reg(i)) + 1;
 		if (!kvm_pmc_is_64bit(pmc))
 			reg = lower_32_bits(reg);
-		__vcpu_sys_reg(vcpu, counter_index_to_reg(i)) = reg;
+		__vcpu_assign_sys_reg(vcpu, counter_index_to_reg(i), reg);
 
 		/* No overflow? move on */
 		if (kvm_pmc_has_64bit_overflow(pmc) ? reg : lower_32_bits(reg))
@@ -602,7 +602,7 @@ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
 		kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
 
 	/* The reset bits don't indicate any state, and shouldn't be saved. */
-	__vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P);
+	__vcpu_assign_sys_reg(vcpu, PMCR_EL0, (val & ~(ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P)));
 
 	if (val & ARMV8_PMU_PMCR_C)
 		kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);
@@ -779,7 +779,7 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
 	u64 reg;
 
 	reg = counter_index_to_evtreg(pmc->idx);
-	__vcpu_sys_reg(vcpu, reg) = data & kvm_pmu_evtyper_mask(vcpu->kvm);
+	__vcpu_assign_sys_reg(vcpu, reg, (data & kvm_pmu_evtyper_mask(vcpu->kvm)));
 
 	kvm_pmu_create_perf_event(pmc);
 }
@@ -1038,7 +1038,7 @@ static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr)
 			u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2);
 			val &= ~MDCR_EL2_HPMN;
 			val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters);
-			__vcpu_sys_reg(vcpu, MDCR_EL2) = val;
+			__vcpu_assign_sys_reg(vcpu, MDCR_EL2, val);
 		}
 	}
 }
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 707c651aff03..93d0ca7ed936 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -228,7 +228,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 		 * to reverse-translate virtual EL2 system registers for a
 		 * non-VHE guest hypervisor.
 		 */
-		__vcpu_sys_reg(vcpu, reg) = val;
+		__vcpu_assign_sys_reg(vcpu, reg, val);
 
 		switch (reg) {
 		case CNTHCTL_EL2:
@@ -263,7 +263,7 @@ void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
 		return;
 
 memory_write:
-	 __vcpu_sys_reg(vcpu, reg) = val;
+	__vcpu_assign_sys_reg(vcpu, reg, val);
 }
 
 /* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */
@@ -605,7 +605,7 @@ static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 	if ((val ^ rd->val) & ~OSLSR_EL1_OSLK)
 		return -EINVAL;
 
-	__vcpu_sys_reg(vcpu, rd->reg) = val;
+	__vcpu_assign_sys_reg(vcpu, rd->reg, val);
 	return 0;
 }
 
@@ -835,7 +835,7 @@ static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 	 * The value of PMCR.N field is included when the
 	 * vCPU register is read via kvm_vcpu_read_pmcr().
 	 */
-	__vcpu_sys_reg(vcpu, r->reg) = pmcr;
+	__vcpu_assign_sys_reg(vcpu, r->reg, pmcr);
 
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
@@ -907,7 +907,7 @@ static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		return false;
 
 	if (p->is_write)
-		__vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;
+		__vcpu_assign_sys_reg(vcpu, PMSELR_EL0, p->regval);
 	else
 		/* return PMSELR.SEL field */
 		p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0)
@@ -1076,7 +1076,7 @@ static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 va
 {
 	u64 mask = kvm_pmu_accessible_counter_mask(vcpu);
 
-	__vcpu_sys_reg(vcpu, r->reg) = val & mask;
+	__vcpu_assign_sys_reg(vcpu, r->reg, val & mask);
 	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
 
 	return 0;
@@ -1185,8 +1185,8 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		if (!vcpu_mode_priv(vcpu))
 			return undef_access(vcpu, p, r);
 
-		__vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
-			       p->regval & ARMV8_PMU_USERENR_MASK;
+		__vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0,
+				      (p->regval & ARMV8_PMU_USERENR_MASK));
 	} else {
 		p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0)
 			    & ARMV8_PMU_USERENR_MASK;
@@ -1237,7 +1237,7 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 	if (!kvm_supports_32bit_el0())
 		val |= ARMV8_PMU_PMCR_LC;
 
-	__vcpu_sys_reg(vcpu, r->reg) = val;
+	__vcpu_assign_sys_reg(vcpu, r->reg, val);
 	kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
 
 	return 0;
@@ -2207,7 +2207,7 @@ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 	if (kvm_has_mte(vcpu->kvm))
 		clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc);
 
-	__vcpu_sys_reg(vcpu, r->reg) = clidr;
+	__vcpu_assign_sys_reg(vcpu, r->reg, clidr);
 
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
@@ -2221,7 +2221,7 @@ static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
 	if ((val & CLIDR_EL1_RES0) || (!(ctr_el0 & CTR_EL0_IDC) && idc))
 		return -EINVAL;
 
-	__vcpu_sys_reg(vcpu, rd->reg) = val;
+	__vcpu_assign_sys_reg(vcpu, rd->reg, val);
 
 	return 0;
 }
@@ -2398,7 +2398,7 @@ static bool access_sp_el1(struct kvm_vcpu *vcpu,
 			  const struct sys_reg_desc *r)
 {
 	if (p->is_write)
-		__vcpu_sys_reg(vcpu, SP_EL1) = p->regval;
+		__vcpu_assign_sys_reg(vcpu, SP_EL1, p->regval);
 	else
 		p->regval = __vcpu_sys_reg(vcpu, SP_EL1);
 
@@ -2422,7 +2422,7 @@ static bool access_spsr(struct kvm_vcpu *vcpu,
 			const struct sys_reg_desc *r)
 {
 	if (p->is_write)
-		__vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval;
+		__vcpu_assign_sys_reg(vcpu, SPSR_EL1, p->regval);
 	else
 		p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1);
 
@@ -2434,7 +2434,7 @@ static bool access_cntkctl_el12(struct kvm_vcpu *vcpu,
 				const struct sys_reg_desc *r)
 {
 	if (p->is_write)
-		__vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval;
+		__vcpu_assign_sys_reg(vcpu, CNTKCTL_EL1, p->regval);
 	else
 		p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1);
 
@@ -2448,7 +2448,9 @@ static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 	if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
 		val |= HCR_E2H;
 
-	return __vcpu_sys_reg(vcpu, r->reg) = val;
+	__vcpu_assign_sys_reg(vcpu, r->reg, val);
+
+	return __vcpu_sys_reg(vcpu, r->reg);
 }
 
 static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu,
@@ -2619,7 +2621,7 @@ static bool access_mdcr(struct kvm_vcpu *vcpu,
 		u64_replace_bits(val, hpmn, MDCR_EL2_HPMN);
 	}
 
-	__vcpu_sys_reg(vcpu, MDCR_EL2) = val;
+	__vcpu_assign_sys_reg(vcpu, MDCR_EL2, val);
 
 	/*
 	 * Request a reload of the PMU to enable/disable the counters
@@ -2748,7 +2750,7 @@ static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
 
 static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
-	__vcpu_sys_reg(vcpu, r->reg) = vcpu->kvm->arch.nr_pmu_counters;
+	__vcpu_assign_sys_reg(vcpu, r->reg, vcpu->kvm->arch.nr_pmu_counters);
 	return vcpu->kvm->arch.nr_pmu_counters;
 }
 
@@ -5006,7 +5008,7 @@ int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
 	if (r->set_user) {
 		ret = (r->set_user)(vcpu, r, val);
 	} else {
-		__vcpu_sys_reg(vcpu, r->reg) = val;
+		__vcpu_assign_sys_reg(vcpu, r->reg, val);
 		ret = 0;
 	}
 
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index cc6338d38766..ef97d9fc67cc 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -137,7 +137,7 @@ static inline u64 reset_unknown(struct kvm_vcpu *vcpu,
 {
 	BUG_ON(!r->reg);
 	BUG_ON(r->reg >= NR_SYS_REGS);
-	__vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
+	__vcpu_assign_sys_reg(vcpu, r->reg, 0x1de7ec7edbadc0deULL);
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
 
@@ -145,7 +145,7 @@ static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
 	BUG_ON(!r->reg);
 	BUG_ON(r->reg >= NR_SYS_REGS);
-	__vcpu_sys_reg(vcpu, r->reg) = r->val;
+	__vcpu_assign_sys_reg(vcpu, r->reg, r->val);
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
 
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 4f6954c30674..d22a8ad7bcc5 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -356,12 +356,12 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
 	val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
 	val &= ~ICH_HCR_EL2_EOIcount_MASK;
 	val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
-	__vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val;
-	__vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr;
+	__vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val);
+	__vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr);
 
 	for (i = 0; i < 4; i++) {
-		__vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i];
-		__vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i];
+		__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
+		__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
 	}
 
 	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
@@ -370,7 +370,7 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
 		val &= ~ICH_LR_STATE;
 		val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
 
-		__vcpu_sys_reg(vcpu, ICH_LRN(i)) = val;
+		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
 		s_cpu_if->vgic_lr[i] = 0;
 	}
 
-- 
cgit v1.2.3


From 8800b7c4bbede3cd40831726d3f98e8080baf4df Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 3 Jun 2025 08:08:22 +0100
Subject: KVM: arm64: Add RMW specific sysreg accessor

In a number of cases, we perform a Read-Modify-Write operation on
a system register, meaning that we would apply the RESx masks twice.

Instead, provide a new accessor that performs this RMW operation,
allowing the masks to be applied exactly once per operation.

Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250603070824.1192795-3-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h  | 11 +++++++++++
 arch/arm64/kvm/debug.c             |  4 ++--
 arch/arm64/kvm/hyp/vhe/sysreg-sr.c |  4 ++--
 arch/arm64/kvm/nested.c            |  2 +-
 arch/arm64/kvm/pmu-emul.c          | 10 +++++-----
 arch/arm64/kvm/sys_regs.c          | 22 +++++++++++-----------
 6 files changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3b84ad91116b..8b8c649f225b 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1118,6 +1118,17 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64);
 		ctxt_sys_reg(ctxt, (r)) = __v;				\
 	} while (0)
 
+#define __vcpu_rmw_sys_reg(v, r, op, val)				\
+	do {								\
+		const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;	\
+		u64 __v = ctxt_sys_reg(ctxt, (r));			\
+		__v op (val);						\
+		if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__)	\
+			__v = kvm_vcpu_apply_reg_masks((v), (r), __v);	\
+									\
+		ctxt_sys_reg(ctxt, (r)) = __v;				\
+	} while (0)
+
 #define __vcpu_sys_reg(v,r)						\
 	(*({								\
 		const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;	\
diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c
index 0e4c805e7e89..1a7dab333f55 100644
--- a/arch/arm64/kvm/debug.c
+++ b/arch/arm64/kvm/debug.c
@@ -216,9 +216,9 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu)
 void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val)
 {
 	if (val & OSLAR_EL1_OSLK)
-		__vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK;
+		__vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, |=, OSLSR_EL1_OSLK);
 	else
-		__vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK;
+		__vcpu_rmw_sys_reg(vcpu, OSLSR_EL1, &=, ~OSLSR_EL1_OSLK);
 
 	preempt_disable();
 	kvm_arch_vcpu_put(vcpu);
diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
index 34c7bf7fe9de..73e4bc7fde9e 100644
--- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c
@@ -70,8 +70,8 @@ static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
 		 */
 		val = read_sysreg_el1(SYS_CNTKCTL);
 		val &= CNTKCTL_VALID_BITS;
-		__vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS;
-		__vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val;
+		__vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, &=, ~CNTKCTL_VALID_BITS);
+		__vcpu_rmw_sys_reg(vcpu, CNTHCTL_EL2, |=, val);
 	}
 
 	__vcpu_assign_sys_reg(vcpu, SP_EL2,	 read_sysreg(sp_el1));
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index 4a53e4147fb0..5b191f4dc566 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -1757,7 +1757,7 @@ int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
 
 out:
 	for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
-		(void)__vcpu_sys_reg(vcpu, sr);
+		__vcpu_rmw_sys_reg(vcpu, sr, |=, 0);
 
 	return 0;
 }
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 4f0ae8073f78..b03dbda7f1ab 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -510,7 +510,7 @@ static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
 			continue;
 
 		/* Mark overflow */
-		__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+		__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(i));
 
 		if (kvm_pmu_counter_can_chain(pmc))
 			kvm_pmu_counter_increment(vcpu, BIT(i + 1),
@@ -556,7 +556,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
 	perf_event->attr.sample_period = period;
 	perf_event->hw.sample_period = period;
 
-	__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);
+	__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, BIT(idx));
 
 	if (kvm_pmu_counter_can_chain(pmc))
 		kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
@@ -914,9 +914,9 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu)
 {
 	u64 mask = kvm_pmu_implemented_counter_mask(vcpu);
 
-	__vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= mask;
-	__vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= mask;
-	__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= mask;
+	__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, mask);
+	__vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, mask);
+	__vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, mask);
 
 	kvm_pmu_reprogram_counter_mask(vcpu, mask);
 }
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 93d0ca7ed936..e89d345e42d0 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -791,7 +791,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 		mask |= GENMASK(n - 1, 0);
 
 	reset_unknown(vcpu, r);
-	__vcpu_sys_reg(vcpu, r->reg) &= mask;
+	__vcpu_rmw_sys_reg(vcpu, r->reg, &=, mask);
 
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
@@ -799,7 +799,7 @@ static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
 	reset_unknown(vcpu, r);
-	__vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0);
+	__vcpu_rmw_sys_reg(vcpu, r->reg, &=, GENMASK(31, 0));
 
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
@@ -811,7 +811,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 		return 0;
 
 	reset_unknown(vcpu, r);
-	__vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_evtyper_mask(vcpu->kvm);
+	__vcpu_rmw_sys_reg(vcpu, r->reg, &=, kvm_pmu_evtyper_mask(vcpu->kvm));
 
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
@@ -819,7 +819,7 @@ static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
 	reset_unknown(vcpu, r);
-	__vcpu_sys_reg(vcpu, r->reg) &= PMSELR_EL0_SEL_MASK;
+	__vcpu_rmw_sys_reg(vcpu, r->reg, &=, PMSELR_EL0_SEL_MASK);
 
 	return __vcpu_sys_reg(vcpu, r->reg);
 }
@@ -1103,10 +1103,10 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 		val = p->regval & mask;
 		if (r->Op2 & 0x1)
 			/* accessing PMCNTENSET_EL0 */
-			__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
+			__vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val);
 		else
 			/* accessing PMCNTENCLR_EL0 */
-			__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
+			__vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val);
 
 		kvm_pmu_reprogram_counter_mask(vcpu, val);
 	} else {
@@ -1129,10 +1129,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 
 		if (r->Op2 & 0x1)
 			/* accessing PMINTENSET_EL1 */
-			__vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val;
+			__vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val);
 		else
 			/* accessing PMINTENCLR_EL1 */
-			__vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
+			__vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val);
 	} else {
 		p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
 	}
@@ -1151,10 +1151,10 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 	if (p->is_write) {
 		if (r->CRm & 0x2)
 			/* accessing PMOVSSET_EL0 */
-			__vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= (p->regval & mask);
+			__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask));
 		else
 			/* accessing PMOVSCLR_EL0 */
-			__vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
+			__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask));
 	} else {
 		p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
 	}
@@ -4786,7 +4786,7 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
 			r->reset(vcpu, r);
 
 		if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS)
-			(void)__vcpu_sys_reg(vcpu, r->reg);
+			__vcpu_rmw_sys_reg(vcpu, r->reg, |=, 0);
 	}
 
 	set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);
-- 
cgit v1.2.3


From b61fae4ee120f337b8700dff43b2fd639f3bf6a5 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 3 Jun 2025 08:08:23 +0100
Subject: KVM: arm64: Don't use __vcpu_sys_reg() to get the address of a sysreg

We are about to prevent the use of __vcpu_sys_reg() as a lvalue,
and getting the address of a rvalue is not a thing.

Update the couple of places where we do this to use the __ctxt_sys_reg()
accessor, which return the address of a register.

Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250603070824.1192795-4-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/arch_timer.c | 2 +-
 arch/arm64/kvm/fpsimd.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 5a67a6d4d95b..c6b4fb4c8e08 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -1036,7 +1036,7 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
 	if (vcpu_has_nv(vcpu)) {
 		struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset;
 
-		offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
+		offs->vcpu_offset = __ctxt_sys_reg(&vcpu->arch.ctxt, CNTVOFF_EL2);
 		offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset;
 	}
 
diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c
index 7f6e43d25691..8f6c8f57c6b9 100644
--- a/arch/arm64/kvm/fpsimd.c
+++ b/arch/arm64/kvm/fpsimd.c
@@ -103,8 +103,8 @@ void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
 		fp_state.sve_state = vcpu->arch.sve_state;
 		fp_state.sve_vl = vcpu->arch.sve_max_vl;
 		fp_state.sme_state = NULL;
-		fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR);
-		fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR);
+		fp_state.svcr = __ctxt_sys_reg(&vcpu->arch.ctxt, SVCR);
+		fp_state.fpmr = __ctxt_sys_reg(&vcpu->arch.ctxt, FPMR);
 		fp_state.fp_type = &vcpu->arch.fp_type;
 
 		if (vcpu_has_sve(vcpu))
-- 
cgit v1.2.3


From b5fa1f91e11fdf74ad4e2ac6dae246a57cbd2d95 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Tue, 3 Jun 2025 08:08:24 +0100
Subject: KVM: arm64: Make __vcpu_sys_reg() a pure rvalue operand

Now that we don't have any use of __vcpu_sys_reg() as a lvalue,
remove the in-place update, and directly return the sanitised
value.

Reviewed-by: Miguel Luis <miguel.luis@oracle.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250603070824.1192795-5-maz@kernel.org
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 8b8c649f225b..382b382d14da 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1130,13 +1130,13 @@ u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64);
 	} while (0)
 
 #define __vcpu_sys_reg(v,r)						\
-	(*({								\
+	({								\
 		const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;	\
-		u64 *__r = __ctxt_sys_reg(ctxt, (r));			\
+		u64 __v = ctxt_sys_reg(ctxt, (r));			\
 		if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__)	\
-			*__r = kvm_vcpu_apply_reg_masks((v), (r), *__r);\
-		__r;							\
-	}))
+			__v = kvm_vcpu_apply_reg_masks((v), (r), __v);	\
+		__v;							\
+	})
 
 u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
 void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);
-- 
cgit v1.2.3


From f8693f6dffcd1865da7f5f4c3df1de445e519bec Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 8 Apr 2025 02:08:25 +0800
Subject: riscv: ftrace: support fastcc in Clang for WITH_ARGS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some caller-saved registers which are not defined as function arguments
in the ABI can still be passed as arguments when the kernel is compiled
with Clang. As a result, we must save and restore those registers to
prevent ftrace from clobbering them.

- [1]: https://reviews.llvm.org/D68559

Reported-by: Evgenii Shatokhin <e.shatokhin@yadro.com>
Closes: https://lore.kernel.org/linux-riscv/7e7c7914-445d-426d-89a0-59a9199c45b1@yadro.com/
Fixes: 7caa9765465f ("ftrace: riscv: move from REGS to ARGS")
Acked-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250407180838.42877-1-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/ftrace.h |  7 +++++++
 arch/riscv/kernel/asm-offsets.c |  7 +++++++
 arch/riscv/kernel/mcount-dyn.S  | 16 ++++++++++++++--
 3 files changed, 28 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index d627f63ee289..d8b2138bd9c6 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -146,6 +146,13 @@ struct __arch_ftrace_regs {
 			unsigned long a5;
 			unsigned long a6;
 			unsigned long a7;
+#ifdef CONFIG_CC_IS_CLANG
+			unsigned long t2;
+			unsigned long t3;
+			unsigned long t4;
+			unsigned long t5;
+			unsigned long t6;
+#endif
 		};
 	};
 };
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 16490755304e..7c43c8e26ae7 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -501,6 +501,13 @@ void asm_offsets(void)
 	DEFINE(FREGS_SP,	    offsetof(struct __arch_ftrace_regs, sp));
 	DEFINE(FREGS_S0,	    offsetof(struct __arch_ftrace_regs, s0));
 	DEFINE(FREGS_T1,	    offsetof(struct __arch_ftrace_regs, t1));
+#ifdef CONFIG_CC_IS_CLANG
+	DEFINE(FREGS_T2,	    offsetof(struct __arch_ftrace_regs, t2));
+	DEFINE(FREGS_T3,	    offsetof(struct __arch_ftrace_regs, t3));
+	DEFINE(FREGS_T4,	    offsetof(struct __arch_ftrace_regs, t4));
+	DEFINE(FREGS_T5,	    offsetof(struct __arch_ftrace_regs, t5));
+	DEFINE(FREGS_T6,	    offsetof(struct __arch_ftrace_regs, t6));
+#endif
 	DEFINE(FREGS_A0,	    offsetof(struct __arch_ftrace_regs, a0));
 	DEFINE(FREGS_A1,	    offsetof(struct __arch_ftrace_regs, a1));
 	DEFINE(FREGS_A2,	    offsetof(struct __arch_ftrace_regs, a2));
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index 745dd4c4a69c..e988bd26b28b 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -96,7 +96,13 @@
 	REG_S	x8,  FREGS_S0(sp)
 #endif
 	REG_S	x6,  FREGS_T1(sp)
-
+#ifdef CONFIG_CC_IS_CLANG
+	REG_S	x7,  FREGS_T2(sp)
+	REG_S	x28, FREGS_T3(sp)
+	REG_S	x29, FREGS_T4(sp)
+	REG_S	x30, FREGS_T5(sp)
+	REG_S	x31, FREGS_T6(sp)
+#endif
 	// save the arguments
 	REG_S	x10, FREGS_A0(sp)
 	REG_S	x11, FREGS_A1(sp)
@@ -115,7 +121,13 @@
 	REG_L	x8, FREGS_S0(sp)
 #endif
 	REG_L	x6,  FREGS_T1(sp)
-
+#ifdef CONFIG_CC_IS_CLANG
+	REG_L	x7,  FREGS_T2(sp)
+	REG_L	x28, FREGS_T3(sp)
+	REG_L	x29, FREGS_T4(sp)
+	REG_L	x30, FREGS_T5(sp)
+	REG_L	x31, FREGS_T6(sp)
+#endif
 	// restore the arguments
 	REG_L	x10, FREGS_A0(sp)
 	REG_L	x11, FREGS_A1(sp)
-- 
cgit v1.2.3


From 54ecbc8d857122b32a98fe204cb61a06aabc063d Mon Sep 17 00:00:00 2001
From: Andy Chiu <andybnac@gmail.com>
Date: Tue, 8 Apr 2025 02:08:26 +0800
Subject: riscv: ftrace factor out code defined by !WITH_ARG

DYNAMIC_FTRACE selects DYNAMIC_FTRACE_WITH_ARGS and mcount-dyn.S in
riscv, so we can remove ifdef jargons of WITH_ARG when it is known that
DYNAMIC_FTRACE is true.

Signed-off-by: Andy Chiu <andybnac@gmail.com>
Link: https://lore.kernel.org/r/20250407180838.42877-2-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/ftrace.c     | 15 ---------------
 arch/riscv/kernel/mcount-dyn.S | 34 ----------------------------------
 2 files changed, 49 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 674dcdfae7a1..1fd10555c580 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -210,7 +210,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 }
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
@@ -231,19 +230,5 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 	if (!function_graph_enter_regs(old, ip, frame_pointer, parent, fregs))
 		*parent = return_hooker;
 }
-#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
-extern void ftrace_graph_call(void);
-int ftrace_enable_ftrace_graph_caller(void)
-{
-	return __ftrace_modify_call((unsigned long)&ftrace_graph_call,
-				    (unsigned long)&prepare_ftrace_return, true, true);
-}
-
-int ftrace_disable_ftrace_graph_caller(void)
-{
-	return __ftrace_modify_call((unsigned long)&ftrace_graph_call,
-				    (unsigned long)&prepare_ftrace_return, false, true);
-}
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index e988bd26b28b..3f06b40bb6c8 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -56,8 +56,6 @@
 	addi	sp, sp, ABI_SIZE_ON_STACK
 	.endm
 
-#ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
-
 /**
 * SAVE_ABI_REGS - save regs against the ftrace_regs struct
 *
@@ -149,36 +147,6 @@
 	mv	a3, sp
 	.endm
 
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
-
-#ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
-SYM_FUNC_START(ftrace_caller)
-	SAVE_ABI
-
-	addi	a0, t0, -FENTRY_RA_OFFSET
-	la	a1, function_trace_op
-	REG_L	a2, 0(a1)
-	mv	a1, ra
-	mv	a3, sp
-
-SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
-	call	ftrace_stub
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	addi	a0, sp, ABI_RA
-	REG_L	a1, ABI_T0(sp)
-	addi	a1, a1, -FENTRY_RA_OFFSET
-#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-	mv	a2, s0
-#endif
-SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
-	call	ftrace_stub
-#endif
-	RESTORE_ABI
-	jr	t0
-SYM_FUNC_END(ftrace_caller)
-
-#else /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 SYM_FUNC_START(ftrace_caller)
 	mv	t1, zero
 	SAVE_ABI_REGS
@@ -194,8 +162,6 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
 	jr	t1
 SYM_FUNC_END(ftrace_caller)
 
-#endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
-
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 SYM_CODE_START(ftrace_stub_direct_tramp)
 	jr	t0
-- 
cgit v1.2.3


From c41bf4326c7b0af3f7004235ac91551833ec95c6 Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 8 Apr 2025 02:08:27 +0800
Subject: riscv: ftrace: align patchable functions to 4 Byte boundary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are changing ftrace code patching in order to remove dependency from
stop_machine() and enable kernel preemption. This requires us to align
functions entry at a 4-B align address.

However, -falign-functions on older versions of GCC alone was not strong
enoungh to align all functions. In fact, cold functions are not aligned
after turning on optimizations. We consider this is a bug in GCC and
turn off guess-branch-probility as a workaround to align all functions.

GCC bug id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345

The option -fmin-function-alignment is able to align all functions
properly on newer versions of gcc. So, we add a cc-option to test if
the toolchain supports it.

Suggested-by: Evgenii Shatokhin <e.shatokhin@yadro.com>
Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250407180838.42877-3-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bbec87b79309..7dbed10843d2 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -150,6 +150,7 @@ config RISCV
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
+	select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C
 	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_GRAPH_FUNC
@@ -236,6 +237,7 @@ config CLANG_SUPPORTS_DYNAMIC_FTRACE
 config GCC_SUPPORTS_DYNAMIC_FTRACE
 	def_bool CC_IS_GCC
 	depends on $(cc-option,-fpatchable-function-entry=8)
+	depends on CC_HAS_MIN_FUNCTION_ALIGNMENT || !RISCV_ISA_C
 
 config HAVE_SHADOW_CALL_STACK
 	def_bool $(cc-option,-fsanitize=shadow-call-stack)
-- 
cgit v1.2.3


From b2137c3b6d7a45622967aac78b46a4bd2cff6c42 Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 8 Apr 2025 02:08:29 +0800
Subject: riscv: ftrace: prepare ftrace for atomic code patching
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We use an AUIPC+JALR pair to jump into a ftrace trampoline. Since
instruction fetch can break down to 4 byte at a time, it is impossible
to update two instructions without a race. In order to mitigate it, we
initialize the patchable entry to AUIPC + NOP4. Then, the run-time code
patching can change NOP4 to JALR to eable/disable ftrcae from a
function. This limits the reach of each ftrace entry to +-2KB displacing
from ftrace_caller.

Starting from the trampoline, we add a level of indirection for it to
reach ftrace caller target. Now, it loads the target address from a
memory location, then perform the jump. This enable the kernel to update
the target atomically.

The new don't-stop-the-world text patching on change only one RISC-V
instruction:

  |  -8: &ftrace_ops of the associated tracer function.
  | <ftrace enable>:
  |   0: auipc  t0, hi(ftrace_caller)
  |   4: jalr   t0, lo(ftrace_caller)
  |
  |  -8: &ftrace_nop_ops
  | <ftrace disable>:
  |   0: auipc  t0, hi(ftrace_caller)
  |   4: nop

This means that f+0x0 is fixed, and should not be claimed by ftrace,
e.g. kprobe should be able to put a probe in f+0x0. Thus, we adjust the
offset and MCOUNT_INSN_SIZE accordingly.

[ alex: Fix build errors with !CONFIG_DYNAMIC_FTRACE ]

Co-developed-by: Björn Töpel <bjorn@rivosinc.com>
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Link: https://lore.kernel.org/r/20250407180838.42877-5-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/ftrace.h |  49 +++++++-------
 arch/riscv/kernel/ftrace.c      | 137 +++++++++++++++++++++-------------------
 arch/riscv/kernel/mcount-dyn.S  |   9 ++-
 3 files changed, 98 insertions(+), 97 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index d8b2138bd9c6..6a5c0a7fb826 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -20,10 +20,9 @@ extern void *return_address(unsigned int level);
 #define ftrace_return_address(n) return_address(n)
 
 void _mcount(void);
-static inline unsigned long ftrace_call_adjust(unsigned long addr)
-{
-	return addr;
-}
+unsigned long ftrace_call_adjust(unsigned long addr);
+unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip);
+#define ftrace_get_symaddr(fentry_ip) arch_ftrace_get_symaddr(fentry_ip)
 
 /*
  * Let's do like x86/arm64 and ignore the compat syscalls.
@@ -57,12 +56,21 @@ struct dyn_arch_ftrace {
  * 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to
  *          return address (original pc + 4)
  *
+ * The first 2 instructions for each tracable function is compiled to 2 nop
+ * instructions. Then, the kernel initializes the first instruction to auipc at
+ * boot time (<ftrace disable>). The second instruction is patched to jalr to
+ * start the trace.
+ *
+ *<Image>:
+ * 0: nop
+ * 4: nop
+ *
  *<ftrace enable>:
- * 0: auipc  t0/ra, 0x?
- * 4: jalr   t0/ra, ?(t0/ra)
+ * 0: auipc  t0, 0x?
+ * 4: jalr   t0, ?(t0)
  *
  *<ftrace disable>:
- * 0: nop
+ * 0: auipc  t0, 0x?
  * 4: nop
  *
  * Dynamic ftrace generates probes to call sites, so we must deal with
@@ -75,10 +83,9 @@ struct dyn_arch_ftrace {
 #define AUIPC_OFFSET_MASK	(0xfffff000)
 #define AUIPC_PAD		(0x00001000)
 #define JALR_SHIFT		20
-#define JALR_RA			(0x000080e7)
-#define AUIPC_RA		(0x00000097)
 #define JALR_T0			(0x000282e7)
 #define AUIPC_T0		(0x00000297)
+#define JALR_RANGE		(JALR_SIGN_MASK - 1)
 
 #define to_jalr_t0(offset)						\
 	(((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_T0)
@@ -96,26 +103,14 @@ do {									\
 	call[1] = to_jalr_t0(offset);					\
 } while (0)
 
-#define to_jalr_ra(offset)						\
-	(((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_RA)
-
-#define to_auipc_ra(offset)						\
-	((offset & JALR_SIGN_MASK) ?					\
-	(((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_RA) :	\
-	((offset & AUIPC_OFFSET_MASK) | AUIPC_RA))
-
-#define make_call_ra(caller, callee, call)				\
-do {									\
-	unsigned int offset =						\
-		(unsigned long) (callee) - (unsigned long) (caller);	\
-	call[0] = to_auipc_ra(offset);					\
-	call[1] = to_jalr_ra(offset);					\
-} while (0)
-
 /*
- * Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here.
+ * Only the jalr insn in the auipc+jalr is patched, so we make it 4
+ * bytes here.
  */
-#define MCOUNT_INSN_SIZE 8
+#define MCOUNT_INSN_SIZE	4
+#define MCOUNT_AUIPC_SIZE	4
+#define MCOUNT_JALR_SIZE	4
+#define MCOUNT_NOP4_SIZE	4
 
 #ifndef __ASSEMBLY__
 struct dyn_ftrace;
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 1fd10555c580..ea04f09f9d4d 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -8,11 +8,22 @@
 #include <linux/ftrace.h>
 #include <linux/uaccess.h>
 #include <linux/memory.h>
+#include <linux/irqflags.h>
 #include <linux/stop_machine.h>
 #include <asm/cacheflush.h>
 #include <asm/text-patching.h>
 
 #ifdef CONFIG_DYNAMIC_FTRACE
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr + MCOUNT_AUIPC_SIZE;
+}
+
+unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip)
+{
+	return fentry_ip - MCOUNT_AUIPC_SIZE;
+}
+
 void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex)
 {
 	mutex_lock(&text_mutex);
@@ -32,51 +43,32 @@ void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex)
 	mutex_unlock(&text_mutex);
 }
 
-static int ftrace_check_current_call(unsigned long hook_pos,
-				     unsigned int *expected)
+static int __ftrace_modify_call(unsigned long source, unsigned long target, bool validate)
 {
+	unsigned int call[2], offset;
 	unsigned int replaced[2];
-	unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4};
 
-	/* we expect nops at the hook position */
-	if (!expected)
-		expected = nops;
+	offset = target - source;
+	call[1] = to_jalr_t0(offset);
 
-	/*
-	 * Read the text we want to modify;
-	 * return must be -EFAULT on read error
-	 */
-	if (copy_from_kernel_nofault(replaced, (void *)hook_pos,
-			MCOUNT_INSN_SIZE))
-		return -EFAULT;
-
-	/*
-	 * Make sure it is what we expect it to be;
-	 * return must be -EINVAL on failed comparison
-	 */
-	if (memcmp(expected, replaced, sizeof(replaced))) {
-		pr_err("%p: expected (%08x %08x) but got (%08x %08x)\n",
-		       (void *)hook_pos, expected[0], expected[1], replaced[0],
-		       replaced[1]);
-		return -EINVAL;
+	if (validate) {
+		call[0] = to_auipc_t0(offset);
+		/*
+		 * Read the text we want to modify;
+		 * return must be -EFAULT on read error
+		 */
+		if (copy_from_kernel_nofault(replaced, (void *)source, 2 * MCOUNT_INSN_SIZE))
+			return -EFAULT;
+
+		if (replaced[0] != call[0]) {
+			pr_err("%p: expected (%08x) but got (%08x)\n",
+			       (void *)source, call[0], replaced[0]);
+			return -EINVAL;
+		}
 	}
 
-	return 0;
-}
-
-static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
-				bool enable, bool ra)
-{
-	unsigned int call[2];
-	unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4};
-
-	if (ra)
-		make_call_ra(hook_pos, target, call);
-	else
-		make_call_t0(hook_pos, target, call);
-
-	/* Replace the auipc-jalr pair at once. Return -EPERM on write error. */
-	if (patch_insn_write((void *)hook_pos, enable ? call : nops, MCOUNT_INSN_SIZE))
+	/* Replace the jalr at once. Return -EPERM on write error. */
+	if (patch_insn_write((void *)(source + MCOUNT_AUIPC_SIZE), call + 1, MCOUNT_JALR_SIZE))
 		return -EPERM;
 
 	return 0;
@@ -84,22 +76,21 @@ static int __ftrace_modify_call(unsigned long hook_pos, unsigned long target,
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned int call[2];
-
-	make_call_t0(rec->ip, addr, call);
+	unsigned long distance, orig_addr, pc = rec->ip - MCOUNT_AUIPC_SIZE;
 
-	if (patch_insn_write((void *)rec->ip, call, MCOUNT_INSN_SIZE))
-		return -EPERM;
+	orig_addr = (unsigned long)&ftrace_caller;
+	distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr;
+	if (distance > JALR_RANGE)
+		return -EINVAL;
 
-	return 0;
+	return __ftrace_modify_call(pc, addr, false);
 }
 
-int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
-		    unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned int nops[2] = {RISCV_INSN_NOP4, RISCV_INSN_NOP4};
+	u32 nop4 = RISCV_INSN_NOP4;
 
-	if (patch_insn_write((void *)rec->ip, nops, MCOUNT_INSN_SIZE))
+	if (patch_insn_write((void *)rec->ip, &nop4, MCOUNT_NOP4_SIZE))
 		return -EPERM;
 
 	return 0;
@@ -114,21 +105,38 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
  */
 int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
 {
-	int out;
+	unsigned long pc = rec->ip - MCOUNT_AUIPC_SIZE;
+	unsigned int nops[2], offset;
+	int ret;
+
+	offset = (unsigned long) &ftrace_caller - pc;
+	nops[0] = to_auipc_t0(offset);
+	nops[1] = RISCV_INSN_NOP4;
 
 	mutex_lock(&text_mutex);
-	out = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
+	ret = patch_insn_write((void *)pc, nops, 2 * MCOUNT_INSN_SIZE);
 	mutex_unlock(&text_mutex);
 
-	return out;
+	return ret;
 }
 
+ftrace_func_t ftrace_call_dest = ftrace_stub;
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
-	int ret = __ftrace_modify_call((unsigned long)&ftrace_call,
-				       (unsigned long)func, true, true);
-
-	return ret;
+	WRITE_ONCE(ftrace_call_dest, func);
+	/*
+	 * The data fence ensure that the update to ftrace_call_dest happens
+	 * before the write to function_trace_op later in the generic ftrace.
+	 * If the sequence is not enforced, then an old ftrace_call_dest may
+	 * race loading a new function_trace_op set in ftrace_modify_all_code
+	 *
+	 * If we are in stop_machine, then we don't need to call remote fence
+	 * as there is no concurrent read-side of ftrace_call_dest.
+	 */
+	smp_wmb();
+	if (!irqs_disabled())
+		smp_call_function(ftrace_sync_ipi, NULL, 1);
+	return 0;
 }
 
 struct ftrace_modify_param {
@@ -166,23 +174,22 @@ void arch_ftrace_update_code(int command)
 
 	stop_machine(__ftrace_modify_code, &param, cpu_online_mask);
 }
-#endif
+#else /* CONFIG_DYNAMIC_FTRACE */
+unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 		       unsigned long addr)
 {
+	unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE;
 	unsigned int call[2];
-	unsigned long caller = rec->ip;
-	int ret;
 
 	make_call_t0(caller, old_addr, call);
-	ret = ftrace_check_current_call(caller, call);
-
-	if (ret)
-		return ret;
-
-	return __ftrace_modify_call(caller, addr, true, false);
+	return __ftrace_modify_call(caller, addr, true);
 }
 #endif
 
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index 3f06b40bb6c8..8aa554d56096 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -13,7 +13,6 @@
 
 	.text
 
-#define FENTRY_RA_OFFSET	8
 #define ABI_SIZE_ON_STACK	80
 #define ABI_A0			0
 #define ABI_A1			8
@@ -62,8 +61,7 @@
 * After the stack is established,
 *
 * 0(sp) stores the PC of the traced function which can be accessed
-* by &(fregs)->epc in tracing function. Note that the real
-* function entry address should be computed with -FENTRY_RA_OFFSET.
+* by &(fregs)->epc in tracing function.
 *
 * 8(sp) stores the function return address (i.e. parent IP) that
 * can be accessed by &(fregs)->ra in tracing function.
@@ -140,7 +138,7 @@
 	.endm
 
 	.macro PREPARE_ARGS
-	addi	a0, t0, -FENTRY_RA_OFFSET
+	addi	a0, t0, -MCOUNT_JALR_SIZE	// ip (callsite's jalr insn)
 	la	a1, function_trace_op
 	REG_L	a2, 0(a1)
 	mv	a1, ra
@@ -153,7 +151,8 @@ SYM_FUNC_START(ftrace_caller)
 	PREPARE_ARGS
 
 SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
-	call	ftrace_stub
+	REG_L	ra, ftrace_call_dest
+	jalr	ra, 0(ra)
 
 	RESTORE_ABI_REGS
 	bnez	t1, .Ldirect
-- 
cgit v1.2.3


From 5aa4ef95588456df5a5563dd23892827f97fb14f Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 8 Apr 2025 02:08:30 +0800
Subject: riscv: ftrace: do not use stop_machine to update code

Now it is safe to remove dependency from stop_machine() for us to patch
code in ftrace.

Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Link: https://lore.kernel.org/r/20250407180838.42877-6-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/ftrace.c | 64 ++++++++--------------------------------------
 1 file changed, 10 insertions(+), 54 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index ea04f09f9d4d..b133c60808fe 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -24,23 +24,13 @@ unsigned long arch_ftrace_get_symaddr(unsigned long fentry_ip)
 	return fentry_ip - MCOUNT_AUIPC_SIZE;
 }
 
-void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex)
+void arch_ftrace_update_code(int command)
 {
 	mutex_lock(&text_mutex);
-
-	/*
-	 * The code sequences we use for ftrace can't be patched while the
-	 * kernel is running, so we need to use stop_machine() to modify them
-	 * for now.  This doesn't play nice with text_mutex, we use this flag
-	 * to elide the check.
-	 */
-	riscv_patch_in_stop_machine = true;
-}
-
-void ftrace_arch_code_modify_post_process(void) __releases(&text_mutex)
-{
-	riscv_patch_in_stop_machine = false;
+	command |= FTRACE_MAY_SLEEP;
+	ftrace_modify_all_code(command);
 	mutex_unlock(&text_mutex);
+	flush_icache_all();
 }
 
 static int __ftrace_modify_call(unsigned long source, unsigned long target, bool validate)
@@ -129,51 +119,17 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 	 * before the write to function_trace_op later in the generic ftrace.
 	 * If the sequence is not enforced, then an old ftrace_call_dest may
 	 * race loading a new function_trace_op set in ftrace_modify_all_code
-	 *
-	 * If we are in stop_machine, then we don't need to call remote fence
-	 * as there is no concurrent read-side of ftrace_call_dest.
 	 */
 	smp_wmb();
-	if (!irqs_disabled())
-		smp_call_function(ftrace_sync_ipi, NULL, 1);
-	return 0;
-}
-
-struct ftrace_modify_param {
-	int command;
-	atomic_t cpu_count;
-};
-
-static int __ftrace_modify_code(void *data)
-{
-	struct ftrace_modify_param *param = data;
-
-	if (atomic_inc_return(&param->cpu_count) == num_online_cpus()) {
-		ftrace_modify_all_code(param->command);
-		/*
-		 * Make sure the patching store is effective *before* we
-		 * increment the counter which releases all waiting CPUs
-		 * by using the release variant of atomic increment. The
-		 * release pairs with the call to local_flush_icache_all()
-		 * on the waiting CPU.
-		 */
-		atomic_inc_return_release(&param->cpu_count);
-	} else {
-		while (atomic_read(&param->cpu_count) <= num_online_cpus())
-			cpu_relax();
-
-		local_flush_icache_all();
-	}
-
+	/*
+	 * Updating ftrace dpes not take stop_machine path, so irqs should not
+	 * be disabled.
+	 */
+	WARN_ON(irqs_disabled());
+	smp_call_function(ftrace_sync_ipi, NULL, 1);
 	return 0;
 }
 
-void arch_ftrace_update_code(int command)
-{
-	struct ftrace_modify_param param = { command, ATOMIC_INIT(0) };
-
-	stop_machine(__ftrace_modify_code, &param, cpu_online_mask);
-}
 #else /* CONFIG_DYNAMIC_FTRACE */
 unsigned long ftrace_call_adjust(unsigned long addr)
 {
-- 
cgit v1.2.3


From d1049fc0de81bca3abbb35e8d4b8794170498b78 Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 8 Apr 2025 02:08:31 +0800
Subject: riscv: vector: Support calling schedule() for preemptible Vector

Each function entry implies a call to ftrace infrastructure. And it may
call into schedule in some cases. So, it is possible for preemptible
kernel-mode Vector to implicitly call into schedule. Since all V-regs
are caller-saved, it is possible to drop all V context when a thread
voluntarily call schedule(). Besides, we currently don't pass argument
through vector register, so we don't have to save/restore V-regs in
ftrace trampoline.

Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Link: https://lore.kernel.org/r/20250407180838.42877-7-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/processor.h |  5 +++++
 arch/riscv/include/asm/vector.h    | 22 +++++++++++++++++++---
 2 files changed, 24 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 5f56eb9d114a..9c1cc716b891 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -79,6 +79,10 @@ struct pt_regs;
  *       Thus, the task does not own preempt_v. Any use of Vector will have to
  *       save preempt_v, if dirty, and fallback to non-preemptible kernel-mode
  *       Vector.
+ *  - bit 29: The thread voluntarily calls schedule() while holding an active
+ *    preempt_v. All preempt_v context should be dropped in such case because
+ *    V-regs are caller-saved. Only sstatus.VS=ON is persisted across a
+ *    schedule() call.
  *  - bit 30: The in-kernel preempt_v context is saved, and requries to be
  *    restored when returning to the context that owns the preempt_v.
  *  - bit 31: The in-kernel preempt_v context is dirty, as signaled by the
@@ -93,6 +97,7 @@ struct pt_regs;
 #define RISCV_PREEMPT_V			0x00000100
 #define RISCV_PREEMPT_V_DIRTY		0x80000000
 #define RISCV_PREEMPT_V_NEED_RESTORE	0x40000000
+#define RISCV_PREEMPT_V_IN_SCHEDULE	0x20000000
 
 /* CPU-specific state of a task */
 struct thread_struct {
diff --git a/arch/riscv/include/asm/vector.h b/arch/riscv/include/asm/vector.h
index e8a83f55be2b..45c9b426fcc5 100644
--- a/arch/riscv/include/asm/vector.h
+++ b/arch/riscv/include/asm/vector.h
@@ -120,6 +120,11 @@ static __always_inline void riscv_v_disable(void)
 		csr_clear(CSR_SSTATUS, SR_VS);
 }
 
+static __always_inline bool riscv_v_is_on(void)
+{
+	return !!(csr_read(CSR_SSTATUS) & SR_VS);
+}
+
 static __always_inline void __vstate_csr_save(struct __riscv_v_ext_state *dest)
 {
 	asm volatile (
@@ -366,6 +371,11 @@ static inline void __switch_to_vector(struct task_struct *prev,
 	struct pt_regs *regs;
 
 	if (riscv_preempt_v_started(prev)) {
+		if (riscv_v_is_on()) {
+			WARN_ON(prev->thread.riscv_v_flags & RISCV_V_CTX_DEPTH_MASK);
+			riscv_v_disable();
+			prev->thread.riscv_v_flags |= RISCV_PREEMPT_V_IN_SCHEDULE;
+		}
 		if (riscv_preempt_v_dirty(prev)) {
 			__riscv_v_vstate_save(&prev->thread.kernel_vstate,
 					      prev->thread.kernel_vstate.datap);
@@ -376,10 +386,16 @@ static inline void __switch_to_vector(struct task_struct *prev,
 		riscv_v_vstate_save(&prev->thread.vstate, regs);
 	}
 
-	if (riscv_preempt_v_started(next))
-		riscv_preempt_v_set_restore(next);
-	else
+	if (riscv_preempt_v_started(next)) {
+		if (next->thread.riscv_v_flags & RISCV_PREEMPT_V_IN_SCHEDULE) {
+			next->thread.riscv_v_flags &= ~RISCV_PREEMPT_V_IN_SCHEDULE;
+			riscv_v_enable();
+		} else {
+			riscv_preempt_v_set_restore(next);
+		}
+	} else {
 		riscv_v_vstate_set_restore(next, task_pt_regs(next));
+	}
 }
 
 void riscv_v_vstate_ctrl_init(struct task_struct *tsk);
-- 
cgit v1.2.3


From ca358692de41b273468e625f96926fa53e13bd8c Mon Sep 17 00:00:00 2001
From: Andy Chiu <andybnac@gmail.com>
Date: Tue, 8 Apr 2025 02:08:32 +0800
Subject: riscv: add a data fence for CMODX in the kernel mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RISC-V spec explicitly calls out that a local fence.i is not enough for
the code modification to be visble from a remote hart. In fact, it
states:

To make a store to instruction memory visible to all RISC-V harts, the
writing hart also has to execute a data FENCE before requesting that all
remote RISC-V harts execute a FENCE.I.

Although current riscv drivers for IPI use ordered MMIO when sending IPIs
in order to synchronize the action between previous csd writes, riscv
does not restrict itself to any particular flavor of IPI. Any driver or
firmware implementation that does not order data writes before the IPI
may pose a risk for code-modifying race.

Thus, add a fence here to order data writes before making the IPI.

Signed-off-by: Andy Chiu <andybnac@gmail.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250407180838.42877-8-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/mm/cacheflush.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index b81672729887..b2e4b81763f8 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -24,7 +24,20 @@ void flush_icache_all(void)
 
 	if (num_online_cpus() < 2)
 		return;
-	else if (riscv_use_sbi_for_rfence())
+
+	/*
+	 * Make sure all previous writes to the D$ are ordered before making
+	 * the IPI. The RISC-V spec states that a hart must execute a data fence
+	 * before triggering a remote fence.i in order to make the modification
+	 * visable for remote harts.
+	 *
+	 * IPIs on RISC-V are triggered by MMIO writes to either CLINT or
+	 * S-IMSIC, so the fence ensures previous data writes "happen before"
+	 * the MMIO.
+	 */
+	RISCV_FENCE(w, o);
+
+	if (riscv_use_sbi_for_rfence())
 		sbi_remote_fence_i(NULL);
 	else
 		on_each_cpu(ipi_remote_fence_i, NULL, 1);
-- 
cgit v1.2.3


From d0262e907e2991ae09ca476281fc8cae3ec57850 Mon Sep 17 00:00:00 2001
From: Andy Chiu <andy.chiu@sifive.com>
Date: Tue, 8 Apr 2025 02:08:33 +0800
Subject: riscv: ftrace: support PREEMPT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Now, we can safely enable dynamic ftrace with kernel preemption.

Signed-off-by: Andy Chiu <andy.chiu@sifive.com>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250407180838.42877-9-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7dbed10843d2..dc0fc11b6e96 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -157,7 +157,7 @@ config RISCV
 	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
 	select HAVE_FUNCTION_GRAPH_TRACER if HAVE_DYNAMIC_FTRACE_WITH_ARGS
 	select HAVE_FUNCTION_GRAPH_FREGS
-	select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
+	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
 	select HAVE_EBPF_JIT if MMU
 	select HAVE_GUP_FAST if MMU
 	select HAVE_FUNCTION_ARG_ACCESS_API
-- 
cgit v1.2.3


From c217157bcd1df434fdeaeb24b7c25ec31e15cf4a Mon Sep 17 00:00:00 2001
From: Puranjay Mohan <puranjay12@gmail.com>
Date: Tue, 8 Apr 2025 02:08:34 +0800
Subject: riscv: Implement HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch enables support for DYNAMIC_FTRACE_WITH_CALL_OPS on RISC-V.
This allows each ftrace callsite to provide an ftrace_ops to the common
ftrace trampoline, allowing each callsite to invoke distinct tracer
functions without the need to fall back to list processing or to
allocate custom trampolines for each callsite. This significantly speeds
up cases where multiple distinct trace functions are used and callsites
are mostly traced by a single tracer.

The idea and most of the implementation is taken from the ARM64's
implementation of the same feature. The idea is to place a pointer to
the ftrace_ops as a literal at a fixed offset from the function entry
point, which can be recovered by the common ftrace trampoline.

We use -fpatchable-function-entry to reserve 8 bytes above the function
entry by emitting 2 4 byte or 4 2 byte  nops depending on the presence of
CONFIG_RISCV_ISA_C. These 8 bytes are patched at runtime with a pointer
to the associated ftrace_ops for that callsite. Functions are aligned to
8 bytes to make sure that the accesses to this literal are atomic.

This approach allows for directly invoking ftrace_ops::func even for
ftrace_ops which are dynamically-allocated (or part of a module),
without going via ftrace_ops_list_func.

We've benchamrked this with the ftrace_ops sample module on Spacemit K1
Jupiter:

Without this patch:

baseline (Linux rivos 6.14.0-09584-g7d06015d936c #3 SMP Sat Mar 29
+-----------------------+-----------------+----------------------------+
|  Number of tracers    | Total time (ns) | Per-call average time      |
|-----------------------+-----------------+----------------------------|
| Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
|----------+------------+-----------------+------------+---------------|
|        0 |          0 |        1357958 |          13 |             - |
|        0 |          1 |        1302375 |          13 |             - |
|        0 |          2 |        1302375 |          13 |             - |
|        0 |         10 |        1379084 |          13 |             - |
|        0 |        100 |        1302458 |          13 |             - |
|        0 |        200 |        1302333 |          13 |             - |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |       13677833 |         136 |           123 |
|        1 |          1 |       18500916 |         185 |           172 |
|        1 |          2 |       22856459 |         228 |           215 |
|        1 |         10 |       58824709 |         588 |           575 |
|        1 |        100 |      505141584 |        5051 |          5038 |
|        1 |        200 |     1580473126 |       15804 |         15791 |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |       13561000 |         135 |           122 |
|        2 |          0 |       19707292 |         197 |           184 |
|       10 |          0 |       67774750 |         677 |           664 |
|      100 |          0 |      714123125 |        7141 |          7128 |
|      200 |          0 |     1918065668 |       19180 |         19167 |
+----------+------------+-----------------+------------+---------------+

Note: per-call overhead is estimated relative to the baseline case with
0 relevant tracers and 0 irrelevant tracers.

With this patch:

v4-rc4 (Linux rivos 6.14.0-09598-gd75747611c93 #4 SMP Sat Mar 29
+-----------------------+-----------------+----------------------------+
|  Number of tracers    | Total time (ns) | Per-call average time      |
|-----------------------+-----------------+----------------------------|
| Relevant | Irrelevant |    100000 calls | Total (ns) | Overhead (ns) |
|----------+------------+-----------------+------------+---------------|
|        0 |          0 |         1459917 |         14 |             - |
|        0 |          1 |         1408000 |         14 |             - |
|        0 |          2 |         1383792 |         13 |             - |
|        0 |         10 |         1430709 |         14 |             - |
|        0 |        100 |         1383791 |         13 |             - |
|        0 |        200 |         1383750 |         13 |             - |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |         5238041 |         52 |            38 |
|        1 |          1 |         5228542 |         52 |            38 |
|        1 |          2 |         5325917 |         53 |            40 |
|        1 |         10 |         5299667 |         52 |            38 |
|        1 |        100 |         5245250 |         52 |            39 |
|        1 |        200 |         5238459 |         52 |            39 |
|----------+------------+-----------------+------------+---------------|
|        1 |          0 |         5239083 |         52 |            38 |
|        2 |          0 |        19449417 |        194 |           181 |
|       10 |          0 |        67718584 |        677 |           663 |
|      100 |          0 |       709840708 |       7098 |          7085 |
|      200 |          0 |      2203580626 |      22035 |         22022 |
+----------+------------+-----------------+------------+---------------+

Note: per-call overhead is estimated relative to the baseline case with
0 relevant tracers and 0 irrelevant tracers.

As can be seen from the above:

 a) Whenever there is a single relevant tracer function associated with a
    tracee, the overhead of invoking the tracer is constant, and does not
    scale with the number of tracers which are *not* associated with that
    tracee.

 b) The overhead for a single relevant tracer has dropped to ~1/3 of the
    overhead prior to this series (from 122ns to 38ns). This is largely
    due to permitting calls to dynamically-allocated ftrace_ops without
    going through ftrace_ops_list_func.

Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>

[update kconfig, asm, refactor]

Signed-off-by: Andy Chiu <andybnac@gmail.com>
Tested-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250407180838.42877-10-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig              |  2 ++
 arch/riscv/Makefile             |  4 +--
 arch/riscv/kernel/asm-offsets.c |  3 ++
 arch/riscv/kernel/ftrace.c      | 67 +++++++++++++++++++++++++++++++++++++++++
 arch/riscv/kernel/mcount-dyn.S  | 35 ++++++++++++++++++---
 5 files changed, 105 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index dc0fc11b6e96..ec986c9120e3 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -99,6 +99,7 @@ config RISCV
 	select EDAC_SUPPORT
 	select FRAME_POINTER if PERF_EVENTS || (FUNCTION_TRACER && !DYNAMIC_FTRACE)
 	select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY if DYNAMIC_FTRACE
+	select FUNCTION_ALIGNMENT_8B if DYNAMIC_FTRACE_WITH_CALL_OPS
 	select GENERIC_ARCH_TOPOLOGY
 	select GENERIC_ATOMIC64 if !64BIT
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
@@ -152,6 +153,7 @@ config RISCV
 	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
 	select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C
 	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_GRAPH_FUNC
 	select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 539d2aef5cab..df57654a615e 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -15,9 +15,9 @@ ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
 	LDFLAGS_vmlinux += --no-relax
 	KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
 ifeq ($(CONFIG_RISCV_ISA_C),y)
-	CC_FLAGS_FTRACE := -fpatchable-function-entry=4
+	CC_FLAGS_FTRACE := -fpatchable-function-entry=8,4
 else
-	CC_FLAGS_FTRACE := -fpatchable-function-entry=2
+	CC_FLAGS_FTRACE := -fpatchable-function-entry=4,2
 endif
 endif
 
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 7c43c8e26ae7..2d96197a8abf 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -493,6 +493,9 @@ void asm_offsets(void)
 	DEFINE(STACKFRAME_SIZE_ON_STACK, ALIGN(sizeof(struct stackframe), STACK_ALIGN));
 	OFFSET(STACKFRAME_FP, stackframe, fp);
 	OFFSET(STACKFRAME_RA, stackframe, ra);
+#ifdef CONFIG_FUNCTION_TRACER
+	DEFINE(FTRACE_OPS_FUNC,		offsetof(struct ftrace_ops, func));
+#endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
 	DEFINE(FREGS_SIZE_ON_STACK, ALIGN(sizeof(struct __arch_ftrace_regs), STACK_ALIGN));
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index b133c60808fe..d56fc6e9fba0 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -16,6 +16,9 @@
 #ifdef CONFIG_DYNAMIC_FTRACE
 unsigned long ftrace_call_adjust(unsigned long addr)
 {
+	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
+		return addr + 8;
+
 	return addr + MCOUNT_AUIPC_SIZE;
 }
 
@@ -64,9 +67,52 @@ static int __ftrace_modify_call(unsigned long source, unsigned long target, bool
 	return 0;
 }
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
+{
+	const struct ftrace_ops *ops = NULL;
+
+	if (rec->flags & FTRACE_FL_CALL_OPS_EN) {
+		ops = ftrace_find_unique_ops(rec);
+		WARN_ON_ONCE(!ops);
+	}
+
+	if (!ops)
+		ops = &ftrace_list_ops;
+
+	return ops;
+}
+
+static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
+			      const struct ftrace_ops *ops)
+{
+	unsigned long literal = rec->ip - 8;
+
+	return patch_text_nosync((void *)literal, &ops, sizeof(ops));
+}
+
+static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec)
+{
+	return ftrace_rec_set_ops(rec, &ftrace_nop_ops);
+}
+
+static int ftrace_rec_update_ops(struct dyn_ftrace *rec)
+{
+	return ftrace_rec_set_ops(rec, riscv64_rec_get_ops(rec));
+}
+#else
+static int ftrace_rec_set_nop_ops(struct dyn_ftrace *rec) { return 0; }
+static int ftrace_rec_update_ops(struct dyn_ftrace *rec) { return 0; }
+#endif
+
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
 	unsigned long distance, orig_addr, pc = rec->ip - MCOUNT_AUIPC_SIZE;
+	int ret;
+
+	ret = ftrace_rec_update_ops(rec);
+	if (ret)
+		return ret;
 
 	orig_addr = (unsigned long)&ftrace_caller;
 	distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr;
@@ -79,6 +125,11 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
 {
 	u32 nop4 = RISCV_INSN_NOP4;
+	int ret;
+
+	ret = ftrace_rec_set_nop_ops(rec);
+	if (ret)
+		return ret;
 
 	if (patch_insn_write((void *)rec->ip, &nop4, MCOUNT_NOP4_SIZE))
 		return -EPERM;
@@ -99,6 +150,10 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
 	unsigned int nops[2], offset;
 	int ret;
 
+	ret = ftrace_rec_set_nop_ops(rec);
+	if (ret)
+		return ret;
+
 	offset = (unsigned long) &ftrace_caller - pc;
 	nops[0] = to_auipc_t0(offset);
 	nops[1] = RISCV_INSN_NOP4;
@@ -113,6 +168,13 @@ int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
 ftrace_func_t ftrace_call_dest = ftrace_stub;
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
+	/*
+	 * When using CALL_OPS, the function to call is associated with the
+	 * call site, and we don't have a global function pointer to update.
+	 */
+	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
+		return 0;
+
 	WRITE_ONCE(ftrace_call_dest, func);
 	/*
 	 * The data fence ensure that the update to ftrace_call_dest happens
@@ -143,8 +205,13 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 {
 	unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE;
 	unsigned int call[2];
+	int ret;
 
 	make_call_t0(caller, old_addr, call);
+	ret = ftrace_rec_update_ops(rec);
+	if (ret)
+		return ret;
+
 	return __ftrace_modify_call(caller, addr, true);
 }
 #endif
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index 8aa554d56096..699684eea7f0 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -139,10 +139,34 @@
 
 	.macro PREPARE_ARGS
 	addi	a0, t0, -MCOUNT_JALR_SIZE	// ip (callsite's jalr insn)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	/*
+	 * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
+	 * function entry, these are later overwritten with the pointer to the
+	 * associated struct ftrace_ops.
+	 *
+	 * -8: &ftrace_ops of the associated tracer function.
+	 *<ftrace enable>:
+	 *  0: auipc  t0/ra, 0x?
+	 *  4: jalr   t0/ra, ?(t0/ra)
+	 *
+	 * -8: &ftrace_nop_ops
+	 *<ftrace disable>:
+	 *  0: nop
+	 *  4: nop
+	 *
+	 * t0 is set to ip+8 after the jalr is executed at the callsite,
+	 * so we find the associated op at t0-16.
+	 */
+	mv 	a1, ra				// parent_ip
+	REG_L   a2, -16(t0)			// op
+	REG_L   ra, FTRACE_OPS_FUNC(a2)		// op->func
+#else
 	la	a1, function_trace_op
-	REG_L	a2, 0(a1)
-	mv	a1, ra
-	mv	a3, sp
+	REG_L	a2, 0(a1)			// op
+	mv	a1, ra				// parent_ip
+#endif
+	mv	a3, sp				// regs
 	.endm
 
 SYM_FUNC_START(ftrace_caller)
@@ -150,10 +174,13 @@ SYM_FUNC_START(ftrace_caller)
 	SAVE_ABI_REGS
 	PREPARE_ARGS
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	jalr	ra
+#else
 SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
 	REG_L	ra, ftrace_call_dest
 	jalr	ra, 0(ra)
-
+#endif
 	RESTORE_ABI_REGS
 	bnez	t1, .Ldirect
 	jr	t0
-- 
cgit v1.2.3


From b21cdb9523e5561b97fd534dbb75d132c5c938ff Mon Sep 17 00:00:00 2001
From: Andy Chiu <andybnac@gmail.com>
Date: Tue, 8 Apr 2025 02:08:35 +0800
Subject: riscv: ftrace: support direct call using call_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

jump to FTRACE_ADDR if distance is out of reach

Co-developed-by: Björn Töpel <bjorn@rivosinc.com>
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Signed-off-by: Andy Chiu <andybnac@gmail.com>
Link: https://lore.kernel.org/r/20250407180838.42877-11-andybnac@gmail.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig              |  2 +-
 arch/riscv/include/asm/ftrace.h |  6 +++++
 arch/riscv/kernel/asm-offsets.c |  3 +++
 arch/riscv/kernel/ftrace.c      | 13 ++++-------
 arch/riscv/kernel/mcount-dyn.S  | 51 ++++++++++++++++++++++++++---------------
 5 files changed, 48 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index ec986c9120e3..8fdca6345fa3 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -152,7 +152,7 @@ config RISCV
 	select HAVE_DMA_CONTIGUOUS if MMU
 	select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && MMU && (CLANG_SUPPORTS_DYNAMIC_FTRACE || GCC_SUPPORTS_DYNAMIC_FTRACE)
 	select FUNCTION_ALIGNMENT_4B if HAVE_DYNAMIC_FTRACE && RISCV_ISA_C
-	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS if HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
 	select HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS if (DYNAMIC_FTRACE_WITH_ARGS && !CFI_CLANG)
 	select HAVE_DYNAMIC_FTRACE_WITH_ARGS if HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE_GRAPH_FUNC
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 6a5c0a7fb826..22ebea3c2b26 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -130,6 +130,9 @@ struct __arch_ftrace_regs {
 	unsigned long sp;
 	unsigned long s0;
 	unsigned long t1;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	unsigned long direct_tramp;
+#endif
 	union {
 		unsigned long args[8];
 		struct {
@@ -223,10 +226,13 @@ void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
 		       struct ftrace_ops *op, struct ftrace_regs *fregs);
 #define ftrace_graph_func ftrace_graph_func
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
 static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr)
 {
 	arch_ftrace_regs(fregs)->t1 = addr;
 }
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
 #endif /* CONFIG_DYNAMIC_FTRACE_WITH_ARGS */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 2d96197a8abf..b26334075697 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -495,6 +495,9 @@ void asm_offsets(void)
 	OFFSET(STACKFRAME_RA, stackframe, ra);
 #ifdef CONFIG_FUNCTION_TRACER
 	DEFINE(FTRACE_OPS_FUNC,		offsetof(struct ftrace_ops, func));
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	DEFINE(FTRACE_OPS_DIRECT_CALL,	offsetof(struct ftrace_ops, direct_call));
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
 #endif
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index d56fc6e9fba0..4c6c24380cfd 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -17,7 +17,7 @@
 unsigned long ftrace_call_adjust(unsigned long addr)
 {
 	if (IS_ENABLED(CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS))
-		return addr + 8;
+		return addr + 8 + MCOUNT_AUIPC_SIZE;
 
 	return addr + MCOUNT_AUIPC_SIZE;
 }
@@ -83,10 +83,9 @@ static const struct ftrace_ops *riscv64_rec_get_ops(struct dyn_ftrace *rec)
 	return ops;
 }
 
-static int ftrace_rec_set_ops(const struct dyn_ftrace *rec,
-			      const struct ftrace_ops *ops)
+static int ftrace_rec_set_ops(const struct dyn_ftrace *rec, const struct ftrace_ops *ops)
 {
-	unsigned long literal = rec->ip - 8;
+	unsigned long literal = ALIGN_DOWN(rec->ip - 12, 8);
 
 	return patch_text_nosync((void *)literal, &ops, sizeof(ops));
 }
@@ -117,7 +116,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	orig_addr = (unsigned long)&ftrace_caller;
 	distance = addr > orig_addr ? addr - orig_addr : orig_addr - addr;
 	if (distance > JALR_RANGE)
-		return -EINVAL;
+		addr = FTRACE_ADDR;
 
 	return __ftrace_modify_call(pc, addr, false);
 }
@@ -204,15 +203,13 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
 		       unsigned long addr)
 {
 	unsigned long caller = rec->ip - MCOUNT_AUIPC_SIZE;
-	unsigned int call[2];
 	int ret;
 
-	make_call_t0(caller, old_addr, call);
 	ret = ftrace_rec_update_ops(rec);
 	if (ret)
 		return ret;
 
-	return __ftrace_modify_call(caller, addr, true);
+	return __ftrace_modify_call(caller, FTRACE_ADDR, true);
 }
 #endif
 
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index 699684eea7f0..48f6c4f7dca0 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -82,12 +82,9 @@
 *                       +++++++++
 **/
 	.macro SAVE_ABI_REGS
-	mv	t4, sp			// Save original SP in T4
 	addi	sp, sp, -FREGS_SIZE_ON_STACK
-
 	REG_S	t0,  FREGS_EPC(sp)
 	REG_S	x1,  FREGS_RA(sp)
-	REG_S	t4,  FREGS_SP(sp)	// Put original SP on stack
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
 	REG_S	x8,  FREGS_S0(sp)
 #endif
@@ -108,9 +105,12 @@
 	REG_S	x15, FREGS_A5(sp)
 	REG_S	x16, FREGS_A6(sp)
 	REG_S	x17, FREGS_A7(sp)
+	mv	a0, sp
+	addi	a0, a0, FREGS_SIZE_ON_STACK
+	REG_S	a0, FREGS_SP(sp)	// Put original SP on stack
 	.endm
 
-	.macro RESTORE_ABI_REGS, all=0
+	.macro RESTORE_ABI_REGS
 	REG_L	t0, FREGS_EPC(sp)
 	REG_L	x1, FREGS_RA(sp)
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
@@ -139,6 +139,19 @@
 
 	.macro PREPARE_ARGS
 	addi	a0, t0, -MCOUNT_JALR_SIZE	// ip (callsite's jalr insn)
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
+	mv 	a1, ra				// parent_ip
+	REG_L   a2, -16(t0)			// op
+	REG_L   ra, FTRACE_OPS_FUNC(a2)		// op->func
+#else
+	la	a1, function_trace_op
+	REG_L	a2, 0(a1)			// op
+	mv	a1, ra				// parent_ip
+#endif
+	mv	a3, sp				// regs
+	.endm
+
+SYM_FUNC_START(ftrace_caller)
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_CALL_OPS
 	/*
 	 * When CALL_OPS is enabled (2 or 4) nops [8B] are placed before the
@@ -158,19 +171,17 @@
 	 * t0 is set to ip+8 after the jalr is executed at the callsite,
 	 * so we find the associated op at t0-16.
 	 */
-	mv 	a1, ra				// parent_ip
-	REG_L   a2, -16(t0)			// op
-	REG_L   ra, FTRACE_OPS_FUNC(a2)		// op->func
-#else
-	la	a1, function_trace_op
-	REG_L	a2, 0(a1)			// op
-	mv	a1, ra				// parent_ip
-#endif
-	mv	a3, sp				// regs
-	.endm
+	REG_L	t1, -16(t0) // op Should be SZ_REG instead of 16
 
-SYM_FUNC_START(ftrace_caller)
-	mv	t1, zero
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	/*
+	 * If the op has a direct call, handle it immediately without
+	 * saving/restoring registers.
+	 */
+	REG_L	t1, FTRACE_OPS_DIRECT_CALL(t1)
+	bnez	t1, ftrace_caller_direct
+#endif
+#endif
 	SAVE_ABI_REGS
 	PREPARE_ARGS
 
@@ -182,10 +193,14 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
 	jalr	ra, 0(ra)
 #endif
 	RESTORE_ABI_REGS
-	bnez	t1, .Ldirect
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+	bnez	t1, ftrace_caller_direct
+#endif
 	jr	t0
-.Ldirect:
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+SYM_INNER_LABEL(ftrace_caller_direct, SYM_L_LOCAL)
 	jr	t1
+#endif
 SYM_FUNC_END(ftrace_caller)
 
 #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
-- 
cgit v1.2.3


From 1df45f8a9fea5a7513bd1bad98604ce1fbefcaaf Mon Sep 17 00:00:00 2001
From: Song Shuai <songshuaishuai@tinylab.org>
Date: Wed, 9 Apr 2025 21:29:58 +0200
Subject: riscv: kexec_file: Split the loading of kernel and others
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the preparative patch for kexec_file_load Image support.

It separates the elf_kexec_load() as two parts:
- the first part loads the vmlinux (or Image)
- the second part loads other segments (e.g. initrd,fdt,purgatory)

And the second part is exported as the load_extra_segments() function
which would be used in both kexec-elf.c and kexec-image.c.

No functional change intended.

Signed-off-by: Song Shuai <songshuaishuai@tinylab.org>
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250409193004.643839-2-bjorn@kernel.org
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/kexec.h         |   5 +
 arch/riscv/kernel/Makefile             |   2 +-
 arch/riscv/kernel/elf_kexec.c          | 485 ---------------------------------
 arch/riscv/kernel/kexec_elf.c          | 144 ++++++++++
 arch/riscv/kernel/machine_kexec_file.c | 360 ++++++++++++++++++++++++
 5 files changed, 510 insertions(+), 486 deletions(-)
 delete mode 100644 arch/riscv/kernel/elf_kexec.c
 create mode 100644 arch/riscv/kernel/kexec_elf.c

(limited to 'arch')

diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
index 2b56769cb530..518825fe4160 100644
--- a/arch/riscv/include/asm/kexec.h
+++ b/arch/riscv/include/asm/kexec.h
@@ -67,6 +67,11 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 struct kimage;
 int arch_kimage_file_post_load_cleanup(struct kimage *image);
 #define arch_kimage_file_post_load_cleanup arch_kimage_file_post_load_cleanup
+
+int load_extra_segments(struct kimage *image, unsigned long kernel_start,
+			unsigned long kernel_len, char *initrd,
+			unsigned long initrd_len, char *cmdline,
+			unsigned long cmdline_len);
 #endif
 
 #endif
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 8d186bfced45..d56305c8e631 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -107,7 +107,7 @@ obj-$(CONFIG_HOTPLUG_CPU)	+= cpu-hotplug.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_KEXEC_CORE)	+= kexec_relocate.o crash_save_regs.o machine_kexec.o
-obj-$(CONFIG_KEXEC_FILE)	+= elf_kexec.o machine_kexec_file.o
+obj-$(CONFIG_KEXEC_FILE)	+= kexec_elf.o machine_kexec_file.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_VMCORE_INFO)	+= vmcore_info.o
 
diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
deleted file mode 100644
index e783a72d051f..000000000000
--- a/arch/riscv/kernel/elf_kexec.c
+++ /dev/null
@@ -1,485 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Load ELF vmlinux file for the kexec_file_load syscall.
- *
- * Copyright (C) 2021 Huawei Technologies Co, Ltd.
- *
- * Author: Liao Chang (liaochang1@huawei.com)
- *
- * Based on kexec-tools' kexec-elf-riscv.c, heavily modified
- * for kernel.
- */
-
-#define pr_fmt(fmt)	"kexec_image: " fmt
-
-#include <linux/elf.h>
-#include <linux/kexec.h>
-#include <linux/slab.h>
-#include <linux/of.h>
-#include <linux/libfdt.h>
-#include <linux/types.h>
-#include <linux/memblock.h>
-#include <linux/vmalloc.h>
-#include <asm/setup.h>
-
-int arch_kimage_file_post_load_cleanup(struct kimage *image)
-{
-	kvfree(image->arch.fdt);
-	image->arch.fdt = NULL;
-
-	vfree(image->elf_headers);
-	image->elf_headers = NULL;
-	image->elf_headers_sz = 0;
-
-	return kexec_image_post_load_cleanup_default(image);
-}
-
-static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
-				struct kexec_elf_info *elf_info, unsigned long old_pbase,
-				unsigned long new_pbase)
-{
-	int i;
-	int ret = 0;
-	size_t size;
-	struct kexec_buf kbuf;
-	const struct elf_phdr *phdr;
-
-	kbuf.image = image;
-
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		phdr = &elf_info->proghdrs[i];
-		if (phdr->p_type != PT_LOAD)
-			continue;
-
-		size = phdr->p_filesz;
-		if (size > phdr->p_memsz)
-			size = phdr->p_memsz;
-
-		kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset;
-		kbuf.bufsz = size;
-		kbuf.buf_align = phdr->p_align;
-		kbuf.mem = phdr->p_paddr - old_pbase + new_pbase;
-		kbuf.memsz = phdr->p_memsz;
-		kbuf.top_down = false;
-		ret = kexec_add_buffer(&kbuf);
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
-/*
- * Go through the available phsyical memory regions and find one that hold
- * an image of the specified size.
- */
-static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
-			  struct elfhdr *ehdr, struct kexec_elf_info *elf_info,
-			  unsigned long *old_pbase, unsigned long *new_pbase)
-{
-	int i;
-	int ret;
-	struct kexec_buf kbuf;
-	const struct elf_phdr *phdr;
-	unsigned long lowest_paddr = ULONG_MAX;
-	unsigned long lowest_vaddr = ULONG_MAX;
-
-	for (i = 0; i < ehdr->e_phnum; i++) {
-		phdr = &elf_info->proghdrs[i];
-		if (phdr->p_type != PT_LOAD)
-			continue;
-
-		if (lowest_paddr > phdr->p_paddr)
-			lowest_paddr = phdr->p_paddr;
-
-		if (lowest_vaddr > phdr->p_vaddr)
-			lowest_vaddr = phdr->p_vaddr;
-	}
-
-	kbuf.image = image;
-	kbuf.buf_min = lowest_paddr;
-	kbuf.buf_max = ULONG_MAX;
-
-	/*
-	 * Current riscv boot protocol requires 2MB alignment for
-	 * RV64 and 4MB alignment for RV32
-	 *
-	 */
-	kbuf.buf_align = PMD_SIZE;
-	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
-	kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE);
-	kbuf.top_down = false;
-	ret = arch_kexec_locate_mem_hole(&kbuf);
-	if (!ret) {
-		*old_pbase = lowest_paddr;
-		*new_pbase = kbuf.mem;
-		image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem;
-	}
-	return ret;
-}
-
-#ifdef CONFIG_CRASH_DUMP
-static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
-{
-	unsigned int *nr_ranges = arg;
-
-	(*nr_ranges)++;
-	return 0;
-}
-
-static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
-{
-	struct crash_mem *cmem = arg;
-
-	cmem->ranges[cmem->nr_ranges].start = res->start;
-	cmem->ranges[cmem->nr_ranges].end = res->end;
-	cmem->nr_ranges++;
-
-	return 0;
-}
-
-static int prepare_elf_headers(void **addr, unsigned long *sz)
-{
-	struct crash_mem *cmem;
-	unsigned int nr_ranges;
-	int ret;
-
-	nr_ranges = 1; /* For exclusion of crashkernel region */
-	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
-
-	cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL);
-	if (!cmem)
-		return -ENOMEM;
-
-	cmem->max_nr_ranges = nr_ranges;
-	cmem->nr_ranges = 0;
-	ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
-	if (ret)
-		goto out;
-
-	/* Exclude crashkernel region */
-	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
-	if (!ret)
-		ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
-
-out:
-	kfree(cmem);
-	return ret;
-}
-
-static char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
-				 unsigned long cmdline_len)
-{
-	int elfcorehdr_strlen;
-	char *cmdline_ptr;
-
-	cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL);
-	if (!cmdline_ptr)
-		return NULL;
-
-	elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ",
-		image->elf_load_addr);
-
-	if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) {
-		pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n");
-		kfree(cmdline_ptr);
-		return NULL;
-	}
-
-	memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len);
-	/* Ensure it's nul terminated */
-	cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0';
-	return cmdline_ptr;
-}
-#endif
-
-static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
-			    unsigned long kernel_len, char *initrd,
-			    unsigned long initrd_len, char *cmdline,
-			    unsigned long cmdline_len)
-{
-	int ret;
-	void *fdt;
-	unsigned long old_kernel_pbase = ULONG_MAX;
-	unsigned long new_kernel_pbase = 0UL;
-	unsigned long initrd_pbase = 0UL;
-	unsigned long kernel_start;
-	struct elfhdr ehdr;
-	struct kexec_buf kbuf;
-	struct kexec_elf_info elf_info;
-	char *modified_cmdline = NULL;
-
-	ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info);
-	if (ret)
-		return ERR_PTR(ret);
-
-	ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info,
-			     &old_kernel_pbase, &new_kernel_pbase);
-	if (ret)
-		goto out;
-	kernel_start = image->start;
-
-	/* Add the kernel binary to the image */
-	ret = riscv_kexec_elf_load(image, &ehdr, &elf_info,
-				   old_kernel_pbase, new_kernel_pbase);
-	if (ret)
-		goto out;
-
-	kbuf.image = image;
-	kbuf.buf_min = new_kernel_pbase + kernel_len;
-	kbuf.buf_max = ULONG_MAX;
-
-#ifdef CONFIG_CRASH_DUMP
-	/* Add elfcorehdr */
-	if (image->type == KEXEC_TYPE_CRASH) {
-		void *headers;
-		unsigned long headers_sz;
-		ret = prepare_elf_headers(&headers, &headers_sz);
-		if (ret) {
-			pr_err("Preparing elf core header failed\n");
-			goto out;
-		}
-
-		kbuf.buffer = headers;
-		kbuf.bufsz = headers_sz;
-		kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
-		kbuf.memsz = headers_sz;
-		kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
-		kbuf.top_down = true;
-
-		ret = kexec_add_buffer(&kbuf);
-		if (ret) {
-			vfree(headers);
-			goto out;
-		}
-		image->elf_headers = headers;
-		image->elf_load_addr = kbuf.mem;
-		image->elf_headers_sz = headers_sz;
-
-		kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
-			      image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
-
-		/* Setup cmdline for kdump kernel case */
-		modified_cmdline = setup_kdump_cmdline(image, cmdline,
-						       cmdline_len);
-		if (!modified_cmdline) {
-			pr_err("Setting up cmdline for kdump kernel failed\n");
-			ret = -EINVAL;
-			goto out;
-		}
-		cmdline = modified_cmdline;
-	}
-#endif
-
-#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
-	/* Add purgatory to the image */
-	kbuf.top_down = true;
-	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
-	ret = kexec_load_purgatory(image, &kbuf);
-	if (ret) {
-		pr_err("Error loading purgatory ret=%d\n", ret);
-		goto out;
-	}
-	kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem);
-
-	ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry",
-					     &kernel_start,
-					     sizeof(kernel_start), 0);
-	if (ret)
-		pr_err("Error update purgatory ret=%d\n", ret);
-#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
-
-	/* Add the initrd to the image */
-	if (initrd != NULL) {
-		kbuf.buffer = initrd;
-		kbuf.bufsz = kbuf.memsz = initrd_len;
-		kbuf.buf_align = PAGE_SIZE;
-		kbuf.top_down = true;
-		kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
-		ret = kexec_add_buffer(&kbuf);
-		if (ret)
-			goto out;
-		initrd_pbase = kbuf.mem;
-		kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase);
-	}
-
-	/* Add the DTB to the image */
-	fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase,
-					   initrd_len, cmdline, 0);
-	if (!fdt) {
-		pr_err("Error setting up the new device tree.\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	fdt_pack(fdt);
-	kbuf.buffer = fdt;
-	kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt);
-	kbuf.buf_align = PAGE_SIZE;
-	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
-	kbuf.top_down = true;
-	ret = kexec_add_buffer(&kbuf);
-	if (ret) {
-		pr_err("Error add DTB kbuf ret=%d\n", ret);
-		goto out_free_fdt;
-	}
-	/* Cache the fdt buffer address for memory cleanup */
-	image->arch.fdt = fdt;
-	kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem);
-	goto out;
-
-out_free_fdt:
-	kvfree(fdt);
-out:
-	kfree(modified_cmdline);
-	kexec_free_elf_info(&elf_info);
-	return ret ? ERR_PTR(ret) : NULL;
-}
-
-#define RV_X(x, s, n)  (((x) >> (s)) & ((1 << (n)) - 1))
-#define RISCV_IMM_BITS 12
-#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS)
-#define RISCV_CONST_HIGH_PART(x) \
-	(((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1))
-#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x))
-
-#define ENCODE_ITYPE_IMM(x) \
-	(RV_X(x, 0, 12) << 20)
-#define ENCODE_BTYPE_IMM(x) \
-	((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \
-	(RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31))
-#define ENCODE_UTYPE_IMM(x) \
-	(RV_X(x, 12, 20) << 12)
-#define ENCODE_JTYPE_IMM(x) \
-	((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \
-	(RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31))
-#define ENCODE_CBTYPE_IMM(x) \
-	((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \
-	(RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12))
-#define ENCODE_CJTYPE_IMM(x) \
-	((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \
-	(RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \
-	(RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12))
-#define ENCODE_UJTYPE_IMM(x) \
-	(ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \
-	(ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32))
-#define ENCODE_UITYPE_IMM(x) \
-	(ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32))
-
-#define CLEAN_IMM(type, x) \
-	((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x))
-
-int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
-				     Elf_Shdr *section,
-				     const Elf_Shdr *relsec,
-				     const Elf_Shdr *symtab)
-{
-	const char *strtab, *name, *shstrtab;
-	const Elf_Shdr *sechdrs;
-	Elf64_Rela *relas;
-	int i, r_type;
-
-	/* String & section header string table */
-	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
-	strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
-	shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
-
-	relas = (void *)pi->ehdr + relsec->sh_offset;
-
-	for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) {
-		const Elf_Sym *sym;	/* symbol to relocate */
-		unsigned long addr;	/* final location after relocation */
-		unsigned long val;	/* relocated symbol value */
-		unsigned long sec_base;	/* relocated symbol value */
-		void *loc;		/* tmp location to modify */
-
-		sym = (void *)pi->ehdr + symtab->sh_offset;
-		sym += ELF64_R_SYM(relas[i].r_info);
-
-		if (sym->st_name)
-			name = strtab + sym->st_name;
-		else
-			name = shstrtab + sechdrs[sym->st_shndx].sh_name;
-
-		loc = pi->purgatory_buf;
-		loc += section->sh_offset;
-		loc += relas[i].r_offset;
-
-		if (sym->st_shndx == SHN_ABS)
-			sec_base = 0;
-		else if (sym->st_shndx >= pi->ehdr->e_shnum) {
-			pr_err("Invalid section %d for symbol %s\n",
-			       sym->st_shndx, name);
-			return -ENOEXEC;
-		} else
-			sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
-
-		val = sym->st_value;
-		val += sec_base;
-		val += relas[i].r_addend;
-
-		addr = section->sh_addr + relas[i].r_offset;
-
-		r_type = ELF64_R_TYPE(relas[i].r_info);
-
-		switch (r_type) {
-		case R_RISCV_BRANCH:
-			*(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) |
-				 ENCODE_BTYPE_IMM(val - addr);
-			break;
-		case R_RISCV_JAL:
-			*(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) |
-				 ENCODE_JTYPE_IMM(val - addr);
-			break;
-		/*
-		 * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I
-		 * sym is expected to be next to R_RISCV_PCREL_HI20
-		 * in purgatory relsec. Handle it like R_RISCV_CALL
-		 * sym, instead of searching the whole relsec.
-		 */
-		case R_RISCV_PCREL_HI20:
-		case R_RISCV_CALL_PLT:
-		case R_RISCV_CALL:
-			*(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) |
-				 ENCODE_UJTYPE_IMM(val - addr);
-			break;
-		case R_RISCV_RVC_BRANCH:
-			*(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) |
-				 ENCODE_CBTYPE_IMM(val - addr);
-			break;
-		case R_RISCV_RVC_JUMP:
-			*(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) |
-				 ENCODE_CJTYPE_IMM(val - addr);
-			break;
-		case R_RISCV_ADD16:
-			*(u16 *)loc += val;
-			break;
-		case R_RISCV_SUB16:
-			*(u16 *)loc -= val;
-			break;
-		case R_RISCV_ADD32:
-			*(u32 *)loc += val;
-			break;
-		case R_RISCV_SUB32:
-			*(u32 *)loc -= val;
-			break;
-		/* It has been applied by R_RISCV_PCREL_HI20 sym */
-		case R_RISCV_PCREL_LO12_I:
-		case R_RISCV_ALIGN:
-		case R_RISCV_RELAX:
-			break;
-		case R_RISCV_64:
-			*(u64 *)loc = val;
-			break;
-		default:
-			pr_err("Unknown rela relocation: %d\n", r_type);
-			return -ENOEXEC;
-		}
-	}
-	return 0;
-}
-
-const struct kexec_file_ops elf_kexec_ops = {
-	.probe = kexec_elf_probe,
-	.load  = elf_kexec_load,
-};
diff --git a/arch/riscv/kernel/kexec_elf.c b/arch/riscv/kernel/kexec_elf.c
new file mode 100644
index 000000000000..f4755d49b89e
--- /dev/null
+++ b/arch/riscv/kernel/kexec_elf.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Load ELF vmlinux file for the kexec_file_load syscall.
+ *
+ * Copyright (C) 2021 Huawei Technologies Co, Ltd.
+ *
+ * Author: Liao Chang (liaochang1@huawei.com)
+ *
+ * Based on kexec-tools' kexec-elf-riscv.c, heavily modified
+ * for kernel.
+ */
+
+#define pr_fmt(fmt)	"kexec_image: " fmt
+
+#include <linux/elf.h>
+#include <linux/kexec.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/libfdt.h>
+#include <linux/types.h>
+#include <linux/memblock.h>
+#include <asm/setup.h>
+
+static int riscv_kexec_elf_load(struct kimage *image, struct elfhdr *ehdr,
+				struct kexec_elf_info *elf_info, unsigned long old_pbase,
+				unsigned long new_pbase)
+{
+	int i;
+	int ret = 0;
+	size_t size;
+	struct kexec_buf kbuf;
+	const struct elf_phdr *phdr;
+
+	kbuf.image = image;
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr = &elf_info->proghdrs[i];
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		size = phdr->p_filesz;
+		if (size > phdr->p_memsz)
+			size = phdr->p_memsz;
+
+		kbuf.buffer = (void *) elf_info->buffer + phdr->p_offset;
+		kbuf.bufsz = size;
+		kbuf.buf_align = phdr->p_align;
+		kbuf.mem = phdr->p_paddr - old_pbase + new_pbase;
+		kbuf.memsz = phdr->p_memsz;
+		kbuf.top_down = false;
+		ret = kexec_add_buffer(&kbuf);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Go through the available phsyical memory regions and find one that hold
+ * an image of the specified size.
+ */
+static int elf_find_pbase(struct kimage *image, unsigned long kernel_len,
+			  struct elfhdr *ehdr, struct kexec_elf_info *elf_info,
+			  unsigned long *old_pbase, unsigned long *new_pbase)
+{
+	int i;
+	int ret;
+	struct kexec_buf kbuf;
+	const struct elf_phdr *phdr;
+	unsigned long lowest_paddr = ULONG_MAX;
+	unsigned long lowest_vaddr = ULONG_MAX;
+
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		phdr = &elf_info->proghdrs[i];
+		if (phdr->p_type != PT_LOAD)
+			continue;
+
+		if (lowest_paddr > phdr->p_paddr)
+			lowest_paddr = phdr->p_paddr;
+
+		if (lowest_vaddr > phdr->p_vaddr)
+			lowest_vaddr = phdr->p_vaddr;
+	}
+
+	kbuf.image = image;
+	kbuf.buf_min = lowest_paddr;
+	kbuf.buf_max = ULONG_MAX;
+
+	/*
+	 * Current riscv boot protocol requires 2MB alignment for
+	 * RV64 and 4MB alignment for RV32
+	 *
+	 */
+	kbuf.buf_align = PMD_SIZE;
+	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+	kbuf.memsz = ALIGN(kernel_len, PAGE_SIZE);
+	kbuf.top_down = false;
+	ret = arch_kexec_locate_mem_hole(&kbuf);
+	if (!ret) {
+		*old_pbase = lowest_paddr;
+		*new_pbase = kbuf.mem;
+		image->start = ehdr->e_entry - lowest_vaddr + kbuf.mem;
+	}
+	return ret;
+}
+
+static void *elf_kexec_load(struct kimage *image, char *kernel_buf,
+			    unsigned long kernel_len, char *initrd,
+			    unsigned long initrd_len, char *cmdline,
+			    unsigned long cmdline_len)
+{
+	int ret;
+	unsigned long old_kernel_pbase = ULONG_MAX;
+	unsigned long new_kernel_pbase = 0UL;
+	struct elfhdr ehdr;
+	struct kexec_elf_info elf_info;
+
+	ret = kexec_build_elf_info(kernel_buf, kernel_len, &ehdr, &elf_info);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = elf_find_pbase(image, kernel_len, &ehdr, &elf_info,
+			     &old_kernel_pbase, &new_kernel_pbase);
+	if (ret)
+		goto out;
+
+	/* Add the kernel binary to the image */
+	ret = riscv_kexec_elf_load(image, &ehdr, &elf_info,
+				   old_kernel_pbase, new_kernel_pbase);
+	if (ret)
+		goto out;
+
+	ret = load_extra_segments(image, image->start, kernel_len,
+				  initrd, initrd_len, cmdline, cmdline_len);
+out:
+	kexec_free_elf_info(&elf_info);
+	return ret ? ERR_PTR(ret) : NULL;
+}
+
+const struct kexec_file_ops elf_kexec_ops = {
+	.probe = kexec_elf_probe,
+	.load  = elf_kexec_load,
+};
diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c
index b0bf8c1722c0..99bd5a5f4234 100644
--- a/arch/riscv/kernel/machine_kexec_file.c
+++ b/arch/riscv/kernel/machine_kexec_file.c
@@ -7,8 +7,368 @@
  * Author: Liao Chang (liaochang1@huawei.com)
  */
 #include <linux/kexec.h>
+#include <linux/elf.h>
+#include <linux/slab.h>
+#include <linux/of.h>
+#include <linux/libfdt.h>
+#include <linux/types.h>
+#include <linux/memblock.h>
+#include <linux/vmalloc.h>
+#include <asm/setup.h>
 
 const struct kexec_file_ops * const kexec_file_loaders[] = {
 	&elf_kexec_ops,
 	NULL
 };
+
+int arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+	kvfree(image->arch.fdt);
+	image->arch.fdt = NULL;
+
+	vfree(image->elf_headers);
+	image->elf_headers = NULL;
+	image->elf_headers_sz = 0;
+
+	return kexec_image_post_load_cleanup_default(image);
+}
+
+#ifdef CONFIG_CRASH_DUMP
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
+{
+	unsigned int *nr_ranges = arg;
+
+	(*nr_ranges)++;
+	return 0;
+}
+
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
+{
+	struct crash_mem *cmem = arg;
+
+	cmem->ranges[cmem->nr_ranges].start = res->start;
+	cmem->ranges[cmem->nr_ranges].end = res->end;
+	cmem->nr_ranges++;
+
+	return 0;
+}
+
+static int prepare_elf_headers(void **addr, unsigned long *sz)
+{
+	struct crash_mem *cmem;
+	unsigned int nr_ranges;
+	int ret;
+
+	nr_ranges = 1; /* For exclusion of crashkernel region */
+	walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback);
+
+	cmem = kmalloc(struct_size(cmem, ranges, nr_ranges), GFP_KERNEL);
+	if (!cmem)
+		return -ENOMEM;
+
+	cmem->max_nr_ranges = nr_ranges;
+	cmem->nr_ranges = 0;
+	ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback);
+	if (ret)
+		goto out;
+
+	/* Exclude crashkernel region */
+	ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
+	if (!ret)
+		ret = crash_prepare_elf64_headers(cmem, true, addr, sz);
+
+out:
+	kfree(cmem);
+	return ret;
+}
+
+static char *setup_kdump_cmdline(struct kimage *image, char *cmdline,
+				 unsigned long cmdline_len)
+{
+	int elfcorehdr_strlen;
+	char *cmdline_ptr;
+
+	cmdline_ptr = kzalloc(COMMAND_LINE_SIZE, GFP_KERNEL);
+	if (!cmdline_ptr)
+		return NULL;
+
+	elfcorehdr_strlen = sprintf(cmdline_ptr, "elfcorehdr=0x%lx ",
+		image->elf_load_addr);
+
+	if (elfcorehdr_strlen + cmdline_len > COMMAND_LINE_SIZE) {
+		pr_err("Appending elfcorehdr=<addr> exceeds cmdline size\n");
+		kfree(cmdline_ptr);
+		return NULL;
+	}
+
+	memcpy(cmdline_ptr + elfcorehdr_strlen, cmdline, cmdline_len);
+	/* Ensure it's nul terminated */
+	cmdline_ptr[COMMAND_LINE_SIZE - 1] = '\0';
+	return cmdline_ptr;
+}
+#endif
+
+#define RV_X(x, s, n)  (((x) >> (s)) & ((1 << (n)) - 1))
+#define RISCV_IMM_BITS 12
+#define RISCV_IMM_REACH (1LL << RISCV_IMM_BITS)
+#define RISCV_CONST_HIGH_PART(x) \
+	(((x) + (RISCV_IMM_REACH >> 1)) & ~(RISCV_IMM_REACH - 1))
+#define RISCV_CONST_LOW_PART(x) ((x) - RISCV_CONST_HIGH_PART(x))
+
+#define ENCODE_ITYPE_IMM(x) \
+	(RV_X(x, 0, 12) << 20)
+#define ENCODE_BTYPE_IMM(x) \
+	((RV_X(x, 1, 4) << 8) | (RV_X(x, 5, 6) << 25) | \
+	(RV_X(x, 11, 1) << 7) | (RV_X(x, 12, 1) << 31))
+#define ENCODE_UTYPE_IMM(x) \
+	(RV_X(x, 12, 20) << 12)
+#define ENCODE_JTYPE_IMM(x) \
+	((RV_X(x, 1, 10) << 21) | (RV_X(x, 11, 1) << 20) | \
+	(RV_X(x, 12, 8) << 12) | (RV_X(x, 20, 1) << 31))
+#define ENCODE_CBTYPE_IMM(x) \
+	((RV_X(x, 1, 2) << 3) | (RV_X(x, 3, 2) << 10) | (RV_X(x, 5, 1) << 2) | \
+	(RV_X(x, 6, 2) << 5) | (RV_X(x, 8, 1) << 12))
+#define ENCODE_CJTYPE_IMM(x) \
+	((RV_X(x, 1, 3) << 3) | (RV_X(x, 4, 1) << 11) | (RV_X(x, 5, 1) << 2) | \
+	(RV_X(x, 6, 1) << 7) | (RV_X(x, 7, 1) << 6) | (RV_X(x, 8, 2) << 9) | \
+	(RV_X(x, 10, 1) << 8) | (RV_X(x, 11, 1) << 12))
+#define ENCODE_UJTYPE_IMM(x) \
+	(ENCODE_UTYPE_IMM(RISCV_CONST_HIGH_PART(x)) | \
+	(ENCODE_ITYPE_IMM(RISCV_CONST_LOW_PART(x)) << 32))
+#define ENCODE_UITYPE_IMM(x) \
+	(ENCODE_UTYPE_IMM(x) | (ENCODE_ITYPE_IMM(x) << 32))
+
+#define CLEAN_IMM(type, x) \
+	((~ENCODE_##type##_IMM((uint64_t)(-1))) & (x))
+
+int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
+				     Elf_Shdr *section,
+				     const Elf_Shdr *relsec,
+				     const Elf_Shdr *symtab)
+{
+	const char *strtab, *name, *shstrtab;
+	const Elf_Shdr *sechdrs;
+	Elf64_Rela *relas;
+	int i, r_type;
+
+	/* String & section header string table */
+	sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff;
+	strtab = (char *)pi->ehdr + sechdrs[symtab->sh_link].sh_offset;
+	shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset;
+
+	relas = (void *)pi->ehdr + relsec->sh_offset;
+
+	for (i = 0; i < relsec->sh_size / sizeof(*relas); i++) {
+		const Elf_Sym *sym;	/* symbol to relocate */
+		unsigned long addr;	/* final location after relocation */
+		unsigned long val;	/* relocated symbol value */
+		unsigned long sec_base;	/* relocated symbol value */
+		void *loc;		/* tmp location to modify */
+
+		sym = (void *)pi->ehdr + symtab->sh_offset;
+		sym += ELF64_R_SYM(relas[i].r_info);
+
+		if (sym->st_name)
+			name = strtab + sym->st_name;
+		else
+			name = shstrtab + sechdrs[sym->st_shndx].sh_name;
+
+		loc = pi->purgatory_buf;
+		loc += section->sh_offset;
+		loc += relas[i].r_offset;
+
+		if (sym->st_shndx == SHN_ABS)
+			sec_base = 0;
+		else if (sym->st_shndx >= pi->ehdr->e_shnum) {
+			pr_err("Invalid section %d for symbol %s\n",
+			       sym->st_shndx, name);
+			return -ENOEXEC;
+		} else
+			sec_base = pi->sechdrs[sym->st_shndx].sh_addr;
+
+		val = sym->st_value;
+		val += sec_base;
+		val += relas[i].r_addend;
+
+		addr = section->sh_addr + relas[i].r_offset;
+
+		r_type = ELF64_R_TYPE(relas[i].r_info);
+
+		switch (r_type) {
+		case R_RISCV_BRANCH:
+			*(u32 *)loc = CLEAN_IMM(BTYPE, *(u32 *)loc) |
+				 ENCODE_BTYPE_IMM(val - addr);
+			break;
+		case R_RISCV_JAL:
+			*(u32 *)loc = CLEAN_IMM(JTYPE, *(u32 *)loc) |
+				 ENCODE_JTYPE_IMM(val - addr);
+			break;
+		/*
+		 * With no R_RISCV_PCREL_LO12_S, R_RISCV_PCREL_LO12_I
+		 * sym is expected to be next to R_RISCV_PCREL_HI20
+		 * in purgatory relsec. Handle it like R_RISCV_CALL
+		 * sym, instead of searching the whole relsec.
+		 */
+		case R_RISCV_PCREL_HI20:
+		case R_RISCV_CALL_PLT:
+		case R_RISCV_CALL:
+			*(u64 *)loc = CLEAN_IMM(UITYPE, *(u64 *)loc) |
+				 ENCODE_UJTYPE_IMM(val - addr);
+			break;
+		case R_RISCV_RVC_BRANCH:
+			*(u32 *)loc = CLEAN_IMM(CBTYPE, *(u32 *)loc) |
+				 ENCODE_CBTYPE_IMM(val - addr);
+			break;
+		case R_RISCV_RVC_JUMP:
+			*(u32 *)loc = CLEAN_IMM(CJTYPE, *(u32 *)loc) |
+				 ENCODE_CJTYPE_IMM(val - addr);
+			break;
+		case R_RISCV_ADD16:
+			*(u16 *)loc += val;
+			break;
+		case R_RISCV_SUB16:
+			*(u16 *)loc -= val;
+			break;
+		case R_RISCV_ADD32:
+			*(u32 *)loc += val;
+			break;
+		case R_RISCV_SUB32:
+			*(u32 *)loc -= val;
+			break;
+		/* It has been applied by R_RISCV_PCREL_HI20 sym */
+		case R_RISCV_PCREL_LO12_I:
+		case R_RISCV_ALIGN:
+		case R_RISCV_RELAX:
+			break;
+		case R_RISCV_64:
+			*(u64 *)loc = val;
+			break;
+		default:
+			pr_err("Unknown rela relocation: %d\n", r_type);
+			return -ENOEXEC;
+		}
+	}
+	return 0;
+}
+
+
+int load_extra_segments(struct kimage *image, unsigned long kernel_start,
+			    unsigned long kernel_len, char *initrd,
+			    unsigned long initrd_len, char *cmdline,
+			    unsigned long cmdline_len)
+{
+	int ret;
+	void *fdt;
+	unsigned long initrd_pbase = 0UL;
+	struct kexec_buf kbuf;
+	char *modified_cmdline = NULL;
+
+	kbuf.image = image;
+	kbuf.buf_min = kernel_start + kernel_len;
+	kbuf.buf_max = ULONG_MAX;
+
+#ifdef CONFIG_CRASH_DUMP
+	/* Add elfcorehdr */
+	if (image->type == KEXEC_TYPE_CRASH) {
+		void *headers;
+		unsigned long headers_sz;
+		ret = prepare_elf_headers(&headers, &headers_sz);
+		if (ret) {
+			pr_err("Preparing elf core header failed\n");
+			goto out;
+		}
+
+		kbuf.buffer = headers;
+		kbuf.bufsz = headers_sz;
+		kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+		kbuf.memsz = headers_sz;
+		kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+		kbuf.top_down = true;
+
+		ret = kexec_add_buffer(&kbuf);
+		if (ret) {
+			vfree(headers);
+			goto out;
+		}
+		image->elf_headers = headers;
+		image->elf_load_addr = kbuf.mem;
+		image->elf_headers_sz = headers_sz;
+
+		kexec_dprintk("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+			      image->elf_load_addr, kbuf.bufsz, kbuf.memsz);
+
+		/* Setup cmdline for kdump kernel case */
+		modified_cmdline = setup_kdump_cmdline(image, cmdline,
+						       cmdline_len);
+		if (!modified_cmdline) {
+			pr_err("Setting up cmdline for kdump kernel failed\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		cmdline = modified_cmdline;
+	}
+#endif
+
+#ifdef CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY
+	/* Add purgatory to the image */
+	kbuf.top_down = true;
+	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+	ret = kexec_load_purgatory(image, &kbuf);
+	if (ret) {
+		pr_err("Error loading purgatory ret=%d\n", ret);
+		goto out;
+	}
+	kexec_dprintk("Loaded purgatory at 0x%lx\n", kbuf.mem);
+
+	ret = kexec_purgatory_get_set_symbol(image, "riscv_kernel_entry",
+					     &kernel_start,
+					     sizeof(kernel_start), 0);
+	if (ret)
+		pr_err("Error update purgatory ret=%d\n", ret);
+#endif /* CONFIG_ARCH_SUPPORTS_KEXEC_PURGATORY */
+
+	/* Add the initrd to the image */
+	if (initrd != NULL) {
+		kbuf.buffer = initrd;
+		kbuf.bufsz = kbuf.memsz = initrd_len;
+		kbuf.buf_align = PAGE_SIZE;
+		kbuf.top_down = true;
+		kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+		ret = kexec_add_buffer(&kbuf);
+		if (ret)
+			goto out;
+		initrd_pbase = kbuf.mem;
+		kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_pbase);
+	}
+
+	/* Add the DTB to the image */
+	fdt = of_kexec_alloc_and_setup_fdt(image, initrd_pbase,
+					   initrd_len, cmdline, 0);
+	if (!fdt) {
+		pr_err("Error setting up the new device tree.\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	fdt_pack(fdt);
+	kbuf.buffer = fdt;
+	kbuf.bufsz = kbuf.memsz = fdt_totalsize(fdt);
+	kbuf.buf_align = PAGE_SIZE;
+	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+	kbuf.top_down = true;
+	ret = kexec_add_buffer(&kbuf);
+	if (ret) {
+		pr_err("Error add DTB kbuf ret=%d\n", ret);
+		goto out_free_fdt;
+	}
+	/* Cache the fdt buffer address for memory cleanup */
+	image->arch.fdt = fdt;
+	kexec_dprintk("Loaded device tree at 0x%lx\n", kbuf.mem);
+	goto out;
+
+out_free_fdt:
+	kvfree(fdt);
+out:
+	kfree(modified_cmdline);
+	return ret;
+}
-- 
cgit v1.2.3


From 809a11eea8e8c80491e3ba3a286af25409c072d5 Mon Sep 17 00:00:00 2001
From: Song Shuai <songshuaishuai@tinylab.org>
Date: Wed, 9 Apr 2025 21:29:59 +0200
Subject: riscv: kexec_file: Support loading Image binary file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch creates image_kexec_ops to load Image binary file
for kexec_file_load() syscall.

Signed-off-by: Song Shuai <songshuaishuai@tinylab.org>
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250409193004.643839-3-bjorn@kernel.org
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/image.h         |  2 +
 arch/riscv/include/asm/kexec.h         |  1 +
 arch/riscv/kernel/Makefile             |  2 +-
 arch/riscv/kernel/kexec_image.c        | 96 ++++++++++++++++++++++++++++++++++
 arch/riscv/kernel/machine_kexec_file.c |  1 +
 5 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 arch/riscv/kernel/kexec_image.c

(limited to 'arch')

diff --git a/arch/riscv/include/asm/image.h b/arch/riscv/include/asm/image.h
index e0b319af3681..8927a6ea1127 100644
--- a/arch/riscv/include/asm/image.h
+++ b/arch/riscv/include/asm/image.h
@@ -30,6 +30,8 @@
 			      RISCV_HEADER_VERSION_MINOR)
 
 #ifndef __ASSEMBLY__
+#define riscv_image_flag_field(flags, field)\
+			       (((flags) >> field##_SHIFT) & field##_MASK)
 /**
  * struct riscv_image_header - riscv kernel image header
  * @code0:		Executable code
diff --git a/arch/riscv/include/asm/kexec.h b/arch/riscv/include/asm/kexec.h
index 518825fe4160..b9ee8346cc8c 100644
--- a/arch/riscv/include/asm/kexec.h
+++ b/arch/riscv/include/asm/kexec.h
@@ -56,6 +56,7 @@ extern riscv_kexec_method riscv_kexec_norelocate;
 
 #ifdef CONFIG_KEXEC_FILE
 extern const struct kexec_file_ops elf_kexec_ops;
+extern const struct kexec_file_ops image_kexec_ops;
 
 struct purgatory_info;
 int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index d56305c8e631..0ead29826419 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -107,7 +107,7 @@ obj-$(CONFIG_HOTPLUG_CPU)	+= cpu-hotplug.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_KEXEC_CORE)	+= kexec_relocate.o crash_save_regs.o machine_kexec.o
-obj-$(CONFIG_KEXEC_FILE)	+= kexec_elf.o machine_kexec_file.o
+obj-$(CONFIG_KEXEC_FILE)	+= kexec_elf.o kexec_image.o machine_kexec_file.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_VMCORE_INFO)	+= vmcore_info.o
 
diff --git a/arch/riscv/kernel/kexec_image.c b/arch/riscv/kernel/kexec_image.c
new file mode 100644
index 000000000000..26a81774a78a
--- /dev/null
+++ b/arch/riscv/kernel/kexec_image.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * RISC-V Kexec image loader
+ *
+ */
+
+#define pr_fmt(fmt)	"kexec_file(Image): " fmt
+
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
+#include <linux/pe.h>
+#include <linux/string.h>
+#include <asm/byteorder.h>
+#include <asm/image.h>
+
+static int image_probe(const char *kernel_buf, unsigned long kernel_len)
+{
+	const struct riscv_image_header *h = (const struct riscv_image_header *)kernel_buf;
+
+	if (!h || kernel_len < sizeof(*h))
+		return -EINVAL;
+
+	/* According to Documentation/riscv/boot-image-header.rst,
+	 * use "magic2" field to check when version >= 0.2.
+	 */
+
+	if (h->version >= RISCV_HEADER_VERSION &&
+	    memcmp(&h->magic2, RISCV_IMAGE_MAGIC2, sizeof(h->magic2)))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void *image_load(struct kimage *image,
+			char *kernel, unsigned long kernel_len,
+			char *initrd, unsigned long initrd_len,
+			char *cmdline, unsigned long cmdline_len)
+{
+	struct riscv_image_header *h;
+	u64 flags;
+	bool be_image, be_kernel;
+	struct kexec_buf kbuf;
+	int ret;
+
+	/* Check Image header */
+	h = (struct riscv_image_header *)kernel;
+	if (!h->image_size) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Check endianness */
+	flags = le64_to_cpu(h->flags);
+	be_image = riscv_image_flag_field(flags, RISCV_IMAGE_FLAG_BE);
+	be_kernel = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN);
+	if (be_image != be_kernel) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Load the kernel image */
+	kbuf.image = image;
+	kbuf.buf_min = 0;
+	kbuf.buf_max = ULONG_MAX;
+	kbuf.top_down = false;
+
+	kbuf.buffer = kernel;
+	kbuf.bufsz = kernel_len;
+	kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
+	kbuf.memsz = le64_to_cpu(h->image_size);
+	kbuf.buf_align = le64_to_cpu(h->text_offset);
+
+	ret = kexec_add_buffer(&kbuf);
+	if (ret) {
+		pr_err("Error add kernel image ret=%d\n", ret);
+		goto out;
+	}
+
+	image->start = kbuf.mem;
+
+	pr_info("Loaded kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
+		kbuf.mem, kbuf.bufsz, kbuf.memsz);
+
+	ret = load_extra_segments(image, kbuf.mem, kbuf.memsz,
+				  initrd, initrd_len, cmdline, cmdline_len);
+
+out:
+	return ret ? ERR_PTR(ret) : NULL;
+}
+
+const struct kexec_file_ops image_kexec_ops = {
+	.probe = image_probe,
+	.load = image_load,
+};
diff --git a/arch/riscv/kernel/machine_kexec_file.c b/arch/riscv/kernel/machine_kexec_file.c
index 99bd5a5f4234..e36104af2e24 100644
--- a/arch/riscv/kernel/machine_kexec_file.c
+++ b/arch/riscv/kernel/machine_kexec_file.c
@@ -18,6 +18,7 @@
 
 const struct kexec_file_ops * const kexec_file_loaders[] = {
 	&elf_kexec_ops,
+	&image_kexec_ops,
 	NULL
 };
 
-- 
cgit v1.2.3


From 850d7b14c8f77407b7d2a1edeb83d3e36c46e7a8 Mon Sep 17 00:00:00 2001
From: Yao Zi <ziyao@disroot.org>
Date: Wed, 26 Mar 2025 07:34:51 +0000
Subject: riscv/kexec_file: Fix comment in purgatory relocator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apparently sec_base doesn't mean relocated symbol value, which seems a
copy-pasting error in the comment. Assigned with the address of section
indexed by sym->st_shndx, it should represent base address of the
relevant section. Let's fix the comment to avoid possible confusion.

Fixes: 838b3e28488f ("RISC-V: Load purgatory in kexec_file")
Signed-off-by: Yao Zi <ziyao@disroot.org>
Reviewed-by: Björn Töpel <bjorn@rivosinc.com>
Link: https://lore.kernel.org/r/20250326073450.57648-2-ziyao@disroot.org
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/elf_kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/elf_kexec.c b/arch/riscv/kernel/elf_kexec.c
index e783a72d051f..0dc5450f2c7f 100644
--- a/arch/riscv/kernel/elf_kexec.c
+++ b/arch/riscv/kernel/elf_kexec.c
@@ -390,7 +390,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 		const Elf_Sym *sym;	/* symbol to relocate */
 		unsigned long addr;	/* final location after relocation */
 		unsigned long val;	/* relocated symbol value */
-		unsigned long sec_base;	/* relocated symbol value */
+		unsigned long sec_base;	/* relocated section base address */
 		void *loc;		/* tmp location to modify */
 
 		sym = (void *)pi->ehdr + symtab->sh_offset;
-- 
cgit v1.2.3


From f0f4e64b9e3527c11dc6dcb005e577d661eb9ab5 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Mon, 21 Apr 2025 16:24:38 +0200
Subject: riscv: Introduce Zicbop instructions

The S-type instructions are first introduced and then used to define the
encoding of the Zicbop prefetching instructions.

Co-developed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@kernel.org>
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20250421142441.395849-2-alexghiti@rivosinc.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/insn-def.h | 60 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
index 71060a2f838e..02c92c1657d2 100644
--- a/arch/riscv/include/asm/insn-def.h
+++ b/arch/riscv/include/asm/insn-def.h
@@ -18,6 +18,13 @@
 #define INSN_I_RD_SHIFT			 7
 #define INSN_I_OPCODE_SHIFT		 0
 
+#define INSN_S_SIMM7_SHIFT		25
+#define INSN_S_RS2_SHIFT		20
+#define INSN_S_RS1_SHIFT		15
+#define INSN_S_FUNC3_SHIFT		12
+#define INSN_S_SIMM5_SHIFT		 7
+#define INSN_S_OPCODE_SHIFT		 0
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_AS_HAS_INSN
@@ -30,6 +37,10 @@
 	.insn	i \opcode, \func3, \rd, \rs1, \simm12
 	.endm
 
+	.macro insn_s, opcode, func3, rs2, simm12, rs1
+	.insn	s \opcode, \func3, \rs2, \simm12(\rs1)
+	.endm
+
 #else
 
 #include <asm/gpr-num.h>
@@ -51,10 +62,20 @@
 		 (\simm12 << INSN_I_SIMM12_SHIFT))
 	.endm
 
+	.macro insn_s, opcode, func3, rs2, simm12, rs1
+	.4byte	((\opcode << INSN_S_OPCODE_SHIFT) |		\
+		 (\func3 << INSN_S_FUNC3_SHIFT) |		\
+		 (.L__gpr_num_\rs2 << INSN_S_RS2_SHIFT) |	\
+		 (.L__gpr_num_\rs1 << INSN_S_RS1_SHIFT) |	\
+		 ((\simm12 & 0x1f) << INSN_S_SIMM5_SHIFT) |	\
+		 (((\simm12 >> 5) & 0x7f) << INSN_S_SIMM7_SHIFT))
+	.endm
+
 #endif
 
 #define __INSN_R(...)	insn_r __VA_ARGS__
 #define __INSN_I(...)	insn_i __VA_ARGS__
+#define __INSN_S(...)	insn_s __VA_ARGS__
 
 #else /* ! __ASSEMBLY__ */
 
@@ -66,6 +87,9 @@
 #define __INSN_I(opcode, func3, rd, rs1, simm12)	\
 	".insn	i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n"
 
+#define __INSN_S(opcode, func3, rs2, simm12, rs1)	\
+	".insn	s " opcode ", " func3 ", " rs2 ", " simm12 "(" rs1 ")\n"
+
 #else
 
 #include <linux/stringify.h>
@@ -92,12 +116,26 @@
 "		 (\\simm12 << " __stringify(INSN_I_SIMM12_SHIFT) "))\n"	\
 "	.endm\n"
 
+#define DEFINE_INSN_S							\
+	__DEFINE_ASM_GPR_NUMS						\
+"	.macro insn_s, opcode, func3, rs2, simm12, rs1\n"		\
+"	.4byte	((\\opcode << " __stringify(INSN_S_OPCODE_SHIFT) ") |"	\
+"		 (\\func3 << " __stringify(INSN_S_FUNC3_SHIFT) ") |"	\
+"		 (.L__gpr_num_\\rs2 << " __stringify(INSN_S_RS2_SHIFT) ") |" \
+"		 (.L__gpr_num_\\rs1 << " __stringify(INSN_S_RS1_SHIFT) ") |" \
+"		 ((\\simm12 & 0x1f) << " __stringify(INSN_S_SIMM5_SHIFT) ") |" \
+"		 (((\\simm12 >> 5) & 0x7f) << " __stringify(INSN_S_SIMM7_SHIFT) "))\n" \
+"	.endm\n"
+
 #define UNDEFINE_INSN_R							\
 "	.purgem insn_r\n"
 
 #define UNDEFINE_INSN_I							\
 "	.purgem insn_i\n"
 
+#define UNDEFINE_INSN_S							\
+"	.purgem insn_s\n"
+
 #define __INSN_R(opcode, func3, func7, rd, rs1, rs2)			\
 	DEFINE_INSN_R							\
 	"insn_r " opcode ", " func3 ", " func7 ", " rd ", " rs1 ", " rs2 "\n" \
@@ -108,6 +146,11 @@
 	"insn_i " opcode ", " func3 ", " rd ", " rs1 ", " simm12 "\n" \
 	UNDEFINE_INSN_I
 
+#define __INSN_S(opcode, func3, rs2, simm12, rs1)			\
+	DEFINE_INSN_S							\
+	"insn_s " opcode ", " func3 ", " rs2 ", " simm12 ", " rs1 "\n"	\
+	UNDEFINE_INSN_S
+
 #endif
 
 #endif /* ! __ASSEMBLY__ */
@@ -120,6 +163,10 @@
 	__INSN_I(RV_##opcode, RV_##func3, RV_##rd,		\
 		 RV_##rs1, RV_##simm12)
 
+#define INSN_S(opcode, func3, rs2, simm12, rs1)			\
+	__INSN_S(RV_##opcode, RV_##func3, RV_##rs2,		\
+		 RV_##simm12, RV_##rs1)
+
 #define RV_OPCODE(v)		__ASM_STR(v)
 #define RV_FUNC3(v)		__ASM_STR(v)
 #define RV_FUNC7(v)		__ASM_STR(v)
@@ -133,6 +180,7 @@
 #define RV___RS2(v)		__RV_REG(v)
 
 #define RV_OPCODE_MISC_MEM	RV_OPCODE(15)
+#define RV_OPCODE_OP_IMM	RV_OPCODE(19)
 #define RV_OPCODE_SYSTEM	RV_OPCODE(115)
 
 #define HFENCE_VVMA(vaddr, asid)				\
@@ -196,6 +244,18 @@
 	INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0),		\
 	       RS1(base), SIMM12(4))
 
+#define PREFETCH_I(base, offset)				\
+	INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(0),		\
+	       SIMM12((offset) & 0xfe0), RS1(base))
+
+#define PREFETCH_R(base, offset)				\
+	INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(1),		\
+	       SIMM12((offset) & 0xfe0), RS1(base))
+
+#define PREFETCH_W(base, offset)				\
+	INSN_S(OPCODE_OP_IMM, FUNC3(6), __RS2(3),		\
+	       SIMM12((offset) & 0xfe0), RS1(base))
+
 #define RISCV_PAUSE	".4byte 0x100000f"
 #define ZAWRS_WRS_NTO	".4byte 0x00d00073"
 #define ZAWRS_WRS_STO	".4byte 0x01d00073"
-- 
cgit v1.2.3


From 8d496b5a989120c1bce1ad8eb48ebae0350722d7 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Mon, 21 Apr 2025 16:24:39 +0200
Subject: riscv: Add support for Zicbop

Zicbop introduces cache blocks prefetching instructions, add the
necessary support for the kernel to use it in the coming commits.

Co-developed-by: Guo Ren <guoren@kernel.org>
Signed-off-by: Guo Ren <guoren@kernel.org>
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20250421142441.395849-3-alexghiti@rivosinc.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig                  | 15 +++++++++++++++
 arch/riscv/include/asm/barrier.h    |  5 -----
 arch/riscv/include/asm/cacheflush.h |  1 +
 arch/riscv/include/asm/hwcap.h      |  1 +
 arch/riscv/include/asm/insn-def.h   |  6 ++++++
 arch/riscv/include/asm/processor.h  |  1 -
 arch/riscv/kernel/cpufeature.c      | 21 +++++++++++++++++++++
 arch/riscv/mm/cacheflush.c          | 14 +++++++++++---
 8 files changed, 55 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bbec87b79309..28765ce563de 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -842,6 +842,21 @@ config RISCV_ISA_ZICBOZ
 
 	   If you don't know what to do here, say Y.
 
+config RISCV_ISA_ZICBOP
+	bool "Zicbop extension support for cache block prefetch"
+	depends on MMU
+	depends on RISCV_ALTERNATIVE
+	default y
+	help
+	  Adds support to dynamically detect the presence of the ZICBOP
+	  extension (Cache Block Prefetch Operations) and enable its
+	  usage.
+
+	  The Zicbop extension can be used to prefetch cache blocks for
+	  read/write fetch.
+
+	  If you don't know what to do here, say Y.
+
 config TOOLCHAIN_NEEDS_EXPLICIT_ZICSR_ZIFENCEI
 	def_bool y
 	# https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=aed44286efa8ae8717a77d94b51ac3614e2ca6dc
diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h
index e1d9bf1deca6..b8c5726d86ac 100644
--- a/arch/riscv/include/asm/barrier.h
+++ b/arch/riscv/include/asm/barrier.h
@@ -14,11 +14,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/fence.h>
 
-#define nop()		__asm__ __volatile__ ("nop")
-#define __nops(n)	".rept	" #n "\nnop\n.endr\n"
-#define nops(n)		__asm__ __volatile__ (__nops(n))
-
-
 /* These barriers need to enforce ordering on both devices or memory. */
 #define __mb()		RISCV_FENCE(iorw, iorw)
 #define __rmb()		RISCV_FENCE(ir, ir)
diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h
index 8de73f91bfa3..effa02c2e682 100644
--- a/arch/riscv/include/asm/cacheflush.h
+++ b/arch/riscv/include/asm/cacheflush.h
@@ -80,6 +80,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local);
 
 extern unsigned int riscv_cbom_block_size;
 extern unsigned int riscv_cboz_block_size;
+extern unsigned int riscv_cbop_block_size;
 void riscv_init_cbo_blocksizes(void);
 
 #ifdef CONFIG_RISCV_DMA_NONCOHERENT
diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
index e3cbf203cdde..affd63e11b0a 100644
--- a/arch/riscv/include/asm/hwcap.h
+++ b/arch/riscv/include/asm/hwcap.h
@@ -105,6 +105,7 @@
 #define RISCV_ISA_EXT_ZVFBFWMA		96
 #define RISCV_ISA_EXT_ZAAMO		97
 #define RISCV_ISA_EXT_ZALRSC		98
+#define RISCV_ISA_EXT_ZICBOP		99
 
 #define RISCV_ISA_EXT_XLINUXENVCFG	127
 
diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
index 02c92c1657d2..d5adbaec1d01 100644
--- a/arch/riscv/include/asm/insn-def.h
+++ b/arch/riscv/include/asm/insn-def.h
@@ -263,4 +263,10 @@
 
 #define RISCV_INSN_NOP4	_AC(0x00000013, U)
 
+#ifndef __ASSEMBLY__
+#define nop()           __asm__ __volatile__ ("nop")
+#define __nops(n)       ".rept  " #n "\nnop\n.endr\n"
+#define nops(n)         __asm__ __volatile__ (__nops(n))
+#endif
+
 #endif /* __ASM_INSN_DEF_H */
diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 5f56eb9d114a..09d4c963399a 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -52,7 +52,6 @@
 #endif
 
 #ifndef __ASSEMBLY__
-#include <linux/cpumask.h>
 
 struct task_struct;
 struct pt_regs;
diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
index 2054f6c4b0ae..743d53415572 100644
--- a/arch/riscv/kernel/cpufeature.c
+++ b/arch/riscv/kernel/cpufeature.c
@@ -32,6 +32,7 @@
 #define NUM_ALPHA_EXTS ('z' - 'a' + 1)
 
 static bool any_cpu_has_zicboz;
+static bool any_cpu_has_zicbop;
 static bool any_cpu_has_zicbom;
 
 unsigned long elf_hwcap __read_mostly;
@@ -119,6 +120,21 @@ static int riscv_ext_zicboz_validate(const struct riscv_isa_ext_data *data,
 	return 0;
 }
 
+static int riscv_ext_zicbop_validate(const struct riscv_isa_ext_data *data,
+				     const unsigned long *isa_bitmap)
+{
+	if (!riscv_cbop_block_size) {
+		pr_err("Zicbop detected in ISA string, disabling as no cbop-block-size found\n");
+		return -EINVAL;
+	}
+	if (!is_power_of_2(riscv_cbop_block_size)) {
+		pr_err("Zicbop disabled as cbop-block-size present, but is not a power-of-2\n");
+		return -EINVAL;
+	}
+	any_cpu_has_zicbop = true;
+	return 0;
+}
+
 static int riscv_ext_f_validate(const struct riscv_isa_ext_data *data,
 				const unsigned long *isa_bitmap)
 {
@@ -442,6 +458,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
 	__RISCV_ISA_EXT_SUPERSET_VALIDATE(v, RISCV_ISA_EXT_v, riscv_v_exts, riscv_ext_vector_float_validate),
 	__RISCV_ISA_EXT_DATA(h, RISCV_ISA_EXT_h),
 	__RISCV_ISA_EXT_SUPERSET_VALIDATE(zicbom, RISCV_ISA_EXT_ZICBOM, riscv_xlinuxenvcfg_exts, riscv_ext_zicbom_validate),
+	__RISCV_ISA_EXT_DATA_VALIDATE(zicbop, RISCV_ISA_EXT_ZICBOP, riscv_ext_zicbop_validate),
 	__RISCV_ISA_EXT_SUPERSET_VALIDATE(zicboz, RISCV_ISA_EXT_ZICBOZ, riscv_xlinuxenvcfg_exts, riscv_ext_zicboz_validate),
 	__RISCV_ISA_EXT_DATA(ziccrse, RISCV_ISA_EXT_ZICCRSE),
 	__RISCV_ISA_EXT_DATA(zicntr, RISCV_ISA_EXT_ZICNTR),
@@ -1112,6 +1129,10 @@ void __init riscv_user_isa_enable(void)
 		current->thread.envcfg |= ENVCFG_CBCFE;
 	else if (any_cpu_has_zicbom)
 		pr_warn("Zicbom disabled as it is unavailable on some harts\n");
+
+	if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICBOP) &&
+	    any_cpu_has_zicbop)
+		pr_warn("Zicbop disabled as it is unavailable on some harts\n");
 }
 
 #ifdef CONFIG_RISCV_ALTERNATIVE
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index b81672729887..6265052ef8b6 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -101,6 +101,9 @@ EXPORT_SYMBOL_GPL(riscv_cbom_block_size);
 unsigned int riscv_cboz_block_size;
 EXPORT_SYMBOL_GPL(riscv_cboz_block_size);
 
+unsigned int riscv_cbop_block_size;
+EXPORT_SYMBOL_GPL(riscv_cbop_block_size);
+
 static void __init cbo_get_block_size(struct device_node *node,
 				      const char *name, u32 *block_size,
 				      unsigned long *first_hartid)
@@ -125,8 +128,8 @@ static void __init cbo_get_block_size(struct device_node *node,
 
 void __init riscv_init_cbo_blocksizes(void)
 {
-	unsigned long cbom_hartid, cboz_hartid;
-	u32 cbom_block_size = 0, cboz_block_size = 0;
+	unsigned long cbom_hartid, cboz_hartid, cbop_hartid;
+	u32 cbom_block_size = 0, cboz_block_size = 0, cbop_block_size = 0;
 	struct device_node *node;
 	struct acpi_table_header *rhct;
 	acpi_status status;
@@ -138,13 +141,15 @@ void __init riscv_init_cbo_blocksizes(void)
 					   &cbom_block_size, &cbom_hartid);
 			cbo_get_block_size(node, "riscv,cboz-block-size",
 					   &cboz_block_size, &cboz_hartid);
+			cbo_get_block_size(node, "riscv,cbop-block-size",
+					   &cbop_block_size, &cbop_hartid);
 		}
 	} else {
 		status = acpi_get_table(ACPI_SIG_RHCT, 0, &rhct);
 		if (ACPI_FAILURE(status))
 			return;
 
-		acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, NULL);
+		acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, &cbop_block_size);
 		acpi_put_table((struct acpi_table_header *)rhct);
 	}
 
@@ -153,6 +158,9 @@ void __init riscv_init_cbo_blocksizes(void)
 
 	if (cboz_block_size)
 		riscv_cboz_block_size = cboz_block_size;
+
+	if (cbop_block_size)
+		riscv_cbop_block_size = cbop_block_size;
 }
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From a5f947c73115efb6fb0d9579e71ce1ee5cf706aa Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Mon, 21 Apr 2025 16:24:40 +0200
Subject: riscv: Add ARCH_HAS_PREFETCH[W] support with Zicbop

Enable Linux prefetch and prefetchw primitives using Zicbop.

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Link: https://lore.kernel.org/r/20231231082955.16516-3-guoren@kernel.org
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20250421142441.395849-4-alexghiti@rivosinc.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/processor.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 09d4c963399a..39dfab495a4c 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -13,6 +13,9 @@
 #include <vdso/processor.h>
 
 #include <asm/ptrace.h>
+#include <asm/insn-def.h>
+#include <asm/alternative-macros.h>
+#include <asm/hwcap.h>
 
 #define arch_get_mmap_end(addr, len, flags)			\
 ({								\
@@ -135,6 +138,27 @@ static inline void arch_thread_struct_whitelist(unsigned long *offset,
 #define KSTK_EIP(tsk)		(task_pt_regs(tsk)->epc)
 #define KSTK_ESP(tsk)		(task_pt_regs(tsk)->sp)
 
+#define PREFETCH_ASM(x)							\
+	ALTERNATIVE(__nops(1), PREFETCH_R(x, 0), 0,			\
+		    RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
+
+#define PREFETCHW_ASM(x)						\
+	ALTERNATIVE(__nops(1), PREFETCH_W(x, 0), 0,			\
+		    RISCV_ISA_EXT_ZICBOP, CONFIG_RISCV_ISA_ZICBOP)
+
+#ifdef CONFIG_RISCV_ISA_ZICBOP
+#define ARCH_HAS_PREFETCH
+static inline void prefetch(const void *x)
+{
+	__asm__ __volatile__(PREFETCH_ASM(%0) : : "r" (x) : "memory");
+}
+
+#define ARCH_HAS_PREFETCHW
+static inline void prefetchw(const void *x)
+{
+	__asm__ __volatile__(PREFETCHW_ASM(%0) : : "r" (x) : "memory");
+}
+#endif /* CONFIG_RISCV_ISA_ZICBOP */
 
 /* Do necessary setup to start up a newly executed thread. */
 extern void start_thread(struct pt_regs *regs,
-- 
cgit v1.2.3


From eb87e56d651d0a72009842bc0d8eeae7b605c97d Mon Sep 17 00:00:00 2001
From: Guo Ren <guoren@linux.alibaba.com>
Date: Mon, 21 Apr 2025 16:24:41 +0200
Subject: riscv: xchg: Prefetch the destination word for sc.w

The cost of changing a cacheline from shared to exclusive state can be
significant, especially when this is triggered by an exclusive store,
since it may result in having to retry the transaction.

This patch makes use of prefetch.w to prefetch cachelines for write
prior to lr/sc loops when using the xchg_small atomic routine.

This patch is inspired by commit 0ea366f5e1b6 ("arm64: atomics:
prefetch the destination word for write prior to stxr").

Signed-off-by: Guo Ren <guoren@linux.alibaba.com>
Signed-off-by: Guo Ren <guoren@kernel.org>
Link: https://lore.kernel.org/r/20231231082955.16516-4-guoren@kernel.org
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Link: https://lore.kernel.org/r/20250421142441.395849-5-alexghiti@rivosinc.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/cmpxchg.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index 2ec119eb147b..0b749e710216 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -13,6 +13,7 @@
 #include <asm/hwcap.h>
 #include <asm/insn-def.h>
 #include <asm/cpufeature-macros.h>
+#include <asm/processor.h>
 
 #define __arch_xchg_masked(sc_sfx, swap_sfx, prepend, sc_append,		\
 			   swap_append, r, p, n)				\
@@ -37,6 +38,7 @@
 										\
 		__asm__ __volatile__ (						\
 		       prepend							\
+		       PREFETCHW_ASM(%5)					\
 		       "0:	lr.w %0, %2\n"					\
 		       "	and  %1, %0, %z4\n"				\
 		       "	or   %1, %1, %z3\n"				\
@@ -44,7 +46,7 @@
 		       "	bnez %1, 0b\n"					\
 		       sc_append						\
 		       : "=&r" (__retx), "=&r" (__rc), "+A" (*(__ptr32b))	\
-		       : "rJ" (__newx), "rJ" (~__mask)				\
+		       : "rJ" (__newx), "rJ" (~__mask), "rJ" (__ptr32b)		\
 		       : "memory");						\
 										\
 		r = (__typeof__(*(p)))((__retx & __mask) >> __s);		\
-- 
cgit v1.2.3


From c3cc2a4a3a23faf6b88459471c1c16ab3837cc2f Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Fri, 21 Mar 2025 13:39:54 +0100
Subject: riscv: Add support for PUD THP

Add the necessary page table functions to deal with PUD THP, this
enables the use of PUD pfnmap.

Link: https://lore.kernel.org/r/20250321123954.225097-1-alexghiti@rivosinc.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig                  |  1 +
 arch/riscv/include/asm/pgtable-64.h |  5 +-
 arch/riscv/include/asm/pgtable.h    | 97 +++++++++++++++++++++++++++++++++++++
 arch/riscv/include/asm/tlbflush.h   |  2 +
 arch/riscv/mm/pgtable.c             | 10 ++++
 arch/riscv/mm/tlbflush.c            |  7 +++
 6 files changed, 120 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bbec87b79309..63ef4aa03506 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -143,6 +143,7 @@ config RISCV
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if 64BIT && MMU
+	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if 64BIT && MMU
 	select HAVE_ARCH_USERFAULTFD_MINOR if 64BIT && USERFAULTFD
 	select HAVE_ARCH_VMAP_STACK if MMU && 64BIT
 	select HAVE_ASM_MODVERSIONS
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 0897dd99ab8d..a2c00235c447 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -184,7 +184,7 @@ static inline int pud_none(pud_t pud)
 
 static inline int pud_bad(pud_t pud)
 {
-	return !pud_present(pud);
+	return !pud_present(pud) || (pud_val(pud) & _PAGE_LEAF);
 }
 
 #define pud_leaf	pud_leaf
@@ -401,6 +401,7 @@ p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline int pte_devmap(pte_t pte);
 static inline pte_t pmd_pte(pmd_t pmd);
+static inline pte_t pud_pte(pud_t pud);
 
 static inline int pmd_devmap(pmd_t pmd)
 {
@@ -409,7 +410,7 @@ static inline int pmd_devmap(pmd_t pmd)
 
 static inline int pud_devmap(pud_t pud)
 {
-	return 0;
+	return pte_devmap(pud_pte(pud));
 }
 
 static inline int pgd_devmap(pgd_t pgd)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 428e48e5f57d..b84e2ff83cb7 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -902,6 +902,103 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
 #define pmdp_collapse_flush pmdp_collapse_flush
 extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 				 unsigned long address, pmd_t *pmdp);
+
+static inline pud_t pud_wrprotect(pud_t pud)
+{
+	return pte_pud(pte_wrprotect(pud_pte(pud)));
+}
+
+static inline int pud_trans_huge(pud_t pud)
+{
+	return pud_leaf(pud);
+}
+
+static inline int pud_dirty(pud_t pud)
+{
+	return pte_dirty(pud_pte(pud));
+}
+
+static inline pud_t pud_mkyoung(pud_t pud)
+{
+	return pte_pud(pte_mkyoung(pud_pte(pud)));
+}
+
+static inline pud_t pud_mkold(pud_t pud)
+{
+	return pte_pud(pte_mkold(pud_pte(pud)));
+}
+
+static inline pud_t pud_mkdirty(pud_t pud)
+{
+	return pte_pud(pte_mkdirty(pud_pte(pud)));
+}
+
+static inline pud_t pud_mkclean(pud_t pud)
+{
+	return pte_pud(pte_mkclean(pud_pte(pud)));
+}
+
+static inline pud_t pud_mkwrite(pud_t pud)
+{
+	return pte_pud(pte_mkwrite_novma(pud_pte(pud)));
+}
+
+static inline pud_t pud_mkhuge(pud_t pud)
+{
+	return pud;
+}
+
+static inline pud_t pud_mkdevmap(pud_t pud)
+{
+	return pte_pud(pte_mkdevmap(pud_pte(pud)));
+}
+
+static inline int pudp_set_access_flags(struct vm_area_struct *vma,
+					unsigned long address, pud_t *pudp,
+					pud_t entry, int dirty)
+{
+	return ptep_set_access_flags(vma, address, (pte_t *)pudp, pud_pte(entry), dirty);
+}
+
+static inline int pudp_test_and_clear_young(struct vm_area_struct *vma,
+					    unsigned long address, pud_t *pudp)
+{
+	return ptep_test_and_clear_young(vma, address, (pte_t *)pudp);
+}
+
+static inline int pud_young(pud_t pud)
+{
+	return pte_young(pud_pte(pud));
+}
+
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+					unsigned long address, pud_t *pudp)
+{
+	pte_t *ptep = (pte_t *)pudp;
+
+	update_mmu_cache(vma, address, ptep);
+}
+
+static inline pud_t pudp_establish(struct vm_area_struct *vma,
+				   unsigned long address, pud_t *pudp, pud_t pud)
+{
+	page_table_check_pud_set(vma->vm_mm, pudp, pud);
+	return __pud(atomic_long_xchg((atomic_long_t *)pudp, pud_val(pud)));
+}
+
+static inline pud_t pud_mkinvalid(pud_t pud)
+{
+	return __pud(pud_val(pud) & ~(_PAGE_PRESENT | _PAGE_PROT_NONE));
+}
+
+extern pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+			     pud_t *pudp);
+
+static inline pud_t pud_modify(pud_t pud, pgprot_t newprot)
+{
+	return pte_pud(pte_modify(pud_pte(pud), newprot));
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /*
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index ce0dd0fed764..1a20dd746a49 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -56,6 +56,8 @@ void local_flush_tlb_kernel_range(unsigned long start, unsigned long end);
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 			unsigned long end);
+void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			 unsigned long end);
 #endif
 
 bool arch_tlbbatch_should_defer(struct mm_struct *mm);
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 4ae67324f992..8b6c0a112a8d 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -154,4 +154,14 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
 	flush_tlb_mm(vma->vm_mm);
 	return pmd;
 }
+
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+		      pud_t *pudp)
+{
+	VM_WARN_ON_ONCE(!pud_present(*pudp));
+	pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
+
+	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+	return old;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index f9e27ba1df99..97c8fde3cbfe 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -182,6 +182,13 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	__flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm),
 			  start, end - start, PMD_SIZE);
 }
+
+void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start,
+			 unsigned long end)
+{
+	__flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm),
+			  start, end - start, PUD_SIZE);
+}
 #endif
 
 bool arch_tlbbatch_should_defer(struct mm_struct *mm)
-- 
cgit v1.2.3


From be17c0df67959fe4f88dac75dc26ed9252d4b133 Mon Sep 17 00:00:00 2001
From: Samuel Holland <samuel.holland@sifive.com>
Date: Wed, 9 Apr 2025 10:14:51 -0700
Subject: riscv: module: Optimize PLT/GOT entry counting

perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340919fb ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting is done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
This allows accumulating PLT/GOT relocations across sections so sorting
and counting is only done once per module.

Signed-off-by: Samuel Holland <samuel.holland@sifive.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Link: https://lore.kernel.org/r/20250409171526.862481-3-samuel.holland@sifive.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/module-sections.c | 81 +++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 16 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
index 91d0b355ceef..75551ac6504c 100644
--- a/arch/riscv/kernel/module-sections.c
+++ b/arch/riscv/kernel/module-sections.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/sort.h>
 
 unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
 {
@@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
 	return (unsigned long)&plt[i];
 }
 
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
+#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
 {
-	return x->r_info == y->r_info && x->r_addend == y->r_addend;
+	const Elf_Rela *x = a, *y = b;
+	int i;
+
+	/* sort by type, symbol index and addend */
+	i = cmp_3way(x->r_info, y->r_info);
+	if (i == 0)
+		i = cmp_3way(x->r_addend, y->r_addend);
+	return i;
 }
 
 static bool duplicate_rela(const Elf_Rela *rela, int idx)
 {
-	int i;
-	for (i = 0; i < idx; i++) {
-		if (is_rela_equal(&rela[i], &rela[idx]))
-			return true;
-	}
-	return false;
+	/*
+	 * Entries are sorted by type, symbol index and addend. That means
+	 * that, if a duplicate entry exists, it must be in the preceding slot.
+	 */
+	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
 }
 
-static void count_max_entries(Elf_Rela *relas, int num,
+static void count_max_entries(const Elf_Rela *relas, size_t num,
 			      unsigned int *plts, unsigned int *gots)
 {
-	for (int i = 0; i < num; i++) {
+	for (size_t i = 0; i < num; i++) {
+		if (duplicate_rela(relas, i))
+			continue;
+
 		switch (ELF_R_TYPE(relas[i].r_info)) {
 		case R_RISCV_CALL_PLT:
 		case R_RISCV_PLT32:
-			if (!duplicate_rela(relas, i))
-				(*plts)++;
+			(*plts)++;
 			break;
 		case R_RISCV_GOT_HI20:
-			if (!duplicate_rela(relas, i))
-				(*gots)++;
+			(*gots)++;
 			break;
+		default:
+			unreachable();
 		}
 	}
 }
 
+static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
+{
+	switch (ELF_R_TYPE(rela->r_info)) {
+	case R_RISCV_CALL_PLT:
+	case R_RISCV_GOT_HI20:
+	case R_RISCV_PLT32:
+		return true;
+	default:
+		return false;
+	}
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
+	size_t num_scratch_relas = 0;
 	unsigned int num_plts = 0;
 	unsigned int num_gots = 0;
+	Elf_Rela *scratch = NULL;
+	size_t scratch_size = 0;
 	int i;
 
 	/*
@@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 
 	/* Calculate the maxinum number of entries */
 	for (i = 0; i < ehdr->e_shnum; i++) {
+		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
-		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
+		size_t scratch_size_needed;
 
 		if (sechdrs[i].sh_type != SHT_RELA)
 			continue;
@@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
 			continue;
 
-		count_max_entries(relas, num_rela, &num_plts, &num_gots);
+		/*
+		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
+		 * close together, so sort a copy of the section to avoid interfering.
+		 */
+		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
+		if (scratch_size_needed > scratch_size) {
+			scratch_size = scratch_size_needed;
+			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
+			if (!scratch)
+				return -ENOMEM;
+		}
+
+		for (size_t j = 0; j < num_relas; j++)
+			if (rela_needs_plt_got_entry(&relas[j]))
+				scratch[num_scratch_relas++] = relas[j];
+	}
+
+	if (scratch) {
+		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
+		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
+		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
+		kvfree(scratch);
 	}
 
 	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
-- 
cgit v1.2.3


From 48d9aabf2dc5d93b402ce55e3187e95698f2b9e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B0=A2=E8=87=B4=E9=82=A6=20=28XIE=20Zhibang=29?=
 <Yeking@Red54.com>
Date: Fri, 28 Mar 2025 10:14:22 +0000
Subject: RISC-V: Kconfig: Fix help text of CMDLINE_EXTEND
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It is the built-in command line appended to the bootloader command line,
not the bootloader command line appended to the built-in command line.

Fixes: 3aed8c43267e ("RISC-V: Update Kconfig to better handle CMDLINE")
Signed-off-by: 谢致邦 (XIE Zhibang) <Yeking@Red54.com>
Link: https://lore.kernel.org/r/tencent_A93C7FB46BFD20054AD2FEF4645913FF550A@qq.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 674cf6ff7188..78640cd353fd 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -1176,8 +1176,8 @@ config CMDLINE_FALLBACK
 config CMDLINE_EXTEND
 	bool "Extend bootloader kernel arguments"
 	help
-	  The command-line arguments provided during boot will be
-	  appended to the built-in command line. This is useful in
+	  The built-in command line will be appended to the command-
+	  line arguments provided during boot. This is useful in
 	  cases where the provided arguments are insufficient and
 	  you don't want to or cannot modify them.
 
-- 
cgit v1.2.3


From d7e0cce103663a60b14c52a0bcad62c7e8c0e6e8 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Tue, 22 Apr 2025 19:31:56 +0800
Subject: riscv: Make regs_irqs_disabled() more clear

The return value of regs_irqs_disabled() is true or false, so change
its type to reflect that and also make it always inline.

Suggested-by: Huacai Chen <chenhuacai@loongson.cn>
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250422113156.25742-1-yangtiezhu@loongson.cn
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/ptrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/ptrace.h b/arch/riscv/include/asm/ptrace.h
index 2910231977cb..a7dc0e330757 100644
--- a/arch/riscv/include/asm/ptrace.h
+++ b/arch/riscv/include/asm/ptrace.h
@@ -175,7 +175,7 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
 	return 0;
 }
 
-static inline int regs_irqs_disabled(struct pt_regs *regs)
+static __always_inline bool regs_irqs_disabled(struct pt_regs *regs)
 {
 	return !(regs->status & SR_PIE);
 }
-- 
cgit v1.2.3


From 415a8c81da3dab0a585bd4f8d505a11ad5a171a7 Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Mon, 21 Apr 2025 16:14:13 +0200
Subject: riscv: hwprobe: export Zabha extension
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export Zabha through the hwprobe syscall.

Reviewed-by: Clément Léger <cleger@rivosinc.com>
Link: https://lore.kernel.org/r/20250421141413.394444-1-alexghiti@rivosinc.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/uapi/asm/hwprobe.h | 1 +
 arch/riscv/kernel/sys_hwprobe.c       | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/include/uapi/asm/hwprobe.h b/arch/riscv/include/uapi/asm/hwprobe.h
index 3c2fce939673..fca15f2bf6f3 100644
--- a/arch/riscv/include/uapi/asm/hwprobe.h
+++ b/arch/riscv/include/uapi/asm/hwprobe.h
@@ -81,6 +81,7 @@ struct riscv_hwprobe {
 #define		RISCV_HWPROBE_EXT_ZICBOM	(1ULL << 55)
 #define		RISCV_HWPROBE_EXT_ZAAMO		(1ULL << 56)
 #define		RISCV_HWPROBE_EXT_ZALRSC	(1ULL << 57)
+#define		RISCV_HWPROBE_EXT_ZABHA		(1ULL << 58)
 #define RISCV_HWPROBE_KEY_CPUPERF_0	5
 #define		RISCV_HWPROBE_MISALIGNED_UNKNOWN	(0 << 0)
 #define		RISCV_HWPROBE_MISALIGNED_EMULATED	(1 << 0)
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 249aec8594a9..ed3123396a96 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -96,6 +96,7 @@ static void hwprobe_isa_ext0(struct riscv_hwprobe *pair,
 		 * presence in the hart_isa bitmap, are made.
 		 */
 		EXT_KEY(ZAAMO);
+		EXT_KEY(ZABHA);
 		EXT_KEY(ZACAS);
 		EXT_KEY(ZALRSC);
 		EXT_KEY(ZAWRS);
-- 
cgit v1.2.3


From a4348546332c9fae12b29acb514535e0a52b9b3c Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti <alexghiti@rivosinc.com>
Date: Mon, 2 Jun 2025 21:39:14 +0200
Subject: riscv: make unsafe user copy routines use existing assembly routines

The current implementation is underperforming and in addition, it
triggers misaligned access traps on platforms which do not handle
misaligned accesses in hardware.

Use the existing assembly routines to solve both problems at once.

Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250602193918.868962-2-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/asm-prototypes.h |  2 +-
 arch/riscv/include/asm/uaccess.h        | 33 ++++++----------------
 arch/riscv/lib/riscv_v_helpers.c        | 11 ++++++--
 arch/riscv/lib/uaccess.S                | 50 ++++++++++++++++++++++-----------
 arch/riscv/lib/uaccess_vector.S         | 15 ++++++++--
 5 files changed, 63 insertions(+), 48 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/asm-prototypes.h b/arch/riscv/include/asm/asm-prototypes.h
index cd627ec289f1..5d10edde6d17 100644
--- a/arch/riscv/include/asm/asm-prototypes.h
+++ b/arch/riscv/include/asm/asm-prototypes.h
@@ -12,7 +12,7 @@ long long __ashlti3(long long a, int b);
 #ifdef CONFIG_RISCV_ISA_V
 
 #ifdef CONFIG_MMU
-asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n);
+asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n, bool enable_sum);
 #endif /* CONFIG_MMU  */
 
 void xor_regs_2_(unsigned long bytes, unsigned long *__restrict p1,
diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index 87d01168f80a..046de7ced09c 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -450,35 +450,18 @@ static inline void user_access_restore(unsigned long enabled) { }
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
 } while (0)
 
-#define unsafe_copy_loop(dst, src, len, type, op, label)		\
-	while (len >= sizeof(type)) {					\
-		op(*(type *)(src), (type __user *)(dst), label);	\
-		dst += sizeof(type);					\
-		src += sizeof(type);					\
-		len -= sizeof(type);					\
-	}
+unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to,
+	const void *from, unsigned long n);
+unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to,
+	const void __user *from, unsigned long n);
 
 #define unsafe_copy_to_user(_dst, _src, _len, label)			\
-do {									\
-	char __user *__ucu_dst = (_dst);				\
-	const char *__ucu_src = (_src);					\
-	size_t __ucu_len = (_len);					\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, unsafe_put_user, label);	\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, unsafe_put_user, label);	\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, unsafe_put_user, label);	\
-	unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, unsafe_put_user, label);	\
-} while (0)
+	if (__asm_copy_to_user_sum_enabled(_dst, _src, _len))		\
+		goto label;
 
 #define unsafe_copy_from_user(_dst, _src, _len, label)			\
-do {									\
-	char *__ucu_dst = (_dst);					\
-	const char __user *__ucu_src = (_src);				\
-	size_t __ucu_len = (_len);					\
-	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u64, unsafe_get_user, label);	\
-	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u32, unsafe_get_user, label);	\
-	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u16, unsafe_get_user, label);	\
-	unsafe_copy_loop(__ucu_src, __ucu_dst, __ucu_len, u8, unsafe_get_user, label);	\
-} while (0)
+	if (__asm_copy_from_user_sum_enabled(_dst, _src, _len))		\
+		goto label;
 
 #else /* CONFIG_MMU */
 #include <asm-generic/uaccess.h>
diff --git a/arch/riscv/lib/riscv_v_helpers.c b/arch/riscv/lib/riscv_v_helpers.c
index be38a93cedae..7bbdfc6d4552 100644
--- a/arch/riscv/lib/riscv_v_helpers.c
+++ b/arch/riscv/lib/riscv_v_helpers.c
@@ -16,8 +16,11 @@
 #ifdef CONFIG_MMU
 size_t riscv_v_usercopy_threshold = CONFIG_RISCV_ISA_V_UCOPY_THRESHOLD;
 int __asm_vector_usercopy(void *dst, void *src, size_t n);
+int __asm_vector_usercopy_sum_enabled(void *dst, void *src, size_t n);
 int fallback_scalar_usercopy(void *dst, void *src, size_t n);
-asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n)
+int fallback_scalar_usercopy_sum_enabled(void *dst, void *src, size_t n);
+asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n,
+				     bool enable_sum)
 {
 	size_t remain, copied;
 
@@ -26,7 +29,8 @@ asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n)
 		goto fallback;
 
 	kernel_vector_begin();
-	remain = __asm_vector_usercopy(dst, src, n);
+	remain = enable_sum ? __asm_vector_usercopy(dst, src, n) :
+			      __asm_vector_usercopy_sum_enabled(dst, src, n);
 	kernel_vector_end();
 
 	if (remain) {
@@ -40,6 +44,7 @@ asmlinkage int enter_vector_usercopy(void *dst, void *src, size_t n)
 	return remain;
 
 fallback:
-	return fallback_scalar_usercopy(dst, src, n);
+	return enable_sum ? fallback_scalar_usercopy(dst, src, n) :
+			    fallback_scalar_usercopy_sum_enabled(dst, src, n);
 }
 #endif
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index 6a9f116bb545..4efea1b3326c 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -17,14 +17,43 @@ SYM_FUNC_START(__asm_copy_to_user)
 	ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V)
 	REG_L	t0, riscv_v_usercopy_threshold
 	bltu	a2, t0, fallback_scalar_usercopy
-	tail enter_vector_usercopy
+	li	a3, 1
+	tail 	enter_vector_usercopy
 #endif
-SYM_FUNC_START(fallback_scalar_usercopy)
+SYM_FUNC_END(__asm_copy_to_user)
+EXPORT_SYMBOL(__asm_copy_to_user)
+SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user)
+EXPORT_SYMBOL(__asm_copy_from_user)
 
+SYM_FUNC_START(fallback_scalar_usercopy)
 	/* Enable access to user memory */
-	li t6, SR_SUM
-	csrs CSR_STATUS, t6
+	li	t6, SR_SUM
+	csrs 	CSR_STATUS, t6
+	mv 	t6, ra
 
+	call 	fallback_scalar_usercopy_sum_enabled
+
+	/* Disable access to user memory */
+	mv 	ra, t6
+	li 	t6, SR_SUM
+	csrc 	CSR_STATUS, t6
+	ret
+SYM_FUNC_END(fallback_scalar_usercopy)
+
+SYM_FUNC_START(__asm_copy_to_user_sum_enabled)
+#ifdef CONFIG_RISCV_ISA_V
+	ALTERNATIVE("j fallback_scalar_usercopy_sum_enabled", "nop", 0, RISCV_ISA_EXT_ZVE32X, CONFIG_RISCV_ISA_V)
+	REG_L	t0, riscv_v_usercopy_threshold
+	bltu	a2, t0, fallback_scalar_usercopy_sum_enabled
+	li	a3, 0
+	tail 	enter_vector_usercopy
+#endif
+SYM_FUNC_END(__asm_copy_to_user_sum_enabled)
+SYM_FUNC_ALIAS(__asm_copy_from_user_sum_enabled, __asm_copy_to_user_sum_enabled)
+EXPORT_SYMBOL(__asm_copy_from_user_sum_enabled)
+EXPORT_SYMBOL(__asm_copy_to_user_sum_enabled)
+
+SYM_FUNC_START(fallback_scalar_usercopy_sum_enabled)
 	/*
 	 * Save the terminal address which will be used to compute the number
 	 * of bytes copied in case of a fixup exception.
@@ -178,23 +207,12 @@ SYM_FUNC_START(fallback_scalar_usercopy)
 	bltu	a0, t0, 4b	/* t0 - end of dst */
 
 .Lout_copy_user:
-	/* Disable access to user memory */
-	csrc CSR_STATUS, t6
 	li	a0, 0
 	ret
-
-	/* Exception fixup code */
 10:
-	/* Disable access to user memory */
-	csrc CSR_STATUS, t6
 	sub a0, t5, a0
 	ret
-SYM_FUNC_END(__asm_copy_to_user)
-SYM_FUNC_END(fallback_scalar_usercopy)
-EXPORT_SYMBOL(__asm_copy_to_user)
-SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user)
-EXPORT_SYMBOL(__asm_copy_from_user)
-
+SYM_FUNC_END(fallback_scalar_usercopy_sum_enabled)
 
 SYM_FUNC_START(__clear_user)
 
diff --git a/arch/riscv/lib/uaccess_vector.S b/arch/riscv/lib/uaccess_vector.S
index 7c45f26de4f7..03b5560609a2 100644
--- a/arch/riscv/lib/uaccess_vector.S
+++ b/arch/riscv/lib/uaccess_vector.S
@@ -24,7 +24,18 @@ SYM_FUNC_START(__asm_vector_usercopy)
 	/* Enable access to user memory */
 	li	t6, SR_SUM
 	csrs	CSR_STATUS, t6
+	mv	t6, ra
 
+	call 	__asm_vector_usercopy_sum_enabled
+
+	/* Disable access to user memory */
+	mv 	ra, t6
+	li 	t6, SR_SUM
+	csrc	CSR_STATUS, t6
+	ret
+SYM_FUNC_END(__asm_vector_usercopy)
+
+SYM_FUNC_START(__asm_vector_usercopy_sum_enabled)
 loop:
 	vsetvli iVL, iNum, e8, ELEM_LMUL_SETTING, ta, ma
 	fixup vle8.v vData, (pSrc), 10f
@@ -36,8 +47,6 @@ loop:
 
 	/* Exception fixup for vector load is shared with normal exit */
 10:
-	/* Disable access to user memory */
-	csrc	CSR_STATUS, t6
 	mv	a0, iNum
 	ret
 
@@ -49,4 +58,4 @@ loop:
 	csrr	t2, CSR_VSTART
 	sub	iNum, iNum, t2
 	j	10b
-SYM_FUNC_END(__asm_vector_usercopy)
+SYM_FUNC_END(__asm_vector_usercopy_sum_enabled)
-- 
cgit v1.2.3


From 020667d661f9be65167174d28a6eda7102f7293f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Mon, 2 Jun 2025 21:39:15 +0200
Subject: riscv: process: use unsigned int instead of unsigned long for
 put_user()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The specification of prctl() for GET_UNALIGN_CTL states that the value is
returned in an unsigned int * address passed as an unsigned long. Change
the type to match that and avoid an unaligned access as well.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250602193918.868962-3-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/kernel/process.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
index 7c244de77180..fe10e326f44e 100644
--- a/arch/riscv/kernel/process.c
+++ b/arch/riscv/kernel/process.c
@@ -57,7 +57,7 @@ int get_unalign_ctl(struct task_struct *tsk, unsigned long adr)
 	if (!unaligned_ctl_available())
 		return -EINVAL;
 
-	return put_user(tsk->thread.align_ctl, (unsigned long __user *)adr);
+	return put_user(tsk->thread.align_ctl, (unsigned int __user *)adr);
 }
 
 void __show_regs(struct pt_regs *regs)
-- 
cgit v1.2.3


From ca1a66cdd685030738cf077e3955fdedfe39fbb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= <cleger@rivosinc.com>
Date: Mon, 2 Jun 2025 21:39:16 +0200
Subject: riscv: uaccess: do not do misaligned accesses in get/put_user()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Doing misaligned access to userspace memory would make a trap on
platform where it is emulated. Latest fixes removed the kernel
capability to do unaligned accesses to userspace memory safely since
interrupts are kept disabled at all time during that. Thus doing so
would crash the kernel.

Such behavior was detected with GET_UNALIGN_CTL() that was doing
a put_user() with an unsigned long* address that should have been an
unsigned int*. Reenabling kernel misaligned access emulation is a bit
risky and it would also degrade performances. Rather than doing that,
we will try to avoid any misaligned accessed by using copy_from/to_user()
which does not do any misaligned accesses. This can be done only for
!CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS and thus allows to only generate
a bit more code for this config.

Signed-off-by: Clément Léger <cleger@rivosinc.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20250602193918.868962-4-cleger@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/uaccess.h | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h
index 046de7ced09c..d472da4450e6 100644
--- a/arch/riscv/include/asm/uaccess.h
+++ b/arch/riscv/include/asm/uaccess.h
@@ -169,8 +169,19 @@ do {								\
 
 #endif /* CONFIG_64BIT */
 
+unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to,
+	const void *from, unsigned long n);
+unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to,
+	const void __user *from, unsigned long n);
+
 #define __get_user_nocheck(x, __gu_ptr, label)			\
 do {								\
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&	\
+	    !IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) {	\
+		if (__asm_copy_from_user_sum_enabled(&(x), __gu_ptr, sizeof(*__gu_ptr))) \
+			goto label;				\
+		break;						\
+	}							\
 	switch (sizeof(*__gu_ptr)) {				\
 	case 1:							\
 		__get_user_asm("lb", (x), __gu_ptr, label);	\
@@ -297,6 +308,13 @@ do {								\
 
 #define __put_user_nocheck(x, __gu_ptr, label)			\
 do {								\
+	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&	\
+	    !IS_ALIGNED((uintptr_t)__gu_ptr, sizeof(*__gu_ptr))) {	\
+		__inttype(x) val = (__inttype(x))x;			\
+		if (__asm_copy_to_user_sum_enabled(__gu_ptr, &(val), sizeof(*__gu_ptr))) \
+			goto label;				\
+		break;						\
+	}							\
 	switch (sizeof(*__gu_ptr)) {				\
 	case 1:							\
 		__put_user_asm("sb", (x), __gu_ptr, label);	\
@@ -450,11 +468,6 @@ static inline void user_access_restore(unsigned long enabled) { }
 	(x) = (__force __typeof__(*(ptr)))__gu_val;			\
 } while (0)
 
-unsigned long __must_check __asm_copy_to_user_sum_enabled(void __user *to,
-	const void *from, unsigned long n);
-unsigned long __must_check __asm_copy_from_user_sum_enabled(void *to,
-	const void __user *from, unsigned long n);
-
 #define unsafe_copy_to_user(_dst, _src, _len, label)			\
 	if (__asm_copy_to_user_sum_enabled(_dst, _src, _len))		\
 		goto label;
-- 
cgit v1.2.3


From c39d53750ff96b282c869a0184a7c3ecfd298ca8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miquel=20Sabat=C3=A9=20Sol=C3=A0?= <mikisabate@gmail.com>
Date: Thu, 1 May 2025 15:03:09 +0200
Subject: riscv: Improve Kconfig help for RISCV_ISA_V_PREEMPTIVE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a couple of spelling issues plus some minor details on the grammar.

Signed-off-by: Miquel Sabaté Solà <mikisabate@gmail.com>
Link: https://lore.kernel.org/r/20250501130309.14803-1-mikisabate@gmail.com
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index a93af30727ee..98a3ecdc65f6 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -669,12 +669,12 @@ config RISCV_ISA_V_PREEMPTIVE
 	default y
 	help
 	  Usually, in-kernel SIMD routines are run with preemption disabled.
-	  Functions which envoke long running SIMD thus must yield core's
+	  Functions which invoke long running SIMD thus must yield the core's
 	  vector unit to prevent blocking other tasks for too long.
 
-	  This config allows kernel to run SIMD without explicitly disable
-	  preemption. Enabling this config will result in higher memory
-	  consumption due to the allocation of per-task's kernel Vector context.
+	  This config allows the kernel to run SIMD without explicitly disabling
+	  preemption. Enabling this config will result in higher memory consumption
+	  due to the allocation of per-task's kernel Vector context.
 
 config RISCV_ISA_ZAWRS
 	bool "Zawrs extension support for more efficient busy waiting"
-- 
cgit v1.2.3


From a56972698810089d8f1bdc296cd709726db7176b Mon Sep 17 00:00:00 2001
From: Mayuresh Chitale <mchitale@ventanamicro.com>
Date: Tue, 2 Jul 2024 15:56:37 +0530
Subject: riscv: mm: Add support for Svinval extension

The Svinval extension splits SFENCE.VMA instruction into finer-grained
invalidation and ordering operations and is mandatory for RVA23S64 profile.
When Svinval is enabled the local_flush_tlb_range_threshold_asid function
should use the following sequence to optimize the tlb flushes instead of
a simple sfence.vma:

sfence.w.inval
svinval.vma
  .
  .
svinval.vma
sfence.inval.ir

The maximum number of consecutive svinval.vma instructions that
can be executed in local_flush_tlb_range_threshold_asid function
is limited to 64. This is required to avoid soft lockups and the
approach is similar to that used in arm64.

Signed-off-by: Mayuresh Chitale <mchitale@ventanamicro.com>
Reviewed-by: Andrew Jones <ajones@ventanamicro.com>
Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Link: https://lore.kernel.org/r/20240702102637.9074-1-mchitale@ventanamicro.com
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/mm/tlbflush.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'arch')

diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index f9e27ba1df99..6289ed5c7eb4 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -7,6 +7,27 @@
 #include <linux/mmu_notifier.h>
 #include <asm/sbi.h>
 #include <asm/mmu_context.h>
+#include <asm/cpufeature.h>
+
+#define has_svinval()	riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL)
+
+static inline void local_sfence_inval_ir(void)
+{
+	asm volatile(SFENCE_INVAL_IR() ::: "memory");
+}
+
+static inline void local_sfence_w_inval(void)
+{
+	asm volatile(SFENCE_W_INVAL() ::: "memory");
+}
+
+static inline void local_sinval_vma(unsigned long vma, unsigned long asid)
+{
+	if (asid != FLUSH_TLB_NO_ASID)
+		asm volatile(SINVAL_VMA(%0, %1) : : "r" (vma), "r" (asid) : "memory");
+	else
+		asm volatile(SINVAL_VMA(%0, zero) : : "r" (vma) : "memory");
+}
 
 /*
  * Flush entire TLB if number of entries to be flushed is greater
@@ -27,6 +48,16 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start,
 		return;
 	}
 
+	if (has_svinval()) {
+		local_sfence_w_inval();
+		for (i = 0; i < nr_ptes_in_range; ++i) {
+			local_sinval_vma(start, asid);
+			start += stride;
+		}
+		local_sfence_inval_ir();
+		return;
+	}
+
 	for (i = 0; i < nr_ptes_in_range; ++i) {
 		local_flush_tlb_page_asid(start, asid);
 		start += stride;
-- 
cgit v1.2.3


From a869b8c29f864b9530a6473a30c09546333b571a Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Sat, 26 Apr 2025 21:59:54 +0800
Subject: riscv: enable mseal sysmap for RV64

Provide support for CONFIG_MSEAL_SYSTEM_MAPPINGS for RV64, covering the
vdso, vvar.

Passed sysmap_is_sealed and mseal_test self tests.
Passed booting a buildroot rootfs image and a cli debian rootfs image.

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Cc: Jeff Xu <jeffxu@chromium.org>
Link: https://lore.kernel.org/r/20250426135954.5614-1-jszhang@kernel.org
Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig       | 1 +
 arch/riscv/kernel/vdso.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index bbec87b79309..3cb0b05eef62 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -70,6 +70,7 @@ config RISCV
 	# LLD >= 14: https://github.com/llvm/llvm-project/issues/50505
 	select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000
 	select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000
+	select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS if 64BIT && MMU
 	select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU
 	select ARCH_SUPPORTS_PER_VMA_LOCK if MMU
 	select ARCH_SUPPORTS_RT
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index cc2895d1fbc2..3a8e038b10a2 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -136,7 +136,7 @@ static int __setup_additional_pages(struct mm_struct *mm,
 
 	ret =
 	   _install_special_mapping(mm, vdso_base, vdso_text_len,
-		(VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC),
+		(VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_SEALED_SYSMAP),
 		vdso_info->cm);
 
 	if (IS_ERR(ret))
-- 
cgit v1.2.3


From ee0d03053e7009a3a3532fb37f6c94bfa0a8cca3 Mon Sep 17 00:00:00 2001
From: Xi Ruoyao <xry111@xry111.site>
Date: Fri, 11 Apr 2025 10:46:00 +0800
Subject: RISC-V: vDSO: Wire up getrandom() vDSO implementation

Hook up the generic vDSO implementation to the generic vDSO getrandom
implementation by providing the required __arch_chacha20_blocks_nostack
and getrandom_syscall implementations. Also wire up the selftests.

The benchmark result:

	vdso: 25000000 times in 2.466341333 seconds
	libc: 25000000 times in 41.447720005 seconds
	syscall: 25000000 times in 41.043926672 seconds

	vdso: 25000000 x 256 times in 162.286219353 seconds
	libc: 25000000 x 256 times in 2953.855018685 seconds
	syscall: 25000000 x 256 times in 2796.268546000 seconds

[ alex: - Fix dynamic relocation
        - Squash Nathan's fix https://lore.kernel.org/all/20250423-riscv-fix-compat_vdso-lld-v2-1-b7bbbc244501@kernel.org/
	- Add comment from Loongarch ]

Signed-off-by: Xi Ruoyao <xry111@xry111.site>
Link: https://lore.kernel.org/r/20250411024600.16045-1-xry111@xry111.site
Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/Kconfig                         |   1 +
 arch/riscv/include/asm/vdso/getrandom.h    |  30 ++++
 arch/riscv/kernel/vdso/Makefile            |  13 ++
 arch/riscv/kernel/vdso/getrandom.c         |  10 ++
 arch/riscv/kernel/vdso/vdso.lds.S          |   3 +
 arch/riscv/kernel/vdso/vgetrandom-chacha.S | 249 +++++++++++++++++++++++++++++
 6 files changed, 306 insertions(+)
 create mode 100644 arch/riscv/include/asm/vdso/getrandom.h
 create mode 100644 arch/riscv/kernel/vdso/getrandom.c
 create mode 100644 arch/riscv/kernel/vdso/vgetrandom-chacha.S

(limited to 'arch')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 3cb0b05eef62..b5b3ead92f64 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -219,6 +219,7 @@ config RISCV
 	select THREAD_INFO_IN_TASK
 	select TRACE_IRQFLAGS_SUPPORT
 	select UACCESS_MEMCPY if !MMU
+	select VDSO_GETRANDOM if HAVE_GENERIC_VDSO
 	select USER_STACKTRACE_SUPPORT
 	select ZONE_DMA32 if 64BIT
 
diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h
new file mode 100644
index 000000000000..8dc92441702a
--- /dev/null
+++ b/arch/riscv/include/asm/vdso/getrandom.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#ifndef __ASM_VDSO_GETRANDOM_H
+#define __ASM_VDSO_GETRANDOM_H
+
+#ifndef __ASSEMBLY__
+
+#include <asm/unistd.h>
+
+static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags)
+{
+	register long ret asm("a0");
+	register long nr asm("a7") = __NR_getrandom;
+	register void *buffer asm("a0") = _buffer;
+	register size_t len asm("a1") = _len;
+	register unsigned int flags asm("a2") = _flags;
+
+	asm volatile ("ecall\n"
+		      : "+r" (ret)
+		      : "r" (nr), "r" (buffer), "r" (len), "r" (flags)
+		      : "memory");
+
+	return ret;
+}
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __ASM_VDSO_GETRANDOM_H */
diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
index ad73607abc28..dca888852d93 100644
--- a/arch/riscv/kernel/vdso/Makefile
+++ b/arch/riscv/kernel/vdso/Makefile
@@ -13,9 +13,17 @@ vdso-syms += flush_icache
 vdso-syms += hwprobe
 vdso-syms += sys_hwprobe
 
+ifdef CONFIG_VDSO_GETRANDOM
+vdso-syms += getrandom
+endif
+
 # Files to link into the vdso
 obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o
 
+ifdef CONFIG_VDSO_GETRANDOM
+obj-vdso += vgetrandom-chacha.o
+endif
+
 ccflags-y := -fno-stack-protector
 ccflags-y += -DDISABLE_BRANCH_PROFILING
 ccflags-y += -fno-builtin
@@ -24,6 +32,10 @@ ifneq ($(c-gettimeofday-y),)
   CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y)
 endif
 
+ifneq ($(c-getrandom-y),)
+  CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y)
+endif
+
 CFLAGS_hwprobe.o += -fPIC
 
 # Build rules
@@ -38,6 +50,7 @@ endif
 
 # Disable -pg to prevent insert call site
 CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS)
+CFLAGS_REMOVE_getrandom.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS)
 CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS)
 
 # Force dependency
diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c
new file mode 100644
index 000000000000..f21922e8cebd
--- /dev/null
+++ b/arch/riscv/kernel/vdso/getrandom.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ */
+#include <linux/types.h>
+
+ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
+{
+	return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len);
+}
diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S
index 8e86965a8aae..7c15b0f4ee3b 100644
--- a/arch/riscv/kernel/vdso/vdso.lds.S
+++ b/arch/riscv/kernel/vdso/vdso.lds.S
@@ -79,6 +79,9 @@ VERSION
 		__vdso_flush_icache;
 #ifndef COMPAT_VDSO
 		__vdso_riscv_hwprobe;
+#endif
+#if defined(CONFIG_VDSO_GETRANDOM) && !defined(COMPAT_VDSO)
+		__vdso_getrandom;
 #endif
 	local: *;
 	};
diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S
new file mode 100644
index 000000000000..5f0dad8f2373
--- /dev/null
+++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S
@@ -0,0 +1,249 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
+ *
+ * Based on arch/loongarch/vdso/vgetrandom-chacha.S.
+ */
+
+#include <asm/asm.h>
+#include <linux/linkage.h>
+
+.text
+
+.macro	ROTRI	rd rs imm
+	slliw	t0, \rs, 32 - \imm
+	srliw	\rd, \rs, \imm
+	or	\rd, \rd, t0
+.endm
+
+.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
+	\op	\d0, \d0, \s0
+	\op	\d1, \d1, \s1
+	\op	\d2, \d2, \s2
+	\op	\d3, \d3, \s3
+.endm
+
+/*
+ *	a0: output bytes
+ * 	a1: 32-byte key input
+ *	a2: 8-byte counter input/output
+ *	a3: number of 64-byte blocks to write to output
+ */
+SYM_FUNC_START(__arch_chacha20_blocks_nostack)
+
+#define output		a0
+#define key		a1
+#define counter		a2
+#define nblocks		a3
+#define i		a4
+#define state0		s0
+#define state1		s1
+#define state2		s2
+#define state3		s3
+#define state4		s4
+#define state5		s5
+#define state6		s6
+#define state7		s7
+#define state8		s8
+#define state9		s9
+#define state10		s10
+#define state11		s11
+#define state12		a5
+#define state13		a6
+#define state14		a7
+#define state15		t1
+#define cnt		t2
+#define copy0		t3
+#define copy1		t4
+#define copy2		t5
+#define copy3		t6
+
+/* Packs to be used with OP_4REG */
+#define line0		state0, state1, state2, state3
+#define line1		state4, state5, state6, state7
+#define line2		state8, state9, state10, state11
+#define line3		state12, state13, state14, state15
+
+#define line1_perm	state5, state6, state7, state4
+#define line2_perm	state10, state11, state8, state9
+#define line3_perm	state15, state12, state13, state14
+
+#define copy		copy0, copy1, copy2, copy3
+
+#define _16		16, 16, 16, 16
+#define _20		20, 20, 20, 20
+#define _24		24, 24, 24, 24
+#define _25		25, 25, 25, 25
+
+	/*
+	 * The ABI requires s0-s9 saved.
+	 * This does not violate the stack-less requirement: no sensitive data
+	 * is spilled onto the stack.
+	 */
+	addi		sp, sp, -12*SZREG
+	REG_S		s0,         (sp)
+	REG_S		s1,    SZREG(sp)
+	REG_S		s2,  2*SZREG(sp)
+	REG_S		s3,  3*SZREG(sp)
+	REG_S		s4,  4*SZREG(sp)
+	REG_S		s5,  5*SZREG(sp)
+	REG_S		s6,  6*SZREG(sp)
+	REG_S		s7,  7*SZREG(sp)
+	REG_S		s8,  8*SZREG(sp)
+	REG_S		s9,  9*SZREG(sp)
+	REG_S		s10, 10*SZREG(sp)
+	REG_S		s11, 11*SZREG(sp)
+
+	ld		cnt, (counter)
+
+	li		copy0, 0x61707865
+	li		copy1, 0x3320646e
+	li		copy2, 0x79622d32
+	li		copy3, 0x6b206574
+
+.Lblock:
+	/* state[0,1,2,3] = "expand 32-byte k" */
+	mv		state0, copy0
+	mv		state1, copy1
+	mv		state2, copy2
+	mv		state3, copy3
+
+	/* state[4,5,..,11] = key */
+	lw		state4,   (key)
+	lw		state5,  4(key)
+	lw		state6,  8(key)
+	lw		state7,  12(key)
+	lw		state8,  16(key)
+	lw		state9,  20(key)
+	lw		state10, 24(key)
+	lw		state11, 28(key)
+
+	/* state[12,13] = counter */
+	mv		state12, cnt
+	srli		state13, cnt, 32
+
+	/* state[14,15] = 0 */
+	mv		state14, zero
+	mv		state15, zero
+
+	li		i, 10
+.Lpermute:
+	/* odd round */
+	OP_4REG	addw	line0, line1
+	OP_4REG	xor	line3, line0
+	OP_4REG	ROTRI	line3, _16
+
+	OP_4REG	addw	line2, line3
+	OP_4REG	xor	line1, line2
+	OP_4REG	ROTRI	line1, _20
+
+	OP_4REG	addw	line0, line1
+	OP_4REG	xor	line3, line0
+	OP_4REG	ROTRI	line3, _24
+
+	OP_4REG	addw	line2, line3
+	OP_4REG	xor	line1, line2
+	OP_4REG	ROTRI	line1, _25
+
+	/* even round */
+	OP_4REG	addw	line0, line1_perm
+	OP_4REG	xor	line3_perm, line0
+	OP_4REG	ROTRI	line3_perm, _16
+
+	OP_4REG	addw	line2_perm, line3_perm
+	OP_4REG	xor	line1_perm, line2_perm
+	OP_4REG	ROTRI	line1_perm, _20
+
+	OP_4REG	addw	line0, line1_perm
+	OP_4REG	xor	line3_perm, line0
+	OP_4REG	ROTRI	line3_perm, _24
+
+	OP_4REG	addw	line2_perm, line3_perm
+	OP_4REG	xor	line1_perm, line2_perm
+	OP_4REG	ROTRI	line1_perm, _25
+
+	addi		i, i, -1
+	bnez		i, .Lpermute
+
+	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
+	OP_4REG	addw	line0, copy
+	sw		state0,   (output)
+	sw		state1,  4(output)
+	sw		state2,  8(output)
+	sw		state3, 12(output)
+
+	/* from now on state[0,1,2,3] are scratch registers  */
+
+	/* state[0,1,2,3] = lo(key) */
+	lw		state0,   (key)
+	lw		state1,  4(key)
+	lw		state2,  8(key)
+	lw		state3, 12(key)
+
+	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
+	OP_4REG	addw	line1, line0
+	sw		state4, 16(output)
+	sw		state5, 20(output)
+	sw		state6, 24(output)
+	sw		state7, 28(output)
+
+	/* state[0,1,2,3] = hi(key) */
+	lw		state0, 16(key)
+	lw		state1, 20(key)
+	lw		state2, 24(key)
+	lw		state3, 28(key)
+
+	/* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */
+	OP_4REG	addw	line2, line0
+	sw		state8,  32(output)
+	sw		state9,  36(output)
+	sw		state10, 40(output)
+	sw		state11, 44(output)
+
+	/* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
+	addw		state12, state12, cnt
+	srli		state0, cnt, 32
+	addw		state13, state13, state0
+	sw		state12, 48(output)
+	sw		state13, 52(output)
+	sw		state14, 56(output)
+	sw		state15, 60(output)
+
+	/* ++counter */
+	addi		cnt, cnt, 1
+
+	/* output += 64 */
+	addi		output, output, 64
+	/* --nblocks */
+	addi		nblocks, nblocks, -1
+	bnez		nblocks, .Lblock
+
+	/* counter = [cnt_lo, cnt_hi] */
+	sd		cnt, (counter)
+
+	/* Zero out the potentially sensitive regs, in case nothing uses these
+	 * again.  As at now copy[0,1,2,3] just contains "expand 32-byte k" and
+	 * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we
+	 * only need to zero state[12,...,15].
+	 */
+	mv		state12, zero
+	mv		state13, zero
+	mv		state14, zero
+	mv		state15, zero
+
+	REG_L		s0,         (sp)
+	REG_L		s1,    SZREG(sp)
+	REG_L		s2,  2*SZREG(sp)
+	REG_L		s3,  3*SZREG(sp)
+	REG_L		s4,  4*SZREG(sp)
+	REG_L		s5,  5*SZREG(sp)
+	REG_L		s6,  6*SZREG(sp)
+	REG_L		s7,  7*SZREG(sp)
+	REG_L		s8,  8*SZREG(sp)
+	REG_L		s9,  9*SZREG(sp)
+	REG_L		s10, 10*SZREG(sp)
+	REG_L		s11, 11*SZREG(sp)
+	addi		sp, sp, 12*SZREG
+
+	ret
+SYM_FUNC_END(__arch_chacha20_blocks_nostack)
-- 
cgit v1.2.3


From 265d6aba165c500389c80d394ac247460c443ef5 Mon Sep 17 00:00:00 2001
From: Cyril Bur <cyrilbur@tenstorrent.com>
Date: Mon, 2 Jun 2025 12:15:43 +0000
Subject: riscv: uaccess: Only restore the CSR_STATUS SUM bit

During switch to csrs will OR the value of the register into the
corresponding csr. In this case we're only interested in restoring the
SUM bit not the entire register.

Signed-off-by: Cyril Bur <cyrilbur@tenstorrent.com>
Link: https://lore.kernel.org/r/20250522160954.429333-1-cyrilbur@tenstorrent.com
Co-developed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
Fixes: 788aa64c01f1 ("riscv: save the SR_SUM status over switches")
Link: https://lore.kernel.org/r/20250602121543.1544278-1-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt <palmer@dabbelt.com>
---
 arch/riscv/include/asm/processor.h | 2 +-
 arch/riscv/kernel/asm-offsets.c    | 6 +++---
 arch/riscv/kernel/entry.S          | 9 +++++----
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/include/asm/processor.h b/arch/riscv/include/asm/processor.h
index 8111250f3c1b..24d3af4d3807 100644
--- a/arch/riscv/include/asm/processor.h
+++ b/arch/riscv/include/asm/processor.h
@@ -110,7 +110,7 @@ struct thread_struct {
 	struct __riscv_d_ext_state fstate;
 	unsigned long bad_cause;
 	unsigned long envcfg;
-	unsigned long status;
+	unsigned long sum;
 	u32 riscv_v_flags;
 	u32 vstate_ctrl;
 	struct __riscv_v_ext_state vstate;
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 9420ec6a50fd..6e8c0d6feae9 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -34,7 +34,7 @@ void asm_offsets(void)
 	OFFSET(TASK_THREAD_S9, task_struct, thread.s[9]);
 	OFFSET(TASK_THREAD_S10, task_struct, thread.s[10]);
 	OFFSET(TASK_THREAD_S11, task_struct, thread.s[11]);
-	OFFSET(TASK_THREAD_STATUS, task_struct, thread.status);
+	OFFSET(TASK_THREAD_SUM, task_struct, thread.sum);
 
 	OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
 	OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
@@ -347,8 +347,8 @@ void asm_offsets(void)
 		  offsetof(struct task_struct, thread.s[11])
 		- offsetof(struct task_struct, thread.ra)
 	);
-	DEFINE(TASK_THREAD_STATUS_RA,
-		  offsetof(struct task_struct, thread.status)
+	DEFINE(TASK_THREAD_SUM_RA,
+		  offsetof(struct task_struct, thread.sum)
 		- offsetof(struct task_struct, thread.ra)
 	);
 
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 00bd0de9faa2..a49e19ce3a97 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -399,14 +399,15 @@ SYM_FUNC_START(__switch_to)
 	REG_S s11, TASK_THREAD_S11_RA(a3)
 
 	/* save the user space access flag */
-	li    s0, SR_SUM
-	csrr  s1, CSR_STATUS
-	REG_S s1, TASK_THREAD_STATUS_RA(a3)
+	csrr  s0, CSR_STATUS
+	REG_S s0, TASK_THREAD_SUM_RA(a3)
 
 	/* Save the kernel shadow call stack pointer */
 	scs_save_current
 	/* Restore context from next->thread */
-	REG_L s0,  TASK_THREAD_STATUS_RA(a4)
+	REG_L s0,  TASK_THREAD_SUM_RA(a4)
+	li    s1,  SR_SUM
+	and   s0,  s0, s1
 	csrs  CSR_STATUS, s0
 	REG_L ra,  TASK_THREAD_RA_RA(a4)
 	REG_L sp,  TASK_THREAD_SP_RA(a4)
-- 
cgit v1.2.3


From 15ac613f124e51a6623975efad9657b1f3ee47e7 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Date: Mon, 19 May 2025 15:56:57 +0100
Subject: KVM: s390: rename PROT_NONE to PROT_TYPE_DUMMY

The enum type prot_type declared in arch/s390/kvm/gaccess.c declares an
unfortunate identifier within it - PROT_NONE.

This clashes with the protection bit define from the uapi for mmap()
declared in include/uapi/asm-generic/mman-common.h, which is indeed what
those casually reading this code would assume this to refer to.

This means that any changes which subsequently alter headers in any way
which results in the uapi header being imported here will cause build
errors.

Resolve the issue by renaming PROT_NONE to PROT_TYPE_DUMMY.

Link: https://lkml.kernel.org/r/20250519145657.178365-1-lorenzo.stoakes@oracle.com
Fixes: b3cefd6bf16e ("KVM: s390: Pass initialized arg even if unused")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Suggested-by: Ignacio Moreno Gonzalez <Ignacio.MorenoGonzalez@kuka.com>
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202505140943.IgHDa9s7-lkp@intel.com/
Acked-by: Christian Borntraeger <borntraeger@linux.ibm.com>
Acked-by: Ignacio Moreno Gonzalez <Ignacio.MorenoGonzalez@kuka.com>
Acked-by: Yang Shi <yang@os.amperecomputing.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: <stable@vger.kernel.org>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/s390/kvm/gaccess.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index e23670e1949c..21c2e61fece4 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -319,7 +319,7 @@ enum prot_type {
 	PROT_TYPE_DAT  = 3,
 	PROT_TYPE_IEP  = 4,
 	/* Dummy value for passing an initialized value when code != PGM_PROTECTION */
-	PROT_NONE,
+	PROT_TYPE_DUMMY,
 };
 
 static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar,
@@ -335,7 +335,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva,
 	switch (code) {
 	case PGM_PROTECTION:
 		switch (prot) {
-		case PROT_NONE:
+		case PROT_TYPE_DUMMY:
 			/* We should never get here, acts like termination */
 			WARN_ON_ONCE(1);
 			break;
@@ -805,7 +805,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 			gpa = kvm_s390_real_to_abs(vcpu, ga);
 			if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) {
 				rc = PGM_ADDRESSING;
-				prot = PROT_NONE;
+				prot = PROT_TYPE_DUMMY;
 			}
 		}
 		if (rc)
@@ -963,7 +963,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
 		if (rc == PGM_PROTECTION)
 			prot = PROT_TYPE_KEYC;
 		else
-			prot = PROT_NONE;
+			prot = PROT_TYPE_DUMMY;
 		rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate);
 	}
 out_unlock:
-- 
cgit v1.2.3


From e242bbbb6d7ac7556aa1e358294dc7e3c82cc902 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <thomas.weissschuh@linutronix.de>
Date: Thu, 5 Jun 2025 20:34:18 +0800
Subject: LoongArch: vDSO: Correctly use asm parameters in syscall wrappers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The syscall wrappers use the "a0" register for two different register
variables, both the first argument and the return value. Here the "ret"
variable is used as both input and output while the argument register is
only used as input. Clang treats the conflicting input parameters as an
undefined behaviour and optimizes away the argument assignment.

The code seems to work by chance for the most part today but that may
change in the future. Specifically clock_gettime_fallback() fails with
clockids from 16 to 23, as implemented by the upcoming auxiliary clocks.

Switch the "ret" register variable to a pure output, similar to the
other architectures' vDSO code. This works in both clang and GCC.

Link: https://lore.kernel.org/lkml/20250602102825-42aa84f0-23f1-4d10-89fc-e8bbaffd291a@linutronix.de/
Link: https://lore.kernel.org/lkml/20250519082042.742926976@linutronix.de/
Fixes: c6b99bed6b8f ("LoongArch: Add VDSO and VSYSCALL support")
Fixes: 18efd0b10e0f ("LoongArch: vDSO: Wire up getrandom() vDSO implementation")
Cc: stable@vger.kernel.org
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Yanteng Si <si.yanteng@linux.dev>
Reviewed-by: WANG Xuerui <git@xen0n.name>
Reviewed-by: Xi Ruoyao <xry111@xry111.site>
Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/include/asm/vdso/getrandom.h    | 2 +-
 arch/loongarch/include/asm/vdso/gettimeofday.h | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/loongarch/include/asm/vdso/getrandom.h b/arch/loongarch/include/asm/vdso/getrandom.h
index 48c43f55b039..a81724b69f29 100644
--- a/arch/loongarch/include/asm/vdso/getrandom.h
+++ b/arch/loongarch/include/asm/vdso/getrandom.h
@@ -20,7 +20,7 @@ static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, uns
 
 	asm volatile(
 	"      syscall 0\n"
-	: "+r" (ret)
+	: "=r" (ret)
 	: "r" (nr), "r" (buffer), "r" (len), "r" (flags)
 	: "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7", "$t8",
 	  "memory");
diff --git a/arch/loongarch/include/asm/vdso/gettimeofday.h b/arch/loongarch/include/asm/vdso/gettimeofday.h
index 88cfcf133116..f15503e3336c 100644
--- a/arch/loongarch/include/asm/vdso/gettimeofday.h
+++ b/arch/loongarch/include/asm/vdso/gettimeofday.h
@@ -25,7 +25,7 @@ static __always_inline long gettimeofday_fallback(
 
 	asm volatile(
 	"       syscall 0\n"
-	: "+r" (ret)
+	: "=r" (ret)
 	: "r" (nr), "r" (tv), "r" (tz)
 	: "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7",
 	  "$t8", "memory");
@@ -44,7 +44,7 @@ static __always_inline long clock_gettime_fallback(
 
 	asm volatile(
 	"       syscall 0\n"
-	: "+r" (ret)
+	: "=r" (ret)
 	: "r" (nr), "r" (clkid), "r" (ts)
 	: "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7",
 	  "$t8", "memory");
@@ -63,7 +63,7 @@ static __always_inline int clock_getres_fallback(
 
 	asm volatile(
 	"       syscall 0\n"
-	: "+r" (ret)
+	: "=r" (ret)
 	: "r" (nr), "r" (clkid), "r" (ts)
 	: "$t0", "$t1", "$t2", "$t3", "$t4", "$t5", "$t6", "$t7",
 	  "$t8", "memory");
-- 
cgit v1.2.3


From 07aeb50e6c74e59c268ad6460c6b51411bea1759 Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Thu, 5 Jun 2025 20:34:34 +0800
Subject: LoongArch: dts: Add PWM support to Loongson-2K0500

The module is supported, enable it.

Reviewed-by: Yanteng Si <si.yanteng@linux.dev>
Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/boot/dts/loongson-2k0500.dtsi | 160 +++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)

(limited to 'arch')

diff --git a/arch/loongarch/boot/dts/loongson-2k0500.dtsi b/arch/loongarch/boot/dts/loongson-2k0500.dtsi
index 3b38ff8853a7..760c60eebb89 100644
--- a/arch/loongarch/boot/dts/loongson-2k0500.dtsi
+++ b/arch/loongarch/boot/dts/loongson-2k0500.dtsi
@@ -169,6 +169,166 @@
 			interrupts = <3>;
 		};
 
+		pwm@1ff5c000 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c000 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <24 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c010 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c010 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <24 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c020 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c020 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <24 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c030 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c030 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <24 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c040 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c040 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <25 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c050 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c050 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <25 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c060 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c060 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <25 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c070 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c070 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <25 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c080 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c080 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <26 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c090 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c090 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <26 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c0a0 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c0a0 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <26 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c0b0 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c0b0 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <26 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c0c0 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c0c0 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c0d0 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c0d0 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c0e0 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c0e0 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1ff5c0f0 {
+			compatible = "loongson,ls2k0500-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1ff5c0f0 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
 		gmac0: ethernet@1f020000 {
 			compatible = "snps,dwmac-3.70a";
 			reg = <0x0 0x1f020000 0x0 0x10000>;
-- 
cgit v1.2.3


From abd000dbaee45fa620ddaeb13757b9a8985ce50f Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Thu, 5 Jun 2025 20:34:34 +0800
Subject: LoongArch: dts: Add PWM support to Loongson-2K1000

The module is supported, enable it.

Also, add the pwm-fan and cooling-maps associated with it.

Reviewed-by: Yanteng Si <si.yanteng@linux.dev>
Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/boot/dts/loongson-2k1000-ref.dts | 24 ++++++++++++++
 arch/loongarch/boot/dts/loongson-2k1000.dtsi    | 42 ++++++++++++++++++++++++-
 2 files changed, 65 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts
index 3514ea78f525..78ea995abf1c 100644
--- a/arch/loongarch/boot/dts/loongson-2k1000-ref.dts
+++ b/arch/loongarch/boot/dts/loongson-2k1000-ref.dts
@@ -5,6 +5,7 @@
 
 /dts-v1/;
 
+#include "dt-bindings/thermal/thermal.h"
 #include "loongson-2k1000.dtsi"
 
 / {
@@ -38,6 +39,13 @@
 			linux,cma-default;
 		};
 	};
+
+	fan0: pwm-fan {
+		compatible = "pwm-fan";
+		cooling-levels = <255 153 85 25>;
+		pwms = <&pwm1 0 100000 0>;
+		#cooling-cells = <2>;
+	};
 };
 
 &gmac0 {
@@ -92,6 +100,22 @@
 	#size-cells = <0>;
 };
 
+&pwm1 {
+	status = "okay";
+
+	pinctrl-0 = <&pwm1_pins_default>;
+	pinctrl-names = "default";
+};
+
+&cpu_thermal {
+	cooling-maps {
+		map0 {
+			trip = <&cpu_alert>;
+			cooling-device = <&fan0 THERMAL_NO_LIMIT THERMAL_NO_LIMIT>;
+		};
+	};
+};
+
 &ehci0 {
 	status = "okay";
 };
diff --git a/arch/loongarch/boot/dts/loongson-2k1000.dtsi b/arch/loongarch/boot/dts/loongson-2k1000.dtsi
index 8dff2aa52417..1da3beb00f0e 100644
--- a/arch/loongarch/boot/dts/loongson-2k1000.dtsi
+++ b/arch/loongarch/boot/dts/loongson-2k1000.dtsi
@@ -68,7 +68,7 @@
 	};
 
 	thermal-zones {
-		cpu-thermal {
+		cpu_thermal: cpu-thermal {
 			polling-delay-passive = <1000>;
 			polling-delay = <5000>;
 			thermal-sensors = <&tsensor 0>;
@@ -322,6 +322,46 @@
 			status = "disabled";
 		};
 
+		pwm@1fe22000 {
+			compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1fe22000 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <24 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm1: pwm@1fe22010 {
+			compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1fe22010 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <25 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1fe22020 {
+			compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1fe22020 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <26 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@1fe22030 {
+			compatible = "loongson,ls2k1000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x1fe22030 0x0 0x10>;
+			interrupt-parent = <&liointc0>;
+			interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_APB_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
 		pmc: power-management@1fe27000 {
 			compatible = "loongson,ls2k1000-pmc", "loongson,ls2k0500-pmc", "syscon";
 			reg = <0x0 0x1fe27000 0x0 0x58>;
-- 
cgit v1.2.3


From 1cf806006574b0e874fb39a18b1b26559770fb1c Mon Sep 17 00:00:00 2001
From: Binbin Zhou <zhoubinbin@loongson.cn>
Date: Thu, 5 Jun 2025 20:34:34 +0800
Subject: LoongArch: dts: Add PWM support to Loongson-2K2000

The module is supported, enable it.

Reviewed-by: Yanteng Si <si.yanteng@linux.dev>
Signed-off-by: Binbin Zhou <zhoubinbin@loongson.cn>
Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
---
 arch/loongarch/boot/dts/loongson-2k2000.dtsi | 60 ++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

(limited to 'arch')

diff --git a/arch/loongarch/boot/dts/loongson-2k2000.dtsi b/arch/loongarch/boot/dts/loongson-2k2000.dtsi
index b4ff55a33e90..9e0411f2754c 100644
--- a/arch/loongarch/boot/dts/loongson-2k2000.dtsi
+++ b/arch/loongarch/boot/dts/loongson-2k2000.dtsi
@@ -165,6 +165,66 @@
 			interrupt-parent = <&eiointc>;
 		};
 
+		pwm@100a0000 {
+			compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x100a0000 0x0 0x10>;
+			interrupt-parent = <&pic>;
+			interrupts = <24 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_MISC_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@100a0100 {
+			compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x100a0100 0x0 0x10>;
+			interrupt-parent = <&pic>;
+			interrupts = <25 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_MISC_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@100a0200 {
+			compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x100a0200 0x0 0x10>;
+			interrupt-parent = <&pic>;
+			interrupts = <26 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_MISC_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@100a0300 {
+			compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x100a0300 0x0 0x10>;
+			interrupt-parent = <&pic>;
+			interrupts = <27 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_MISC_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@100a0400 {
+			compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x100a0400 0x0 0x10>;
+			interrupt-parent = <&pic>;
+			interrupts = <38 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_MISC_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
+		pwm@100a0500 {
+			compatible = "loongson,ls2k2000-pwm", "loongson,ls7a-pwm";
+			reg = <0x0 0x100a0500 0x0 0x10>;
+			interrupt-parent = <&pic>;
+			interrupts = <39 IRQ_TYPE_LEVEL_HIGH>;
+			clocks = <&clk LOONGSON2_MISC_CLK>;
+			#pwm-cells = <3>;
+			status = "disabled";
+		};
+
 		rtc0: rtc@100d0100 {
 			compatible = "loongson,ls2k2000-rtc", "loongson,ls7a-rtc";
 			reg = <0x0 0x100d0100 0x0 0x100>;
-- 
cgit v1.2.3


From e21efe833eae4e2a56c2c2a11caae870a65926fa Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 3 Jun 2025 03:12:54 +0900
Subject: arch: use always-$(KBUILD_BUILTIN) for vmlinux.lds

The extra-y syntax is deprecated. Instead, use always-$(KBUILD_BUILTIN),
which behaves equivalently.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Johannes Berg <johannes@sipsolutions.net>
Reviewed-by: Nicolas Schier <n.schier@avm.de>
---
 arch/alpha/kernel/Makefile      | 2 +-
 arch/arc/kernel/Makefile        | 2 +-
 arch/arm/kernel/Makefile        | 2 +-
 arch/arm64/kernel/Makefile      | 2 +-
 arch/csky/kernel/Makefile       | 2 +-
 arch/hexagon/kernel/Makefile    | 2 +-
 arch/loongarch/kernel/Makefile  | 2 +-
 arch/m68k/kernel/Makefile       | 2 +-
 arch/microblaze/kernel/Makefile | 2 +-
 arch/mips/kernel/Makefile       | 2 +-
 arch/nios2/kernel/Makefile      | 2 +-
 arch/openrisc/kernel/Makefile   | 2 +-
 arch/parisc/kernel/Makefile     | 2 +-
 arch/powerpc/kernel/Makefile    | 2 +-
 arch/riscv/kernel/Makefile      | 2 +-
 arch/s390/kernel/Makefile       | 2 +-
 arch/sh/kernel/Makefile         | 2 +-
 arch/sparc/kernel/Makefile      | 2 +-
 arch/um/kernel/Makefile         | 2 +-
 arch/x86/kernel/Makefile        | 2 +-
 arch/xtensa/kernel/Makefile     | 2 +-
 21 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'arch')

diff --git a/arch/alpha/kernel/Makefile b/arch/alpha/kernel/Makefile
index b6c862dff1f6..187cd8df2faf 100644
--- a/arch/alpha/kernel/Makefile
+++ b/arch/alpha/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y		:= vmlinux.lds
+always-$(KBUILD_BUILTIN)	:= vmlinux.lds
 asflags-y	:= $(KBUILD_CFLAGS)
 ccflags-y	:= -Wno-sign-compare
 
diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile
index 95fbf9364c67..fa94fff02419 100644
--- a/arch/arc/kernel/Makefile
+++ b/arch/arc/kernel/Makefile
@@ -26,4 +26,4 @@ ifdef CONFIG_ISA_ARCOMPACT
 CFLAGS_fpu.o   += -mdpfp
 endif
 
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index b3333d070390..afc9de7ef9a1 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -104,4 +104,4 @@ obj-$(CONFIG_HAVE_ARM_SMCCC)	+= smccc-call.o
 
 obj-$(CONFIG_GENERIC_CPU_VULNERABILITIES) += spectre.o
 
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 71c29a2a2f19..2920b0a51403 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -78,7 +78,7 @@ $(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so
 
 obj-y					+= probes/
 obj-y					+= head.o
-extra-y					+= vmlinux.lds
+always-$(KBUILD_BUILTIN)		+= vmlinux.lds
 
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
diff --git a/arch/csky/kernel/Makefile b/arch/csky/kernel/Makefile
index de1c3472e8f0..a406a4ac2378 100644
--- a/arch/csky/kernel/Makefile
+++ b/arch/csky/kernel/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
 
 obj-y += head.o entry.o atomic.o signal.o traps.o irq.o time.o vdso.o vdso/
 obj-y += power.o syscall.o syscall_table.o setup.o
diff --git a/arch/hexagon/kernel/Makefile b/arch/hexagon/kernel/Makefile
index 3fdf937eb572..8e0fb4a62315 100644
--- a/arch/hexagon/kernel/Makefile
+++ b/arch/hexagon/kernel/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
 
 obj-y += head.o
 obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index f9dcaa60033d..6f5a4574a911 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -5,7 +5,7 @@
 
 OBJECT_FILES_NON_STANDARD_head.o := y
 
-extra-y		:= vmlinux.lds
+always-$(KBUILD_BUILTIN)	:= vmlinux.lds
 
 obj-y		+= head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \
 		   traps.o irq.o idle.o process.o dma.o mem.o reset.o switch.o \
diff --git a/arch/m68k/kernel/Makefile b/arch/m68k/kernel/Makefile
index 6c732ed3998b..57c1b3e8d60e 100644
--- a/arch/m68k/kernel/Makefile
+++ b/arch/m68k/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y			+= vmlinux.lds
+always-$(KBUILD_BUILTIN)	+= vmlinux.lds
 
 obj-$(CONFIG_MMU_MOTOROLA)	:= head.o
 obj-$(CONFIG_SUN3)		:= sun3-head.o
diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile
index 85c4d29ef43e..241e466e7333 100644
--- a/arch/microblaze/kernel/Makefile
+++ b/arch/microblaze/kernel/Makefile
@@ -11,7 +11,7 @@ CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_process.o = -pg
 endif
 
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
 
 obj-y += head.o dma.o exceptions.o \
 	hw_exception_handler.o irq.o \
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index ecf3278a32f7..95a1e674fd67 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the Linux/MIPS kernel.
 #
 
-extra-y		:= vmlinux.lds
+always-$(KBUILD_BUILTIN)	:= vmlinux.lds
 
 obj-y		+= head.o branch.o cmpxchg.o elf.o entry.o genex.o idle.o irq.o \
 		   process.o prom.o ptrace.o reset.o setup.o signal.o \
diff --git a/arch/nios2/kernel/Makefile b/arch/nios2/kernel/Makefile
index 78a913181fa1..4dce965a7b73 100644
--- a/arch/nios2/kernel/Makefile
+++ b/arch/nios2/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the nios2 linux kernel.
 #
 
-extra-y	+= vmlinux.lds
+always-$(KBUILD_BUILTIN)	+= vmlinux.lds
 
 obj-y	+= head.o
 obj-y	+= cpuinfo.o
diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile
index e4c7d9bdd598..58e6a1b525b7 100644
--- a/arch/openrisc/kernel/Makefile
+++ b/arch/openrisc/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y	:= vmlinux.lds
+always-$(KBUILD_BUILTIN)	:= vmlinux.lds
 
 obj-y	:= head.o setup.o or32_ksyms.o process.o dma.o \
 	   traps.o time.o irq.o entry.o ptrace.o signal.o \
diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
index 5ab0467be70a..d5055ba33722 100644
--- a/arch/parisc/kernel/Makefile
+++ b/arch/parisc/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for arch/parisc/kernel
 #
 
-extra-y		:= vmlinux.lds
+always-$(KBUILD_BUILTIN)		:= vmlinux.lds
 
 obj-y		:= head.o cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \
 		   syscall.o entry.o sys_parisc.o firmware.o \
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 6ac621155ec3..4d2daa8e7bca 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -126,7 +126,7 @@ obj-$(CONFIG_PPC_BOOK3S_32)	+= head_book3s_32.o
 obj-$(CONFIG_44x)		+= head_44x.o
 obj-$(CONFIG_PPC_8xx)		+= head_8xx.o
 obj-$(CONFIG_PPC_85xx)		+= head_85xx.o
-extra-y				+= vmlinux.lds
+always-$(KBUILD_BUILTIN)	+= vmlinux.lds
 
 obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
 
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index f7480c9c6f8d..48dcaf2efae8 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -43,7 +43,7 @@ CFLAGS_sbi_ecall.o += -D__NO_FORTIFY
 endif
 endif
 
-extra-y += vmlinux.lds
+always-$(KBUILD_BUILTIN) += vmlinux.lds
 
 obj-y	+= head.o
 obj-y	+= soc.o
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index db5f3a3faefb..ea5ed6654050 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -46,7 +46,7 @@ obj-y	+= nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
 obj-y	+= smp.o text_amode31.o stacktrace.o abs_lowcore.o facility.o uv.o wti.o
 obj-y	+= diag/
 
-extra-y				+= vmlinux.lds
+always-$(KBUILD_BUILTIN)	+= vmlinux.lds
 
 obj-$(CONFIG_SYSFS)		+= nospec-sysfs.o
 CFLAGS_REMOVE_nospec-branch.o	+= $(CC_FLAGS_EXPOLINE)
diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile
index 7b453592adaf..5ef123bc63f8 100644
--- a/arch/sh/kernel/Makefile
+++ b/arch/sh/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the Linux/SuperH kernel.
 #
 
-extra-y	:= vmlinux.lds
+always-$(KBUILD_BUILTIN)	:= vmlinux.lds
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 58ea4ef9b622..2859842d6bb7 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -9,7 +9,7 @@ asflags-y := -ansi
 # Undefine sparc when processing vmlinux.lds - it is used
 # And teach CPP we are doing $(BITS) builds (for this case)
 CPPFLAGS_vmlinux.lds := -Usparc -m$(BITS)
-extra-y              += vmlinux.lds
+always-$(KBUILD_BUILTIN) += vmlinux.lds
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 4df1cd0d2017..821bf4027a23 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -12,7 +12,7 @@ CPPFLAGS_vmlinux.lds := -DSTART=$(LDS_START)		\
                         -DELF_ARCH=$(LDS_ELF_ARCH)	\
                         -DELF_FORMAT=$(LDS_ELF_FORMAT)	\
 			$(LDS_EXTRA)
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
 
 obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \
 	physmem.o process.o ptrace.o reboot.o sigio.o \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 84cfa179802c..9a30c9816b16 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the linux kernel.
 #
 
-extra-y	+= vmlinux.lds
+always-$(KBUILD_BUILTIN)	+= vmlinux.lds
 
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile
index f28b8e3d717e..d3ef0407401f 100644
--- a/arch/xtensa/kernel/Makefile
+++ b/arch/xtensa/kernel/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the Linux/Xtensa kernel.
 #
 
-extra-y := vmlinux.lds
+always-$(KBUILD_BUILTIN) := vmlinux.lds
 
 obj-y := head.o align.o coprocessor.o entry.o irq.o platform.o process.o \
 	 ptrace.o setup.o signal.o stacktrace.o syscall.o time.o traps.o \
-- 
cgit v1.2.3


From 4c529a4a7260776bb4abe264498857b4537aa70d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 7 Jun 2025 14:22:56 +0200
Subject: x86/smp: PM/hibernate: Split arch_resume_nosmt()

Move the inner part of the arch_resume_nosmt() code into a separate
function called arch_cpu_rescan_dead_smt_siblings(), so it can be
used in other places where "dead" SMT siblings may need to be taken
online and offline again in order to get into deep idle states.

No intentional functional impact.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Link: https://patch.msgid.link/3361688.44csPzL39Z@rjwysocki.net
[ rjw: Prevent build issues with CONFIG_SMP unset ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/kernel/smp.c      | 24 ++++++++++++++++++++++++
 arch/x86/power/hibernate.c | 19 ++++++-------------
 2 files changed, 30 insertions(+), 13 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18266cc3d98c..b014e6d229f9 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -299,3 +299,27 @@ struct smp_ops smp_ops = {
 	.send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
 EXPORT_SYMBOL_GPL(smp_ops);
+
+int arch_cpu_rescan_dead_smt_siblings(void)
+{
+	enum cpuhp_smt_control old = cpu_smt_control;
+	int ret;
+
+	/*
+	 * If SMT has been disabled and SMT siblings are in HLT, bring them back
+	 * online and offline them again so that they end up in MWAIT proper.
+	 *
+	 * Called with hotplug enabled.
+	 */
+	if (old != CPU_SMT_DISABLED && old != CPU_SMT_FORCE_DISABLED)
+		return 0;
+
+	ret = cpuhp_smt_enable();
+	if (ret)
+		return ret;
+
+	ret = cpuhp_smt_disable(old);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(arch_cpu_rescan_dead_smt_siblings);
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index a7c23f2a58c9..a2294c1649f6 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -192,7 +192,8 @@ out:
 
 int arch_resume_nosmt(void)
 {
-	int ret = 0;
+	int ret;
+
 	/*
 	 * We reached this while coming out of hibernation. This means
 	 * that SMT siblings are sleeping in hlt, as mwait is not safe
@@ -206,18 +207,10 @@ int arch_resume_nosmt(void)
 	 * Called with hotplug disabled.
 	 */
 	cpu_hotplug_enable();
-	if (cpu_smt_control == CPU_SMT_DISABLED ||
-			cpu_smt_control == CPU_SMT_FORCE_DISABLED) {
-		enum cpuhp_smt_control old = cpu_smt_control;
-
-		ret = cpuhp_smt_enable();
-		if (ret)
-			goto out;
-		ret = cpuhp_smt_disable(old);
-		if (ret)
-			goto out;
-	}
-out:
+
+	ret = arch_cpu_rescan_dead_smt_siblings();
+
 	cpu_hotplug_disable();
+
 	return ret;
 }
-- 
cgit v1.2.3


From a18d098f2aab82abde1d59c56aef9beca01c892b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 5 Jun 2025 17:09:35 +0200
Subject: Reapply "x86/smp: Eliminate mwait_play_dead_cpuid_hint()"

Revert commit 70523f335734 ("Revert "x86/smp: Eliminate
mwait_play_dead_cpuid_hint()"") to reapply the changes from commit
96040f7273e2 ("x86/smp: Eliminate mwait_play_dead_cpuid_hint()")
reverted by it.

Previously, these changes caused idle power to rise on systems booting
with "nosmt" in the kernel command line because they effectively caused
"dead" SMT siblings to remain in idle state C1 after executing the HLT
instruction, which prevented the processor from reaching package idle
states deeper than PC2 going forward.

Now, the "dead" SMT siblings are rescanned after initializing a proper
cpuidle driver for the processor (either intel_idle or ACPI idle), at
which point they are able to enter a sufficiently deep idle state
in native_play_dead() via cpuidle, so the code changes in question can
be reapplied.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Artem Bityutskiy <artem.bityutskiy@linux.intel.com>
Link: https://patch.msgid.link/7813065.EvYhyI6sBW@rjwysocki.net
---
 arch/x86/kernel/smpboot.c | 54 ++++++-----------------------------------------
 1 file changed, 7 insertions(+), 47 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fc78c2325fd2..58ede3fa6a75 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1244,6 +1244,10 @@ void play_dead_common(void)
 	local_irq_disable();
 }
 
+/*
+ * We need to flush the caches before going to sleep, lest we have
+ * dirty data in our caches when we come back up.
+ */
 void __noreturn mwait_play_dead(unsigned int eax_hint)
 {
 	struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
@@ -1289,50 +1293,6 @@ void __noreturn mwait_play_dead(unsigned int eax_hint)
 	}
 }
 
-/*
- * We need to flush the caches before going to sleep, lest we have
- * dirty data in our caches when we come back up.
- */
-static inline void mwait_play_dead_cpuid_hint(void)
-{
-	unsigned int eax, ebx, ecx, edx;
-	unsigned int highest_cstate = 0;
-	unsigned int highest_subcstate = 0;
-	int i;
-
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
-	    boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)
-		return;
-	if (!this_cpu_has(X86_FEATURE_MWAIT))
-		return;
-	if (!this_cpu_has(X86_FEATURE_CLFLUSH))
-		return;
-
-	eax = CPUID_LEAF_MWAIT;
-	ecx = 0;
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	/*
-	 * eax will be 0 if EDX enumeration is not valid.
-	 * Initialized below to cstate, sub_cstate value when EDX is valid.
-	 */
-	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
-		eax = 0;
-	} else {
-		edx >>= MWAIT_SUBSTATE_SIZE;
-		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
-			if (edx & MWAIT_SUBSTATE_MASK) {
-				highest_cstate = i;
-				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
-			}
-		}
-		eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
-			(highest_subcstate - 1);
-	}
-
-	mwait_play_dead(eax);
-}
-
 /*
  * Kick all "offline" CPUs out of mwait on kexec(). See comment in
  * mwait_play_dead().
@@ -1383,9 +1343,9 @@ void native_play_dead(void)
 	play_dead_common();
 	tboot_shutdown(TB_SHUTDOWN_WFS);
 
-	mwait_play_dead_cpuid_hint();
-	if (cpuidle_play_dead())
-		hlt_play_dead();
+	/* Below returns only on error. */
+	cpuidle_play_dead();
+	hlt_play_dead();
 }
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
-- 
cgit v1.2.3


From 9cc646950eefda5605111cbc387b00b1f741c239 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Fri, 14 Mar 2025 08:10:03 +0100
Subject: sh: Replace __ASSEMBLY__ with __ASSEMBLER__ in all headers

While the GCC and Clang compilers already define __ASSEMBLER__
automatically when compiling assembly code, __ASSEMBLY__ is a
macro that only gets defined by the Makefiles in the kernel.
This can be very confusing when switching between userspace
and kernelspace coding, or when dealing with uapi headers that
rather should use __ASSEMBLER__ instead. So let's standardize on
the __ASSEMBLER__ macro that is provided by the compilers now.

This is a completely mechanical patch (done with a simple "sed -i"
statement).

Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Rich Felker <dalias@libc.org>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: linux-sh@vger.kernel.org
Signed-off-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Signed-off-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
---
 arch/sh/include/asm/cache.h                   |  4 ++--
 arch/sh/include/asm/dwarf.h                   |  6 +++---
 arch/sh/include/asm/fpu.h                     |  4 ++--
 arch/sh/include/asm/ftrace.h                  |  8 ++++----
 arch/sh/include/asm/mmu.h                     |  4 ++--
 arch/sh/include/asm/page.h                    |  8 ++++----
 arch/sh/include/asm/pgtable.h                 |  4 ++--
 arch/sh/include/asm/pgtable_32.h              |  8 ++++----
 arch/sh/include/asm/processor.h               |  4 ++--
 arch/sh/include/asm/smc37c93x.h               |  4 ++--
 arch/sh/include/asm/suspend.h                 |  2 +-
 arch/sh/include/asm/thread_info.h             | 10 +++++-----
 arch/sh/include/asm/tlb.h                     |  4 ++--
 arch/sh/include/asm/types.h                   |  4 ++--
 arch/sh/include/mach-common/mach/romimage.h   |  6 +++---
 arch/sh/include/mach-ecovec24/mach/romimage.h |  6 +++---
 arch/sh/include/mach-kfr2r09/mach/romimage.h  |  6 +++---
 17 files changed, 46 insertions(+), 46 deletions(-)

(limited to 'arch')

diff --git a/arch/sh/include/asm/cache.h b/arch/sh/include/asm/cache.h
index b38dbc975581..e7ac9c950275 100644
--- a/arch/sh/include/asm/cache.h
+++ b/arch/sh/include/asm/cache.h
@@ -22,7 +22,7 @@
 
 #define __read_mostly __section(".data..read_mostly")
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 struct cache_info {
 	unsigned int ways;		/* Number of cache ways */
 	unsigned int sets;		/* Number of cache sets */
@@ -48,5 +48,5 @@ struct cache_info {
 
 	unsigned long flags;
 };
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 #endif /* __ASM_SH_CACHE_H */
diff --git a/arch/sh/include/asm/dwarf.h b/arch/sh/include/asm/dwarf.h
index 571954474122..f46d18b84833 100644
--- a/arch/sh/include/asm/dwarf.h
+++ b/arch/sh/include/asm/dwarf.h
@@ -189,7 +189,7 @@
  */
 #define DWARF_ARCH_RA_REG	17
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/compiler.h>
 #include <linux/bug.h>
@@ -379,7 +379,7 @@ extern int module_dwarf_finalize(const Elf_Ehdr *, const Elf_Shdr *,
 				 struct module *);
 extern void module_dwarf_cleanup(struct module *);
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #define CFI_STARTPROC	.cfi_startproc
 #define CFI_ENDPROC	.cfi_endproc
@@ -402,7 +402,7 @@ extern void module_dwarf_cleanup(struct module *);
 #define CFI_REL_OFFSET	CFI_IGNORE
 #define CFI_UNDEFINED	CFI_IGNORE
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 static inline void dwarf_unwinder_init(void)
 {
 }
diff --git a/arch/sh/include/asm/fpu.h b/arch/sh/include/asm/fpu.h
index 0379f4cce5ed..a086e38b70ee 100644
--- a/arch/sh/include/asm/fpu.h
+++ b/arch/sh/include/asm/fpu.h
@@ -2,7 +2,7 @@
 #ifndef __ASM_SH_FPU_H
 #define __ASM_SH_FPU_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <asm/ptrace.h>
 
@@ -67,6 +67,6 @@ static inline void clear_fpu(struct task_struct *tsk, struct pt_regs *regs)
 void float_raise(unsigned int flags);
 int float_rounding_mode(void);
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #endif /* __ASM_SH_FPU_H */
diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h
index 1c10e1066390..d35781ab716e 100644
--- a/arch/sh/include/asm/ftrace.h
+++ b/arch/sh/include/asm/ftrace.h
@@ -7,7 +7,7 @@
 #define MCOUNT_INSN_SIZE	4 /* sizeof mcount call */
 #define FTRACE_SYSCALL_MAX	NR_syscalls
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 extern void mcount(void);
 
 #define MCOUNT_ADDR		((unsigned long)(mcount))
@@ -35,10 +35,10 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
 
 void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr);
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 /* arch/sh/kernel/return_address.c */
 extern void *return_address(unsigned int);
@@ -53,6 +53,6 @@ static inline void arch_ftrace_nmi_enter(void) { }
 static inline void arch_ftrace_nmi_exit(void) { }
 #endif
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #endif /* __ASM_SH_FTRACE_H */
diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h
index 172e329fd92d..b9c9f91e6616 100644
--- a/arch/sh/include/asm/mmu.h
+++ b/arch/sh/include/asm/mmu.h
@@ -33,7 +33,7 @@
 
 #define PMB_NO_ENTRY		(-1)
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #include <linux/errno.h>
 #include <linux/threads.h>
 #include <asm/page.h>
@@ -102,6 +102,6 @@ pmb_remap(phys_addr_t phys, unsigned long size, pgprot_t prot)
 	return pmb_remap_caller(phys, size, prot, __builtin_return_address(0));
 }
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #endif /* __MMU_H */
diff --git a/arch/sh/include/asm/page.h b/arch/sh/include/asm/page.h
index 3990cbd9aa04..def4205491ec 100644
--- a/arch/sh/include/asm/page.h
+++ b/arch/sh/include/asm/page.h
@@ -30,7 +30,7 @@
 #define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT-PAGE_SHIFT)
 #endif
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #include <asm/uncached.h>
 
 extern unsigned long shm_align_mask;
@@ -85,7 +85,7 @@ typedef struct page *pgtable_t;
 
 #define pte_pgprot(x) __pgprot(pte_val(x) & PTE_FLAGS_MASK)
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 /*
  * __MEMORY_START and SIZE are the physical addresses and size of RAM.
@@ -126,10 +126,10 @@ typedef struct page *pgtable_t;
 #define ___va(x)	((x)+PAGE_OFFSET)
 #endif
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #define __pa(x)		___pa((unsigned long)x)
 #define __va(x)		(void *)___va((unsigned long)x)
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #ifdef CONFIG_UNCACHED_MAPPING
 #if defined(CONFIG_29BIT)
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 729f5c6225fb..10fa8f2bb8d1 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -17,7 +17,7 @@
 #include <asm/page.h>
 #include <asm/mmu.h>
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #include <asm/addrspace.h>
 #include <asm/fixmap.h>
 
@@ -28,7 +28,7 @@
 extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 /*
  * Effective and physical address definitions, to aid with sign
diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h
index f939f1215232..bb9f9a2fc85c 100644
--- a/arch/sh/include/asm/pgtable_32.h
+++ b/arch/sh/include/asm/pgtable_32.h
@@ -170,7 +170,7 @@ static inline unsigned long copy_ptea_attributes(unsigned long x)
 	(PTE_MASK | _PAGE_ACCESSED | _PAGE_CACHABLE | \
 	 _PAGE_DIRTY | _PAGE_SPECIAL)
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #if defined(CONFIG_X2TLB) /* SH-X2 TLB */
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_CACHABLE | \
@@ -287,9 +287,9 @@ static inline unsigned long copy_ptea_attributes(unsigned long x)
 				__pgprot(0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 /*
  * Certain architectures need to do special things when PTEs
@@ -486,5 +486,5 @@ static inline int pte_swp_exclusive(pte_t pte)
 PTE_BIT_FUNC(low, swp_mkexclusive, |= _PAGE_SWP_EXCLUSIVE);
 PTE_BIT_FUNC(low, swp_clear_exclusive, &= ~_PAGE_SWP_EXCLUSIVE);
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 #endif /* __ASM_SH_PGTABLE_32_H */
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index 73fba7c922f9..2a0b5713ab80 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -5,7 +5,7 @@
 #include <asm/cpu-features.h>
 #include <asm/cache.h>
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 /*
  *  CPU type and hardware bug flags. Kept separately for each CPU.
  *
@@ -168,7 +168,7 @@ extern unsigned int instruction_size(unsigned int insn);
 
 void select_idle_routine(void);
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #include <asm/processor_32.h>
 
diff --git a/arch/sh/include/asm/smc37c93x.h b/arch/sh/include/asm/smc37c93x.h
index 891f2f8f2fd0..caf4cd8dd241 100644
--- a/arch/sh/include/asm/smc37c93x.h
+++ b/arch/sh/include/asm/smc37c93x.h
@@ -67,7 +67,7 @@
 #define UART_DLL	0x0	/* Divisor Latch (LS) */
 #define UART_DLM	0x2	/* Divisor Latch (MS) */
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 typedef struct uart_reg {
 	volatile __u16 rbr;
 	volatile __u16 ier;
@@ -78,7 +78,7 @@ typedef struct uart_reg {
 	volatile __u16 msr;
 	volatile __u16 scr;
 } uart_reg;
-#endif /* ! __ASSEMBLY__ */
+#endif /* ! __ASSEMBLER__ */
 
 /* Alias for Write Only Register */
 
diff --git a/arch/sh/include/asm/suspend.h b/arch/sh/include/asm/suspend.h
index 47db17520261..0f991babc559 100644
--- a/arch/sh/include/asm/suspend.h
+++ b/arch/sh/include/asm/suspend.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_SH_SUSPEND_H
 #define _ASM_SH_SUSPEND_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #include <linux/notifier.h>
 
 #include <asm/ptrace.h>
diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h
index 9f19a682d315..471db5173036 100644
--- a/arch/sh/include/asm/thread_info.h
+++ b/arch/sh/include/asm/thread_info.h
@@ -21,7 +21,7 @@
 #define FAULT_CODE_PROT		(1 << 3)	/* protection fault */
 #define FAULT_CODE_USER		(1 << 4)	/* user-mode access */
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #include <asm/processor.h>
 
 struct thread_info {
@@ -49,7 +49,7 @@ struct thread_info {
 /*
  * macros/functions for gaining access to the thread information structure
  */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.task		= &tsk,			\
@@ -86,7 +86,7 @@ static inline struct thread_info *current_thread_info(void)
 
 extern void init_thread_xstate(void);
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 /*
  * Thread information flags
@@ -144,7 +144,7 @@ extern void init_thread_xstate(void);
  */
 #define TS_USEDFPU		0x0002	/* FPU used by this task this quantum */
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #define TI_FLAG_FAULT_CODE_SHIFT	24
 
@@ -164,5 +164,5 @@ static inline unsigned int get_thread_fault_code(void)
 	return ti->flags >> TI_FLAG_FAULT_CODE_SHIFT;
 }
 
-#endif	/* !__ASSEMBLY__ */
+#endif	/* !__ASSEMBLER__ */
 #endif /* __ASM_SH_THREAD_INFO_H */
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index ddf324bfb9a0..39df40d0ebc2 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -2,7 +2,7 @@
 #ifndef __ASM_SH_TLB_H
 #define __ASM_SH_TLB_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 #include <linux/pagemap.h>
 #include <asm-generic/tlb.h>
 
@@ -29,5 +29,5 @@ asmlinkage int handle_tlbmiss(struct pt_regs *regs, unsigned long error_code,
 			      unsigned long address);
 
 #endif /* CONFIG_MMU */
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 #endif /* __ASM_SH_TLB_H */
diff --git a/arch/sh/include/asm/types.h b/arch/sh/include/asm/types.h
index 9b3fc923ee28..fec3e89df0b1 100644
--- a/arch/sh/include/asm/types.h
+++ b/arch/sh/include/asm/types.h
@@ -7,10 +7,10 @@
 /*
  * These aren't exported outside the kernel to avoid name space clashes
  */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 typedef u16 insn_size_t;
 typedef u32 reg_size_t;
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 #endif /* __ASM_SH_TYPES_H */
diff --git a/arch/sh/include/mach-common/mach/romimage.h b/arch/sh/include/mach-common/mach/romimage.h
index 1915714263aa..22fb47ec9b15 100644
--- a/arch/sh/include/mach-common/mach/romimage.h
+++ b/arch/sh/include/mach-common/mach/romimage.h
@@ -1,12 +1,12 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 /* do nothing here by default */
 
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
 
 static inline void mmcif_update_progress(int nr)
 {
 }
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/sh/include/mach-ecovec24/mach/romimage.h b/arch/sh/include/mach-ecovec24/mach/romimage.h
index 2da6ff326cbd..f93d494736c3 100644
--- a/arch/sh/include/mach-ecovec24/mach/romimage.h
+++ b/arch/sh/include/mach-ecovec24/mach/romimage.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 /* EcoVec board specific boot code:
  * converts the "partner-jet-script.txt" script into assembly
@@ -22,7 +22,7 @@
 1 :	.long 0xa8000000
 2 :
 
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
 
 /* Ecovec board specific information:
  *
@@ -45,4 +45,4 @@ static inline void mmcif_update_progress(int nr)
 	__raw_writeb(1 << (nr - 1), PGDR);
 }
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
diff --git a/arch/sh/include/mach-kfr2r09/mach/romimage.h b/arch/sh/include/mach-kfr2r09/mach/romimage.h
index 209275872ff0..f68bb480d378 100644
--- a/arch/sh/include/mach-kfr2r09/mach/romimage.h
+++ b/arch/sh/include/mach-kfr2r09/mach/romimage.h
@@ -1,5 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 /* kfr2r09 board specific boot code:
  * converts the "partner-jet-script.txt" script into assembly
@@ -22,10 +22,10 @@
 1:	.long 0xa8000000
 2:
 
-#else /* __ASSEMBLY__ */
+#else /* __ASSEMBLER__ */
 
 static inline void mmcif_update_progress(int nr)
 {
 }
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
-- 
cgit v1.2.3


From ab0a168fcd984c2fc93eb0fb2367881c6aa62eb3 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Fri, 2 May 2025 13:13:36 +0200
Subject: sh: ecovec24: Make SPI mode explicit

Commit cf9e4784f3bde3e4 ("spi: sh-msiof: Add slave mode support") added
a new mode member to the sh_msiof_spi_info structure, but did not update
any board files.  Hence all users in board files rely on the default
being host mode.

Make this unambiguous by configuring host mode explicitly.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Signed-off-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
---
 arch/sh/boards/mach-ecovec24/setup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch')

diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c
index 6f13557eecd6..a641e26f8fdf 100644
--- a/arch/sh/boards/mach-ecovec24/setup.c
+++ b/arch/sh/boards/mach-ecovec24/setup.c
@@ -825,6 +825,7 @@ static struct spi_board_info spi_bus[] = {
 /* MSIOF0 */
 static struct sh_msiof_spi_info msiof0_data = {
 	.num_chipselect = 1,
+	.mode = MSIOF_SPI_HOST,
 };
 
 static struct resource msiof0_resources[] = {
-- 
cgit v1.2.3


From 8a3682601ddaa4ef0c400f627a7f4b9388bbccef Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@kernel.org>
Date: Sat, 17 May 2025 12:30:48 +0300
Subject: sh: kprobes: Remove unused variables in kprobe_exceptions_notify()

kbuild reports the following warning:

   arch/sh/kernel/kprobes.c: In function 'kprobe_exceptions_notify':
>> arch/sh/kernel/kprobes.c:412:24: warning: variable 'p' set but not used [-Wunused-but-set-variable]
     412 |         struct kprobe *p = NULL;
         |                        ^

The variable 'p' is indeed unused since the commit fa5a24b16f94
("sh/kprobes: Don't call the ->break_handler() in SH kprobes code")

Remove that variable along with 'kprobe_opcode_t *addr' which also
becomes unused after 'p' is removed.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202505151341.EuRFR22l-lkp@intel.com/
Fixes: fa5a24b16f94 ("sh/kprobes: Don't call the ->break_handler() in SH kprobes code")
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Reviewed-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
---
 arch/sh/kernel/kprobes.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'arch')

diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c
index 49c4ffd782d6..a250fb1b9420 100644
--- a/arch/sh/kernel/kprobes.c
+++ b/arch/sh/kernel/kprobes.c
@@ -404,13 +404,10 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 				       unsigned long val, void *data)
 {
-	struct kprobe *p = NULL;
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
-	kprobe_opcode_t *addr = NULL;
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	addr = (kprobe_opcode_t *) (args->regs->pc);
 	if (val == DIE_TRAP &&
 	    args->trapnr == (BREAKPOINT_INSTRUCTION & 0xff)) {
 		if (!kprobe_running()) {
@@ -421,7 +418,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 				ret = NOTIFY_DONE;
 			}
 		} else {
-			p = get_kprobe(addr);
 			if ((kcb->kprobe_status == KPROBE_HIT_SS) ||
 			    (kcb->kprobe_status == KPROBE_REENTER)) {
 				if (post_kprobe_handler(args->regs))
-- 
cgit v1.2.3


From 41cb08555c4164996d67c78b3bf1c658075b75f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 9 May 2025 07:51:14 +0200
Subject: treewide, timers: Rename from_timer() to timer_container_of()

Move this API to the canonical timer_*() namespace.

[ tglx: Redone against pre rc1 ]

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lore.kernel.org/all/aB2X0jCKQO56WdMt@gmail.com
---
 arch/alpha/kernel/srmcons.c               | 3 ++-
 arch/powerpc/kvm/booke.c                  | 2 +-
 arch/powerpc/platforms/powermac/low_i2c.c | 3 ++-
 arch/sh/drivers/heartbeat.c               | 2 +-
 arch/sh/drivers/pci/common.c              | 4 ++--
 arch/sh/drivers/push-switch.c             | 2 +-
 arch/sparc/kernel/viohs.c                 | 2 +-
 arch/um/drivers/vector_kern.c             | 2 +-
 arch/x86/kvm/xen.c                        | 3 ++-
 arch/xtensa/platforms/iss/network.c       | 2 +-
 10 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'arch')

diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c
index a89ce84371f9..d19e51ec711d 100644
--- a/arch/alpha/kernel/srmcons.c
+++ b/arch/alpha/kernel/srmcons.c
@@ -69,7 +69,8 @@ srmcons_do_receive_chars(struct tty_port *port)
 static void
 srmcons_receive_chars(struct timer_list *t)
 {
-	struct srmcons_private *srmconsp = from_timer(srmconsp, t, timer);
+	struct srmcons_private *srmconsp = timer_container_of(srmconsp, t,
+							      timer);
 	struct tty_port *port = &srmconsp->port;
 	unsigned long flags;
 	int incr = 10;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 791d1942a058..3401b96be475 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -628,7 +628,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu)
 
 static void kvmppc_watchdog_func(struct timer_list *t)
 {
-	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.wdt_timer);
+	struct kvm_vcpu *vcpu = timer_container_of(vcpu, t, arch.wdt_timer);
 	u32 tsr, new_tsr;
 	int final;
 
diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index a0ae58636e10..02474e27df9b 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -359,7 +359,8 @@ static irqreturn_t kw_i2c_irq(int irq, void *dev_id)
 
 static void kw_i2c_timeout(struct timer_list *t)
 {
-	struct pmac_i2c_host_kw *host = from_timer(host, t, timeout_timer);
+	struct pmac_i2c_host_kw *host = timer_container_of(host, t,
+							   timeout_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&host->lock, flags);
diff --git a/arch/sh/drivers/heartbeat.c b/arch/sh/drivers/heartbeat.c
index 24391b444b28..42103038a7d0 100644
--- a/arch/sh/drivers/heartbeat.c
+++ b/arch/sh/drivers/heartbeat.c
@@ -58,7 +58,7 @@ static inline void heartbeat_toggle_bit(struct heartbeat_data *hd,
 
 static void heartbeat_timer(struct timer_list *t)
 {
-	struct heartbeat_data *hd = from_timer(hd, t, timer);
+	struct heartbeat_data *hd = timer_container_of(hd, t, timer);
 	static unsigned bit = 0, up = 1;
 
 	heartbeat_toggle_bit(hd, bit, hd->flags & HEARTBEAT_INVERTED);
diff --git a/arch/sh/drivers/pci/common.c b/arch/sh/drivers/pci/common.c
index 5442475d132e..9633b6147a05 100644
--- a/arch/sh/drivers/pci/common.c
+++ b/arch/sh/drivers/pci/common.c
@@ -88,7 +88,7 @@ int __init pci_is_66mhz_capable(struct pci_channel *hose,
 
 static void pcibios_enable_err(struct timer_list *t)
 {
-	struct pci_channel *hose = from_timer(hose, t, err_timer);
+	struct pci_channel *hose = timer_container_of(hose, t, err_timer);
 
 	timer_delete(&hose->err_timer);
 	printk(KERN_DEBUG "PCI: re-enabling error IRQ.\n");
@@ -97,7 +97,7 @@ static void pcibios_enable_err(struct timer_list *t)
 
 static void pcibios_enable_serr(struct timer_list *t)
 {
-	struct pci_channel *hose = from_timer(hose, t, serr_timer);
+	struct pci_channel *hose = timer_container_of(hose, t, serr_timer);
 
 	timer_delete(&hose->serr_timer);
 	printk(KERN_DEBUG "PCI: re-enabling system error IRQ.\n");
diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c
index 2b51ad9d5586..443cc6fd26a8 100644
--- a/arch/sh/drivers/push-switch.c
+++ b/arch/sh/drivers/push-switch.c
@@ -25,7 +25,7 @@ static DEVICE_ATTR_RO(switch);
 
 static void switch_timer(struct timer_list *t)
 {
-	struct push_switch *psw = from_timer(psw, t, debounce);
+	struct push_switch *psw = timer_container_of(psw, t, debounce);
 
 	schedule_work(&psw->work);
 }
diff --git a/arch/sparc/kernel/viohs.c b/arch/sparc/kernel/viohs.c
index e27afd233bf5..8fb2e7ca5015 100644
--- a/arch/sparc/kernel/viohs.c
+++ b/arch/sparc/kernel/viohs.c
@@ -804,7 +804,7 @@ EXPORT_SYMBOL(vio_port_up);
 
 static void vio_port_timer(struct timer_list *t)
 {
-	struct vio_driver_state *vio = from_timer(vio, t, timer);
+	struct vio_driver_state *vio = timer_container_of(vio, t, timer);
 
 	vio_port_up(vio);
 }
diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index 5226d2c52e6a..f292e0b4ff8b 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -1534,7 +1534,7 @@ static const struct net_device_ops vector_netdev_ops = {
 
 static void vector_timer_expire(struct timer_list *t)
 {
-	struct vector_private *vp = from_timer(vp, t, tl);
+	struct vector_private *vp = timer_container_of(vp, t, tl);
 
 	vp->estats.tx_kicks++;
 	napi_schedule(&vp->napi);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 38b33cdd4232..9b029bb29a16 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1571,7 +1571,8 @@ out:
 
 static void cancel_evtchn_poll(struct timer_list *t)
 {
-	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+	struct kvm_vcpu *vcpu = timer_container_of(vcpu, t,
+						   arch.xen.poll_timer);
 
 	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
 	kvm_vcpu_kick(vcpu);
diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c
index c6d8c62695e1..f0a63b2f85cc 100644
--- a/arch/xtensa/platforms/iss/network.c
+++ b/arch/xtensa/platforms/iss/network.c
@@ -338,7 +338,7 @@ static int iss_net_poll(struct iss_net_private *lp)
 
 static void iss_net_timer(struct timer_list *t)
 {
-	struct iss_net_private *lp = from_timer(lp, t, timer);
+	struct iss_net_private *lp = timer_container_of(lp, t, timer);
 
 	iss_net_poll(lp);
 	mod_timer(&lp->timer, jiffies + lp->timer_val);
-- 
cgit v1.2.3


From e34dbbc85d64af59176fe59fad7b4122f4330fe2 Mon Sep 17 00:00:00 2001
From: "Xin Li (Intel)" <xin@zytor.com>
Date: Mon, 9 Jun 2025 01:40:53 -0700
Subject: x86/fred/signal: Prevent immediate repeat of single step trap on
 return from SIGTRAP handler

Clear the software event flag in the augmented SS to prevent immediate
repeat of single step trap on return from SIGTRAP handler if the trap
flag (TF) is set without an external debugger attached.

Following is a typical single-stepping flow for a user process:

1) The user process is prepared for single-stepping by setting
   RFLAGS.TF = 1.
2) When any instruction in user space completes, a #DB is triggered.
3) The kernel handles the #DB and returns to user space, invoking the
   SIGTRAP handler with RFLAGS.TF = 0.
4) After the SIGTRAP handler finishes, the user process performs a
   sigreturn syscall, restoring the original state, including
   RFLAGS.TF = 1.
5) Goto step 2.

According to the FRED specification:

A) Bit 17 in the augmented SS is designated as the software event
   flag, which is set to 1 for FRED event delivery of SYSCALL,
   SYSENTER, or INT n.
B) If bit 17 of the augmented SS is 1 and ERETU would result in
   RFLAGS.TF = 1, a single-step trap will be pending upon completion
   of ERETU.

In step 4) above, the software event flag is set upon the sigreturn
syscall, and its corresponding ERETU would restore RFLAGS.TF = 1.
This combination causes a pending single-step trap upon completion of
ERETU.  Therefore, another #DB is triggered before any user space
instruction is executed, which leads to an infinite loop in which the
SIGTRAP handler keeps being invoked on the same user space IP.

Fixes: 14619d912b65 ("x86/fred: FRED entry/exit and dispatch code")
Suggested-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Signed-off-by: Xin Li (Intel) <xin@zytor.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Tested-by: Sohil Mehta <sohil.mehta@intel.com>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250609084054.2083189-2-xin%40zytor.com
---
 arch/x86/include/asm/sighandling.h | 22 ++++++++++++++++++++++
 arch/x86/kernel/signal_32.c        |  4 ++++
 arch/x86/kernel/signal_64.c        |  4 ++++
 3 files changed, 30 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index e770c4fc47f4..8727c7e21dd1 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -24,4 +24,26 @@ int ia32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
 int x64_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
 int x32_setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs);
 
+/*
+ * To prevent immediate repeat of single step trap on return from SIGTRAP
+ * handler if the trap flag (TF) is set without an external debugger attached,
+ * clear the software event flag in the augmented SS, ensuring no single-step
+ * trap is pending upon ERETU completion.
+ *
+ * Note, this function should be called in sigreturn() before the original
+ * state is restored to make sure the TF is read from the entry frame.
+ */
+static __always_inline void prevent_single_step_upon_eretu(struct pt_regs *regs)
+{
+	/*
+	 * If the trap flag (TF) is set, i.e., the sigreturn() SYSCALL instruction
+	 * is being single-stepped, do not clear the software event flag in the
+	 * augmented SS, thus a debugger won't skip over the following instruction.
+	 */
+#ifdef CONFIG_X86_FRED
+	if (!(regs->flags & X86_EFLAGS_TF))
+		regs->fred_ss.swevent = 0;
+#endif
+}
+
 #endif /* _ASM_X86_SIGHANDLING_H */
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 98123ff10506..42bbc42bd350 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -152,6 +152,8 @@ SYSCALL32_DEFINE0(sigreturn)
 	struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
 	sigset_t set;
 
+	prevent_single_step_upon_eretu(regs);
+
 	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.oldmask)
@@ -175,6 +177,8 @@ SYSCALL32_DEFINE0(rt_sigreturn)
 	struct rt_sigframe_ia32 __user *frame;
 	sigset_t set;
 
+	prevent_single_step_upon_eretu(regs);
+
 	frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
 
 	if (!access_ok(frame, sizeof(*frame)))
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ee9453891901..d483b585c6c6 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -250,6 +250,8 @@ SYSCALL_DEFINE0(rt_sigreturn)
 	sigset_t set;
 	unsigned long uc_flags;
 
+	prevent_single_step_upon_eretu(regs);
+
 	frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
 	if (!access_ok(frame, sizeof(*frame)))
 		goto badframe;
@@ -366,6 +368,8 @@ COMPAT_SYSCALL_DEFINE0(x32_rt_sigreturn)
 	sigset_t set;
 	unsigned long uc_flags;
 
+	prevent_single_step_upon_eretu(regs);
+
 	frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
 
 	if (!access_ok(frame, sizeof(*frame)))
-- 
cgit v1.2.3


From ea7caffedd011f7d40abe93a884ffbe46f122535 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@nvidia.com>
Date: Tue, 8 Apr 2025 14:22:56 -0300
Subject: ARC: atomics: Implement arch_atomic64_cmpxchg using _relaxed

The core atomic code has a number of macros where it elaborates
architecture primitives into more functions. ARC uses
arch_atomic64_cmpxchg() as it's architecture primitive which disable alot
of the additional functions.

Instead provide arch_cmpxchg64_relaxed() as the primitive and rely on the
core macros to create arch_cmpxchg64().

The macros will also provide other functions, for instance,
try_cmpxchg64_release(), giving a more complete implementation.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/Z0747n5bSep4_1VX@J2N7QTR9R3
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Vineet Gupta <vgupta@kernel.org>
---
 arch/arc/include/asm/atomic64-arcv2.h | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/arc/include/asm/atomic64-arcv2.h b/arch/arc/include/asm/atomic64-arcv2.h
index 9b5791b85471..73080a664369 100644
--- a/arch/arc/include/asm/atomic64-arcv2.h
+++ b/arch/arc/include/asm/atomic64-arcv2.h
@@ -137,12 +137,9 @@ ATOMIC64_OPS(xor, xor, xor)
 #undef ATOMIC64_OP_RETURN
 #undef ATOMIC64_OP
 
-static inline s64
-arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
+static inline u64 __arch_cmpxchg64_relaxed(volatile void *ptr, u64 old, u64 new)
 {
-	s64 prev;
-
-	smp_mb();
+	u64 prev;
 
 	__asm__ __volatile__(
 	"1:	llockd  %0, [%1]	\n"
@@ -152,14 +149,12 @@ arch_atomic64_cmpxchg(atomic64_t *ptr, s64 expected, s64 new)
 	"	bnz     1b		\n"
 	"2:				\n"
 	: "=&r"(prev)
-	: "r"(ptr), "ir"(expected), "r"(new)
-	: "cc");	/* memory clobber comes from smp_mb() */
-
-	smp_mb();
+	: "r"(ptr), "ir"(old), "r"(new)
+	: "memory", "cc");
 
 	return prev;
 }
-#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg
+#define arch_cmpxchg64_relaxed __arch_cmpxchg64_relaxed
 
 static inline s64 arch_atomic64_xchg(atomic64_t *ptr, s64 new)
 {
-- 
cgit v1.2.3


From 857f4517965b282234e12f6bca0c21ef10eec09b Mon Sep 17 00:00:00 2001
From: Yu-Chun Lin <eleanor15x@gmail.com>
Date: Thu, 10 Apr 2025 01:11:16 +0800
Subject: ARC: unwind: Use built-in sort swap to reduce code size and improve
 performance

The custom swap function used in sort() was identical to the default
built-in sort swap. Remove the custom swap function and passes NULL to
sort(), allowing it to use the default swap function.

This change reduces code size and improves performance, particularly when
CONFIG_MITIGATION_RETPOLINE is enabled. With RETPOLINE mitigation, indirect
function calls incur significant overhead, and using the default swap
function avoids this cost.

$ ./scripts/bloat-o-meter ./unwind.o.old ./unwind.o.new
add/remove: 0/1 grow/shrink: 0/1 up/down: 0/-22 (-22)
Function                                     old     new   delta
init_unwind_hdr.constprop                    544     540      -4
swap_eh_frame_hdr_table_entries               18       -     -18
Total: Before=4410, After=4388, chg -0.50%

Signed-off-by: Yu-Chun Lin <eleanor15x@gmail.com>
Signed-off-by: Vineet Gupta <vgupta@kernel.org>
---
 arch/arc/kernel/unwind.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'arch')

diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c
index d8969dab12d4..789cfb9ea14e 100644
--- a/arch/arc/kernel/unwind.c
+++ b/arch/arc/kernel/unwind.c
@@ -241,15 +241,6 @@ static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2)
 	return (e1->start > e2->start) - (e1->start < e2->start);
 }
 
-static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size)
-{
-	struct eh_frame_hdr_table_entry *e1 = p1;
-	struct eh_frame_hdr_table_entry *e2 = p2;
-
-	swap(e1->start, e2->start);
-	swap(e1->fde, e2->fde);
-}
-
 static void init_unwind_hdr(struct unwind_table *table,
 			    void *(*alloc) (unsigned long))
 {
@@ -345,7 +336,7 @@ static void init_unwind_hdr(struct unwind_table *table,
 	sort(header->table,
 	     n,
 	     sizeof(*header->table),
-	     cmp_eh_frame_hdr_table_entries, swap_eh_frame_hdr_table_entries);
+	     cmp_eh_frame_hdr_table_entries, NULL);
 
 	table->hdrsz = hdrSize;
 	smp_wmb();
-- 
cgit v1.2.3


From 2cb74be378675c860af0fcaf1ec2801beebdf028 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Fri, 14 Mar 2025 08:09:35 +0100
Subject: ARC: Replace __ASSEMBLY__ with __ASSEMBLER__ in uapi headers

__ASSEMBLY__ is only defined by the Makefile of the kernel, so
this is not really useful for uapi headers (unless the userspace
Makefile defines it, too). Let's switch to __ASSEMBLER__ which
gets set automatically by the compiler when compiling assembly
code.

Cc: linux-snps-arc@lists.infradead.org
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Vineet Gupta <vgupta@kernel.org>
---
 arch/arc/include/uapi/asm/ptrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arc/include/uapi/asm/ptrace.h b/arch/arc/include/uapi/asm/ptrace.h
index 2a6eff57f6dd..3ae832db278c 100644
--- a/arch/arc/include/uapi/asm/ptrace.h
+++ b/arch/arc/include/uapi/asm/ptrace.h
@@ -14,7 +14,7 @@
 
 #define PTRACE_GET_THREAD_AREA	25
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 /*
  * Userspace ABI: Register state needed by
  *  -ptrace (gdbserver)
@@ -53,6 +53,6 @@ struct user_regs_arcv2 {
 	unsigned long r30, r58, r59;
 };
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #endif /* _UAPI__ASM_ARC_PTRACE_H */
-- 
cgit v1.2.3


From 179e949719fe81219a3e23f1e716ac2d02eea845 Mon Sep 17 00:00:00 2001
From: Thomas Huth <thuth@redhat.com>
Date: Fri, 14 Mar 2025 08:09:36 +0100
Subject: ARC: Replace __ASSEMBLY__ with __ASSEMBLER__ in the non-uapi headers

While the GCC and Clang compilers already define __ASSEMBLER__
automatically when compiling assembly code, __ASSEMBLY__ is a
macro that only gets defined by the Makefiles in the kernel.
This can be very confusing when switching between userspace
and kernelspace coding, or when dealing with uapi headers that
rather should use __ASSEMBLER__ instead. So let's standardize on
the __ASSEMBLER__ macro that is provided by the compilers now.

This is a completely mechanical patch (done with a simple "sed -i"
statement).

Cc: linux-snps-arc@lists.infradead.org
Signed-off-by: Thomas Huth <thuth@redhat.com>
Signed-off-by: Vineet Gupta <vgupta@kernel.org>
---
 arch/arc/include/asm/arcregs.h            | 2 +-
 arch/arc/include/asm/atomic.h             | 4 ++--
 arch/arc/include/asm/bitops.h             | 4 ++--
 arch/arc/include/asm/bug.h                | 4 ++--
 arch/arc/include/asm/cache.h              | 4 ++--
 arch/arc/include/asm/current.h            | 4 ++--
 arch/arc/include/asm/dsp-impl.h           | 2 +-
 arch/arc/include/asm/dsp.h                | 4 ++--
 arch/arc/include/asm/dwarf.h              | 4 ++--
 arch/arc/include/asm/entry.h              | 4 ++--
 arch/arc/include/asm/irqflags-arcv2.h     | 4 ++--
 arch/arc/include/asm/irqflags-compact.h   | 4 ++--
 arch/arc/include/asm/jump_label.h         | 4 ++--
 arch/arc/include/asm/linkage.h            | 6 +++---
 arch/arc/include/asm/mmu-arcv2.h          | 4 ++--
 arch/arc/include/asm/mmu.h                | 2 +-
 arch/arc/include/asm/page.h               | 4 ++--
 arch/arc/include/asm/pgtable-bits-arcv2.h | 4 ++--
 arch/arc/include/asm/pgtable-levels.h     | 4 ++--
 arch/arc/include/asm/pgtable.h            | 4 ++--
 arch/arc/include/asm/processor.h          | 4 ++--
 arch/arc/include/asm/ptrace.h             | 4 ++--
 arch/arc/include/asm/switch_to.h          | 2 +-
 arch/arc/include/asm/thread_info.h        | 4 ++--
 24 files changed, 45 insertions(+), 45 deletions(-)

(limited to 'arch')

diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h
index 005d9e4d187a..a31bbf5c8bbc 100644
--- a/arch/arc/include/asm/arcregs.h
+++ b/arch/arc/include/asm/arcregs.h
@@ -144,7 +144,7 @@
 #define ARC_AUX_AGU_MOD2	0x5E2
 #define ARC_AUX_AGU_MOD3	0x5E3
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <soc/arc/arc_aux.h>
 
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 592d7fffc223..e615c42b93ba 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -6,7 +6,7 @@
 #ifndef _ASM_ARC_ATOMIC_H
 #define _ASM_ARC_ATOMIC_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/types.h>
 #include <linux/compiler.h>
@@ -31,6 +31,6 @@
 #include <asm/atomic64-arcv2.h>
 #endif
 
-#endif	/* !__ASSEMBLY__ */
+#endif	/* !__ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index f5a936496f06..5340c2871392 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -10,7 +10,7 @@
 #error only <linux/bitops.h> can be included directly
 #endif
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/types.h>
 #include <linux/compiler.h>
@@ -192,6 +192,6 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x)
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h
index 4c453ba96c51..171c16021f70 100644
--- a/arch/arc/include/asm/bug.h
+++ b/arch/arc/include/asm/bug.h
@@ -6,7 +6,7 @@
 #ifndef _ASM_ARC_BUG_H
 #define _ASM_ARC_BUG_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <asm/ptrace.h>
 
@@ -29,6 +29,6 @@ void die(const char *str, struct pt_regs *regs, unsigned long address);
 
 #include <asm-generic/bug.h>
 
-#endif	/* !__ASSEMBLY__ */
+#endif	/* !__ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h
index f0f1fc5d62b6..040a97f4dd82 100644
--- a/arch/arc/include/asm/cache.h
+++ b/arch/arc/include/asm/cache.h
@@ -23,7 +23,7 @@
  */
 #define ARC_UNCACHED_ADDR_SPACE	0xc0000000
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/build_bug.h>
 
@@ -65,7 +65,7 @@
 extern int ioc_enable;
 extern unsigned long perip_base, perip_end;
 
-#endif	/* !__ASSEMBLY__ */
+#endif	/* !__ASSEMBLER__ */
 
 /* Instruction cache related Auxiliary registers */
 #define ARC_REG_IC_BCR		0x77	/* Build Config reg */
diff --git a/arch/arc/include/asm/current.h b/arch/arc/include/asm/current.h
index 06be89f6f2f0..03ffd005f3fa 100644
--- a/arch/arc/include/asm/current.h
+++ b/arch/arc/include/asm/current.h
@@ -9,7 +9,7 @@
 #ifndef _ASM_ARC_CURRENT_H
 #define _ASM_ARC_CURRENT_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #ifdef CONFIG_ARC_CURR_IN_REG
 
@@ -20,6 +20,6 @@ register struct task_struct *curr_arc asm("gp");
 #include <asm-generic/current.h>
 #endif /* ! CONFIG_ARC_CURR_IN_REG */
 
-#endif /* ! __ASSEMBLY__ */
+#endif /* ! __ASSEMBLER__ */
 
 #endif /* _ASM_ARC_CURRENT_H */
diff --git a/arch/arc/include/asm/dsp-impl.h b/arch/arc/include/asm/dsp-impl.h
index cd5636dfeb6f..fd5fdaad90c1 100644
--- a/arch/arc/include/asm/dsp-impl.h
+++ b/arch/arc/include/asm/dsp-impl.h
@@ -11,7 +11,7 @@
 
 #define DSP_CTRL_DISABLED_ALL		0
 
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 /* clobbers r5 register */
 .macro DSP_EARLY_INIT
diff --git a/arch/arc/include/asm/dsp.h b/arch/arc/include/asm/dsp.h
index f496dbc4640b..eeaaf4e4eabd 100644
--- a/arch/arc/include/asm/dsp.h
+++ b/arch/arc/include/asm/dsp.h
@@ -7,7 +7,7 @@
 #ifndef __ASM_ARC_DSP_H
 #define __ASM_ARC_DSP_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 /*
  * DSP-related saved registers - need to be saved only when you are
@@ -24,6 +24,6 @@ struct dsp_callee_regs {
 #endif
 };
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #endif /* __ASM_ARC_DSP_H */
diff --git a/arch/arc/include/asm/dwarf.h b/arch/arc/include/asm/dwarf.h
index a0d5ebe1bc3f..1524c5cf8b59 100644
--- a/arch/arc/include/asm/dwarf.h
+++ b/arch/arc/include/asm/dwarf.h
@@ -6,7 +6,7 @@
 #ifndef _ASM_ARC_DWARF_H
 #define _ASM_ARC_DWARF_H
 
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 #ifdef ARC_DW2_UNWIND_AS_CFI
 
@@ -38,6 +38,6 @@
 
 #endif	/* !ARC_DW2_UNWIND_AS_CFI */
 
-#endif	/* __ASSEMBLY__ */
+#endif	/* __ASSEMBLER__ */
 
 #endif	/* _ASM_ARC_DWARF_H */
diff --git a/arch/arc/include/asm/entry.h b/arch/arc/include/asm/entry.h
index 38c35722cebf..f453af251a1a 100644
--- a/arch/arc/include/asm/entry.h
+++ b/arch/arc/include/asm/entry.h
@@ -13,7 +13,7 @@
 #include <asm/processor.h>	/* For VMALLOC_START */
 #include <asm/mmu.h>
 
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 #ifdef CONFIG_ISA_ARCOMPACT
 #include <asm/entry-compact.h>	/* ISA specific bits */
@@ -146,7 +146,7 @@
 
 #endif	/* CONFIG_ARC_CURR_IN_REG */
 
-#else	/* !__ASSEMBLY__ */
+#else	/* !__ASSEMBLER__ */
 
 extern void do_signal(struct pt_regs *);
 extern void do_notify_resume(struct pt_regs *);
diff --git a/arch/arc/include/asm/irqflags-arcv2.h b/arch/arc/include/asm/irqflags-arcv2.h
index fb3c21f1a238..30aea562f8aa 100644
--- a/arch/arc/include/asm/irqflags-arcv2.h
+++ b/arch/arc/include/asm/irqflags-arcv2.h
@@ -50,7 +50,7 @@
 #define ISA_INIT_STATUS_BITS	(STATUS_IE_MASK | __AD_ENB | \
 					(ARCV2_IRQ_DEF_PRIO << 1))
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 /*
  * Save IRQ state and disable IRQs
@@ -170,6 +170,6 @@ static inline void arc_softirq_clear(int irq)
 	seti
 .endm
 
-#endif	/* __ASSEMBLY__ */
+#endif	/* __ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/irqflags-compact.h b/arch/arc/include/asm/irqflags-compact.h
index 936a2f21f315..85c2f6bcde0c 100644
--- a/arch/arc/include/asm/irqflags-compact.h
+++ b/arch/arc/include/asm/irqflags-compact.h
@@ -40,7 +40,7 @@
 
 #define ISA_INIT_STATUS_BITS	STATUS_IE_MASK
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 /******************************************************************
  * IRQ Control Macros
@@ -196,6 +196,6 @@ static inline int arch_irqs_disabled(void)
 	flag	\scratch
 .endm
 
-#endif	/* __ASSEMBLY__ */
+#endif	/* __ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/jump_label.h b/arch/arc/include/asm/jump_label.h
index a339223d9e05..66ead75784d9 100644
--- a/arch/arc/include/asm/jump_label.h
+++ b/arch/arc/include/asm/jump_label.h
@@ -2,7 +2,7 @@
 #ifndef _ASM_ARC_JUMP_LABEL_H
 #define _ASM_ARC_JUMP_LABEL_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/stringify.h>
 #include <linux/types.h>
@@ -68,5 +68,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
-#endif  /* __ASSEMBLY__ */
+#endif  /* __ASSEMBLER__ */
 #endif
diff --git a/arch/arc/include/asm/linkage.h b/arch/arc/include/asm/linkage.h
index 8a3fb71e9cfa..ba3cb65b5eaa 100644
--- a/arch/arc/include/asm/linkage.h
+++ b/arch/arc/include/asm/linkage.h
@@ -12,7 +12,7 @@
 #define __ALIGN		.align 4
 #define __ALIGN_STR	__stringify(__ALIGN)
 
-#ifdef __ASSEMBLY__
+#ifdef __ASSEMBLER__
 
 .macro ST2 e, o, off
 #ifdef CONFIG_ARC_HAS_LL64
@@ -61,7 +61,7 @@
 	CFI_ENDPROC ASM_NL	\
 	.size name, .-name
 
-#else	/* !__ASSEMBLY__ */
+#else	/* !__ASSEMBLER__ */
 
 #ifdef CONFIG_ARC_HAS_ICCM
 #define __arcfp_code __section(".text.arcfp")
@@ -75,6 +75,6 @@
 #define __arcfp_data __section(".data")
 #endif
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/mmu-arcv2.h b/arch/arc/include/asm/mmu-arcv2.h
index 41412642f279..5e5482026ac9 100644
--- a/arch/arc/include/asm/mmu-arcv2.h
+++ b/arch/arc/include/asm/mmu-arcv2.h
@@ -69,7 +69,7 @@
 
 #define PTE_BITS_NON_RWX_IN_PD1	(PAGE_MASK_PHYS | _PAGE_CACHEABLE)
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 struct mm_struct;
 extern int pae40_exist_but_not_enab(void);
@@ -100,6 +100,6 @@ static inline void mmu_setup_pgd(struct mm_struct *mm, void *pgd)
 	sr \reg, [ARC_REG_PID]
 .endm
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/mmu.h b/arch/arc/include/asm/mmu.h
index 4ae2db59d494..e3b35ceab582 100644
--- a/arch/arc/include/asm/mmu.h
+++ b/arch/arc/include/asm/mmu.h
@@ -6,7 +6,7 @@
 #ifndef _ASM_ARC_MMU_H
 #define _ASM_ARC_MMU_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/threads.h>	/* NR_CPUS */
 
diff --git a/arch/arc/include/asm/page.h b/arch/arc/include/asm/page.h
index def0dfb95b43..9720fe6b2c24 100644
--- a/arch/arc/include/asm/page.h
+++ b/arch/arc/include/asm/page.h
@@ -19,7 +19,7 @@
 
 #endif /* CONFIG_ARC_HAS_PAE40 */
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #define clear_page(paddr)		memset((paddr), 0, PAGE_SIZE)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
@@ -136,6 +136,6 @@ static inline unsigned long virt_to_pfn(const void *kaddr)
 #include <asm-generic/memory_model.h>   /* page_to_pfn, pfn_to_page */
 #include <asm-generic/getorder.h>
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 8ebec1b21d24..a8fdb4fe1fbd 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -75,7 +75,7 @@
  *     This is to enable COW mechanism
  */
 	/* xwr */
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #define pte_write(pte)		(pte_val(pte) & _PAGE_WRITE)
 #define pte_dirty(pte)		(pte_val(pte) & _PAGE_DIRTY)
@@ -142,6 +142,6 @@ PTE_BIT_FUNC(swp_clear_exclusive, &= ~(_PAGE_SWP_EXCLUSIVE));
 #include <asm/hugepage.h>
 #endif
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h
index d1ce4b0f1071..c8f9273372c0 100644
--- a/arch/arc/include/asm/pgtable-levels.h
+++ b/arch/arc/include/asm/pgtable-levels.h
@@ -85,7 +85,7 @@
 
 #define PTRS_PER_PTE		BIT(PMD_SHIFT - PAGE_SHIFT)
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #if CONFIG_PGTABLE_LEVELS > 3
 #include <asm-generic/pgtable-nop4d.h>
@@ -181,6 +181,6 @@
 #define pmd_leaf(x)		(pmd_val(x) & _PAGE_HW_SZ)
 #endif
 
-#endif	/* !__ASSEMBLY__ */
+#endif	/* !__ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 4cf45a99fd79..bd580e2b62d7 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -19,7 +19,7 @@
  */
 #define	USER_PTRS_PER_PGD	(TASK_SIZE / PGDIR_SIZE)
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 extern char empty_zero_page[PAGE_SIZE];
 #define ZERO_PAGE(vaddr)	(virt_to_page(empty_zero_page))
@@ -29,6 +29,6 @@ extern pgd_t swapper_pg_dir[] __aligned(PAGE_SIZE);
 /* to cope with aliasing VIPT cache */
 #define HAVE_ARCH_UNMAPPED_AREA
 
-#endif /* __ASSEMBLY__ */
+#endif /* __ASSEMBLER__ */
 
 #endif
diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h
index d606658e2fe7..7f7901ac6643 100644
--- a/arch/arc/include/asm/processor.h
+++ b/arch/arc/include/asm/processor.h
@@ -11,7 +11,7 @@
 #ifndef __ASM_ARC_PROCESSOR_H
 #define __ASM_ARC_PROCESSOR_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <asm/ptrace.h>
 #include <asm/dsp.h>
@@ -66,7 +66,7 @@ extern void start_thread(struct pt_regs * regs, unsigned long pc,
 
 extern unsigned int __get_wchan(struct task_struct *p);
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 /*
  * Default System Memory Map on ARC
diff --git a/arch/arc/include/asm/ptrace.h b/arch/arc/include/asm/ptrace.h
index cf79df0b2570..f6c052af8f4d 100644
--- a/arch/arc/include/asm/ptrace.h
+++ b/arch/arc/include/asm/ptrace.h
@@ -10,7 +10,7 @@
 #include <uapi/asm/ptrace.h>
 #include <linux/compiler.h>
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 typedef union {
 	struct {
@@ -172,6 +172,6 @@ static inline unsigned long regs_get_register(struct pt_regs *regs,
 extern int syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_exit(struct pt_regs *);
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 #endif /* __ASM_PTRACE_H */
diff --git a/arch/arc/include/asm/switch_to.h b/arch/arc/include/asm/switch_to.h
index 1f85de8288b1..5806106a65f9 100644
--- a/arch/arc/include/asm/switch_to.h
+++ b/arch/arc/include/asm/switch_to.h
@@ -6,7 +6,7 @@
 #ifndef _ASM_ARC_SWITCH_TO_H
 #define _ASM_ARC_SWITCH_TO_H
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/sched.h>
 #include <asm/dsp-impl.h>
diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h
index 12daaf3a61ea..255d2c774219 100644
--- a/arch/arc/include/asm/thread_info.h
+++ b/arch/arc/include/asm/thread_info.h
@@ -24,7 +24,7 @@
 #define THREAD_SIZE     (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define THREAD_SHIFT	(PAGE_SHIFT << THREAD_SIZE_ORDER)
 
-#ifndef __ASSEMBLY__
+#ifndef __ASSEMBLER__
 
 #include <linux/thread_info.h>
 
@@ -62,7 +62,7 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void)
 	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
 }
 
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLER__ */
 
 /*
  * thread information flags
-- 
cgit v1.2.3


From cd097df4596f3a1e9d75eb8520162de1eb8485b2 Mon Sep 17 00:00:00 2001
From: "Ritesh Harjani (IBM)" <ritesh.list@gmail.com>
Date: Tue, 10 Jun 2025 07:42:26 +0530
Subject: powerpc/powernv/memtrace: Fix out of bounds issue in memtrace mmap

memtrace mmap issue has an out of bounds issue. This patch fixes the by
checking that the requested mapping region size should stay within the
allocated region size.

Reported-by: Jonathan Greental <yonatan02greental@gmail.com>
Fixes: 08a022ad3dfa ("powerpc/powernv/memtrace: Allow mmaping trace buffers")
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250610021227.361980-1-maddy@linux.ibm.com
---
 arch/powerpc/platforms/powernv/memtrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 4ac9808e55a4..2ea30b343354 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -48,11 +48,15 @@ static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
 static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct memtrace_entry *ent = filp->private_data;
+	unsigned long ent_nrpages = ent->size >> PAGE_SHIFT;
+	unsigned long vma_nrpages = vma_pages(vma);
 
-	if (ent->size < vma->vm_end - vma->vm_start)
+	/* The requested page offset should be within object's page count */
+	if (vma->vm_pgoff >= ent_nrpages)
 		return -EINVAL;
 
-	if (vma->vm_pgoff << PAGE_SHIFT >= ent->size)
+	/* The requested mapping range should remain within the bounds */
+	if (vma_nrpages > ent_nrpages - vma->vm_pgoff)
 		return -EINVAL;
 
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-- 
cgit v1.2.3


From 0d67f0dee6c9176bc09a5482dd7346e3a0f14d0b Mon Sep 17 00:00:00 2001
From: Haren Myneni <haren@linux.ibm.com>
Date: Tue, 10 Jun 2025 07:42:27 +0530
Subject: powerpc/vas: Return -EINVAL if the offset is non-zero in mmap()

The user space calls mmap() to map VAS window paste address
and the kernel returns the complete mapped page for each
window. So return -EINVAL if non-zero is passed for offset
parameter to mmap().

See Documentation/arch/powerpc/vas-api.rst for mmap()
restrictions.

Co-developed-by: Jonathan Greental <yonatan02greental@gmail.com>
Signed-off-by: Jonathan Greental <yonatan02greental@gmail.com>
Reported-by: Jonathan Greental <yonatan02greental@gmail.com>
Fixes: dda44eb29c23 ("powerpc/vas: Add VAS user space API")
Signed-off-by: Haren Myneni <haren@linux.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250610021227.361980-2-maddy@linux.ibm.com
---
 arch/powerpc/platforms/book3s/vas-api.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c
index 0b6365d85d11..dc6f75d3ac6e 100644
--- a/arch/powerpc/platforms/book3s/vas-api.c
+++ b/arch/powerpc/platforms/book3s/vas-api.c
@@ -521,6 +521,15 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma)
 		return -EINVAL;
 	}
 
+	/*
+	 * Map complete page to the paste address. So the user
+	 * space should pass 0ULL to the offset parameter.
+	 */
+	if (vma->vm_pgoff) {
+		pr_debug("Page offset unsupported to map paste address\n");
+		return -EINVAL;
+	}
+
 	/* Ensure instance has an open send window */
 	if (!txwin) {
 		pr_err("No send window open?\n");
-- 
cgit v1.2.3


From 0b3bc018e86afdc0cbfef61328c63d5c08f8b370 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@intel.com>
Date: Sat, 7 Jun 2025 01:07:37 +1200
Subject: x86/virt/tdx: Avoid indirect calls to TDX assembly functions

Two 'static inline' TDX helper functions (sc_retry() and
sc_retry_prerr()) take function pointer arguments which refer to
assembly functions.  Normally, the compiler inlines the TDX helper,
realizes that the function pointer targets are completely static --
thus can be resolved at compile time -- and generates direct call
instructions.

But, other times (like when CONFIG_CC_OPTIMIZE_FOR_SIZE=y), the
compiler declines to inline the helpers and will instead generate
indirect call instructions.

Indirect calls to assembly functions require special annotation (for
various Control Flow Integrity mechanisms).  But TDX assembly
functions lack the special annotations and can only be called
directly.

Annotate both the helpers as '__always_inline' to prod the compiler
into maintaining the direct calls. There is no guarantee here, but
Peter has volunteered to report the compiler bug if this assumption
ever breaks[1].

Fixes: 1e66a7e27539 ("x86/virt/tdx: Handle SEAMCALL no entropy error in common code")
Fixes: df01f5ae07dd ("x86/virt/tdx: Add SEAMCALL error printing for module initialization")
Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/lkml/20250605145914.GW39944@noisy.programming.kicks-ass.net/ [1]
Link: https://lore.kernel.org/all/20250606130737.30713-1-kai.huang%40intel.com
---
 arch/x86/include/asm/tdx.h  | 2 +-
 arch/x86/virt/vmx/tdx/tdx.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
index 8b19294600c4..7ddef3a69866 100644
--- a/arch/x86/include/asm/tdx.h
+++ b/arch/x86/include/asm/tdx.h
@@ -106,7 +106,7 @@ void tdx_init(void);
 
 typedef u64 (*sc_func_t)(u64 fn, struct tdx_module_args *args);
 
-static inline u64 sc_retry(sc_func_t func, u64 fn,
+static __always_inline u64 sc_retry(sc_func_t func, u64 fn,
 			   struct tdx_module_args *args)
 {
 	int retry = RDRAND_RETRY_LOOPS;
diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c
index 2457d13c3f9e..c7a9a087ccaf 100644
--- a/arch/x86/virt/vmx/tdx/tdx.c
+++ b/arch/x86/virt/vmx/tdx/tdx.c
@@ -75,8 +75,9 @@ static inline void seamcall_err_ret(u64 fn, u64 err,
 			args->r9, args->r10, args->r11);
 }
 
-static inline int sc_retry_prerr(sc_func_t func, sc_err_func_t err_func,
-				 u64 fn, struct tdx_module_args *args)
+static __always_inline int sc_retry_prerr(sc_func_t func,
+					  sc_err_func_t err_func,
+					  u64 fn, struct tdx_module_args *args)
 {
 	u64 sret = sc_retry(func, fn, args);
 
-- 
cgit v1.2.3


From 1dbf30fdb5e57fb2c39f17f35f2b544d5de34397 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Tue, 3 Jun 2025 14:14:41 +0300
Subject: x86/mm/pat: don't collapse pages without PSE set

Collapsing pages to a leaf PMD or PUD should be done only if
X86_FEATURE_PSE is available, which is not the case when running e.g.
as a Xen PV guest.

Fixes: 41d88484c71c ("x86/mm/pat: restore large ROX pages after fragmentation")
Signed-off-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250528123557.12847-3-jgross@suse.com
---
 arch/x86/mm/pat/set_memory.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 46edc11726b7..8834c76f91c9 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1257,6 +1257,9 @@ static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
 	pgprot_t pgprot;
 	int i = 0;
 
+	if (!cpu_feature_enabled(X86_FEATURE_PSE))
+		return 0;
+
 	addr &= PMD_MASK;
 	pte = pte_offset_kernel(pmd, addr);
 	first = *pte;
-- 
cgit v1.2.3


From 47410d839fcda6890cb82828f874f97710982f24 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:42 +0300
Subject: x86/Kconfig: only enable ROX cache in execmem when STRICT_MODULE_RWX
 is set

Currently ROX cache in execmem is enabled regardless of
STRICT_MODULE_RWX setting. This breaks an assumption that module memory
is writable when STRICT_MODULE_RWX is disabled, for instance for kernel
debuggin.

Only enable ROX cache in execmem when STRICT_MODULE_RWX is set to
restore the original behaviour of module text permissions.

Fixes: 64f6a4e10c05 ("x86: re-enable EXECMEM_ROX support")
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-3-rppt@kernel.org
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 340e5468980e..71019b3b54ea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -89,7 +89,7 @@ config X86
 	select ARCH_HAS_DMA_OPS			if GART_IOMMU || XEN
 	select ARCH_HAS_EARLY_DEBUG		if KGDB
 	select ARCH_HAS_ELF_RANDOMIZE
-	select ARCH_HAS_EXECMEM_ROX		if X86_64
+	select ARCH_HAS_EXECMEM_ROX		if X86_64 && STRICT_MODULE_RWX
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
-- 
cgit v1.2.3


From 0b0cae7119a0ec9449d7261b5e672a5fed765068 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:43 +0300
Subject: x86/its: move its_pages array to struct mod_arch_specific

The of pages with ITS thunks allocated for modules are tracked by an
array in 'struct module'.

Since this is very architecture specific data structure, move it to
'struct mod_arch_specific'.

No functional changes.

Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-4-rppt@kernel.org
---
 arch/x86/include/asm/module.h |  8 ++++++++
 arch/x86/kernel/alternative.c | 19 ++++++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e988bac0a4a1..3c2de4ce3b10 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -5,12 +5,20 @@
 #include <asm-generic/module.h>
 #include <asm/orc_types.h>
 
+struct its_array {
+#ifdef CONFIG_MITIGATION_ITS
+	void **pages;
+	int num;
+#endif
+};
+
 struct mod_arch_specific {
 #ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;
 #endif
+	struct its_array its_pages;
 };
 
 #endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ecfe7b497cad..b50fe6ce4655 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -173,8 +173,8 @@ void its_fini_mod(struct module *mod)
 	its_page = NULL;
 	mutex_unlock(&text_mutex);
 
-	for (int i = 0; i < mod->its_num_pages; i++) {
-		void *page = mod->its_page_array[i];
+	for (int i = 0; i < mod->arch.its_pages.num; i++) {
+		void *page = mod->arch.its_pages.pages[i];
 		execmem_restore_rox(page, PAGE_SIZE);
 	}
 }
@@ -184,11 +184,11 @@ void its_free_mod(struct module *mod)
 	if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS))
 		return;
 
-	for (int i = 0; i < mod->its_num_pages; i++) {
-		void *page = mod->its_page_array[i];
+	for (int i = 0; i < mod->arch.its_pages.num; i++) {
+		void *page = mod->arch.its_pages.pages[i];
 		execmem_free(page);
 	}
-	kfree(mod->its_page_array);
+	kfree(mod->arch.its_pages.pages);
 }
 #endif /* CONFIG_MODULES */
 
@@ -201,14 +201,15 @@ static void *its_alloc(void)
 
 #ifdef CONFIG_MODULES
 	if (its_mod) {
-		void *tmp = krealloc(its_mod->its_page_array,
-				     (its_mod->its_num_pages+1) * sizeof(void *),
+		struct its_array *pages = &its_mod->arch.its_pages;
+		void *tmp = krealloc(pages->pages,
+				     (pages->num+1) * sizeof(void *),
 				     GFP_KERNEL);
 		if (!tmp)
 			return NULL;
 
-		its_mod->its_page_array = tmp;
-		its_mod->its_page_array[its_mod->its_num_pages++] = page;
+		pages->pages = tmp;
+		pages->pages[pages->num++] = page;
 
 		execmem_make_temp_rw(page, PAGE_SIZE);
 	}
-- 
cgit v1.2.3


From a82b26451de126a5ae130361081986bc459afe9b Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Tue, 3 Jun 2025 14:14:44 +0300
Subject: x86/its: explicitly manage permissions for ITS pages

execmem_alloc() sets permissions differently depending on the kernel
configuration, CPU support for PSE and whether a page is allocated
before or after mark_rodata_ro().

Add tracking for pages allocated for ITS when patching the core kernel
and make sure the permissions for ITS pages are explicitly managed for
both kernel and module allocations.

Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Nikolay Borisov <nik.borisov@suse.com>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-5-rppt@kernel.org
---
 arch/x86/kernel/alternative.c | 74 ++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 22 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index b50fe6ce4655..6455f7f751b3 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -116,6 +116,24 @@ static struct module *its_mod;
 #endif
 static void *its_page;
 static unsigned int its_offset;
+struct its_array its_pages;
+
+static void *__its_alloc(struct its_array *pages)
+{
+	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+	if (!page)
+		return NULL;
+
+	void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *),
+			     GFP_KERNEL);
+	if (!tmp)
+		return NULL;
+
+	pages->pages = tmp;
+	pages->pages[pages->num++] = page;
+
+	return no_free_ptr(page);
+}
 
 /* Initialize a thunk with the "jmp *reg; int3" instructions. */
 static void *its_init_thunk(void *thunk, int reg)
@@ -151,6 +169,21 @@ static void *its_init_thunk(void *thunk, int reg)
 	return thunk + offset;
 }
 
+static void its_pages_protect(struct its_array *pages)
+{
+	for (int i = 0; i < pages->num; i++) {
+		void *page = pages->pages[i];
+		execmem_restore_rox(page, PAGE_SIZE);
+	}
+}
+
+static void its_fini_core(void)
+{
+	if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+		its_pages_protect(&its_pages);
+	kfree(its_pages.pages);
+}
+
 #ifdef CONFIG_MODULES
 void its_init_mod(struct module *mod)
 {
@@ -173,10 +206,8 @@ void its_fini_mod(struct module *mod)
 	its_page = NULL;
 	mutex_unlock(&text_mutex);
 
-	for (int i = 0; i < mod->arch.its_pages.num; i++) {
-		void *page = mod->arch.its_pages.pages[i];
-		execmem_restore_rox(page, PAGE_SIZE);
-	}
+	if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+		its_pages_protect(&mod->arch.its_pages);
 }
 
 void its_free_mod(struct module *mod)
@@ -194,28 +225,23 @@ void its_free_mod(struct module *mod)
 
 static void *its_alloc(void)
 {
-	void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE);
+	struct its_array *pages = &its_pages;
+	void *page;
 
+#ifdef CONFIG_MODULE
+	if (its_mod)
+		pages = &its_mod->arch.its_pages;
+#endif
+
+	page = __its_alloc(pages);
 	if (!page)
 		return NULL;
 
-#ifdef CONFIG_MODULES
-	if (its_mod) {
-		struct its_array *pages = &its_mod->arch.its_pages;
-		void *tmp = krealloc(pages->pages,
-				     (pages->num+1) * sizeof(void *),
-				     GFP_KERNEL);
-		if (!tmp)
-			return NULL;
-
-		pages->pages = tmp;
-		pages->pages[pages->num++] = page;
+	execmem_make_temp_rw(page, PAGE_SIZE);
+	if (pages == &its_pages)
+		set_memory_x((unsigned long)page, 1);
 
-		execmem_make_temp_rw(page, PAGE_SIZE);
-	}
-#endif /* CONFIG_MODULES */
-
-	return no_free_ptr(page);
+	return page;
 }
 
 static void *its_allocate_thunk(int reg)
@@ -269,7 +295,9 @@ u8 *its_static_thunk(int reg)
 	return thunk;
 }
 
-#endif
+#else
+static inline void its_fini_core(void) {}
+#endif /* CONFIG_MITIGATION_ITS */
 
 /*
  * Nomenclature for variable names to simplify and clarify this code and ease
@@ -2339,6 +2367,8 @@ void __init alternative_instructions(void)
 	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
 	apply_returns(__return_sites, __return_sites_end);
 
+	its_fini_core();
+
 	/*
 	 * Adjust all CALL instructions to point to func()-10, including
 	 * those in .altinstr_replacement.
-- 
cgit v1.2.3


From 7cd9a11dd0c3d1dd225795ed1b5b53132888e7b5 Mon Sep 17 00:00:00 2001
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Date: Tue, 3 Jun 2025 14:14:45 +0300
Subject: Revert "mm/execmem: Unify early execmem_cache behaviour"

The commit d6d1e3e6580c ("mm/execmem: Unify early execmem_cache
behaviour") changed early behaviour of execemem ROX cache to allow its
usage in early x86 code that allocates text pages when
CONFIG_MITGATION_ITS is enabled.

The permission management of the pages allocated from execmem for ITS
mitigation is now completely contained in arch/x86/kernel/alternatives.c
and therefore there is no need to special case early allocations in
execmem.

This reverts commit d6d1e3e6580ca35071ad474381f053cbf1fb6414.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20250603111446.2609381-6-rppt@kernel.org
---
 arch/x86/mm/init_32.c | 3 ---
 arch/x86/mm/init_64.c | 3 ---
 2 files changed, 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 607d6a2e66e2..8a34fff6ab2b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -30,7 +30,6 @@
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
 #include <linux/gfp.h>
-#include <linux/execmem.h>
 
 #include <asm/asm.h>
 #include <asm/bios_ebda.h>
@@ -749,8 +748,6 @@ void mark_rodata_ro(void)
 	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
-	execmem_cache_make_ro();
-
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ee66fae9ebcc..fdb6cab524f0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,7 +34,6 @@
 #include <linux/gfp.h>
 #include <linux/kcore.h>
 #include <linux/bootmem_info.h>
-#include <linux/execmem.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -1392,8 +1391,6 @@ void mark_rodata_ro(void)
 	       (end - start) >> 10);
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 
-	execmem_cache_make_ro();
-
 	kernel_set_to_readonly = 1;
 
 	/*
-- 
cgit v1.2.3


From 179a8427fcbffe36ccfed5e138d7c9b6180caff9 Mon Sep 17 00:00:00 2001
From: Ashish Kalra <ashish.kalra@amd.com>
Date: Mon, 12 May 2025 22:16:34 +0000
Subject: KVM: SEV: Disable SEV-SNP support on initialization failure

During platform init, SNP initialization may fail for several reasons,
such as firmware command failures and incompatible versions. However,
the KVM capability may continue to advertise support for it.

The platform may have SNP enabled but if SNP_INIT fails then SNP is
not supported by KVM.

During KVM module initialization query the SNP platform status to obtain
the SNP initialization state and use it as an additional condition to
determine support for SEV-SNP.

Co-developed-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Co-developed-by: Pratik R. Sampat <prsampat@amd.com>
Signed-off-by: Pratik R. Sampat <prsampat@amd.com>
Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Ashish Kalra <ashish.kalra@amd.com>
Reviewed-by: Pankaj Gupta <pankaj.gupta@amd.com>
Reviewed-by: Pavan Kumar Paluri <papaluri@amd.com>
Message-ID: <20250512221634.12045-1-Ashish.Kalra@amd.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/svm/sev.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 5a69b657dae9..459c3b791fd4 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2871,6 +2871,33 @@ void __init sev_set_cpu_caps(void)
 	}
 }
 
+static bool is_sev_snp_initialized(void)
+{
+	struct sev_user_data_snp_status *status;
+	struct sev_data_snp_addr buf;
+	bool initialized = false;
+	int ret, error = 0;
+
+	status = snp_alloc_firmware_page(GFP_KERNEL | __GFP_ZERO);
+	if (!status)
+		return false;
+
+	buf.address = __psp_pa(status);
+	ret = sev_do_cmd(SEV_CMD_SNP_PLATFORM_STATUS, &buf, &error);
+	if (ret) {
+		pr_err("SEV: SNP_PLATFORM_STATUS failed ret=%d, fw_error=%d (%#x)\n",
+		       ret, error, error);
+		goto out;
+	}
+
+	initialized = !!status->state;
+
+out:
+	snp_free_firmware_page(status);
+
+	return initialized;
+}
+
 void __init sev_hardware_setup(void)
 {
 	unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
@@ -2975,6 +3002,14 @@ void __init sev_hardware_setup(void)
 	sev_snp_supported = sev_snp_enabled && cc_platform_has(CC_ATTR_HOST_SEV_SNP);
 
 out:
+	if (sev_enabled) {
+		init_args.probe = true;
+		if (sev_platform_init(&init_args))
+			sev_supported = sev_es_supported = sev_snp_supported = false;
+		else if (sev_snp_supported)
+			sev_snp_supported = is_sev_snp_initialized();
+	}
+
 	if (boot_cpu_has(X86_FEATURE_SEV))
 		pr_info("SEV %s (ASIDs %u - %u)\n",
 			sev_supported ? min_sev_asid <= max_sev_asid ? "enabled" :
@@ -3001,15 +3036,6 @@ out:
 	sev_supported_vmsa_features = 0;
 	if (sev_es_debug_swap_enabled)
 		sev_supported_vmsa_features |= SVM_SEV_FEAT_DEBUG_SWAP;
-
-	if (!sev_enabled)
-		return;
-
-	/*
-	 * Do both SNP and SEV initialization at KVM module load.
-	 */
-	init_args.probe = true;
-	sev_platform_init(&init_args);
 }
 
 void sev_hardware_unsetup(void)
-- 
cgit v1.2.3


From 403d1338a4a59cfebb4ded53fa35fbd5119f36b1 Mon Sep 17 00:00:00 2001
From: Magnus Lindholm <linmag7@gmail.com>
Date: Tue, 18 Feb 2025 18:55:14 +0100
Subject: mm: pgtable: fix pte_swp_exclusive

Make pte_swp_exclusive return bool instead of int.  This will better
reflect how pte_swp_exclusive is actually used in the code.

This fixes swap/swapoff problems on Alpha due pte_swp_exclusive not
returning correct values when _PAGE_SWP_EXCLUSIVE bit resides in upper
32-bits of PTE (like on alpha).

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Magnus Lindholm <linmag7@gmail.com>
Cc: Sam James <sam@gentoo.org>
Link: https://lore.kernel.org/lkml/20250218175735.19882-2-linmag7@gmail.com/
Link: https://lore.kernel.org/lkml/20250602041118.GA2675383@ZenIV/
[ Applied as the 'sed' script Al suggested   - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/pgtable.h             | 2 +-
 arch/arc/include/asm/pgtable-bits-arcv2.h    | 2 +-
 arch/arm/include/asm/pgtable.h               | 2 +-
 arch/arm64/include/asm/pgtable.h             | 2 +-
 arch/csky/include/asm/pgtable.h              | 2 +-
 arch/hexagon/include/asm/pgtable.h           | 2 +-
 arch/loongarch/include/asm/pgtable.h         | 2 +-
 arch/m68k/include/asm/mcf_pgtable.h          | 2 +-
 arch/m68k/include/asm/motorola_pgtable.h     | 2 +-
 arch/m68k/include/asm/sun3_pgtable.h         | 2 +-
 arch/microblaze/include/asm/pgtable.h        | 2 +-
 arch/mips/include/asm/pgtable.h              | 4 ++--
 arch/nios2/include/asm/pgtable.h             | 2 +-
 arch/openrisc/include/asm/pgtable.h          | 2 +-
 arch/parisc/include/asm/pgtable.h            | 2 +-
 arch/powerpc/include/asm/book3s/32/pgtable.h | 2 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +-
 arch/powerpc/include/asm/nohash/pgtable.h    | 2 +-
 arch/riscv/include/asm/pgtable.h             | 2 +-
 arch/s390/include/asm/pgtable.h              | 2 +-
 arch/sh/include/asm/pgtable_32.h             | 2 +-
 arch/sparc/include/asm/pgtable_32.h          | 2 +-
 arch/sparc/include/asm/pgtable_64.h          | 2 +-
 arch/um/include/asm/pgtable.h                | 2 +-
 arch/x86/include/asm/pgtable.h               | 2 +-
 arch/xtensa/include/asm/pgtable.h            | 2 +-
 26 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'arch')

diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 2676017f42f1..44e7aedac6e8 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -327,7 +327,7 @@ extern inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 8ebec1b21d24..3084c53f402d 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -130,7 +130,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 7f1c3b4e3e04..86378eec7757 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -301,7 +301,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(swp)	__pte((swp).val)
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_isset(pte, L_PTE_SWP_EXCLUSIVE);
 }
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 88db8a0c0b37..192d86e1cc76 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -563,7 +563,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte)
 	return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
 }
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & PTE_SWP_EXCLUSIVE;
 }
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index b8378431aeff..5a394be09c35 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -200,7 +200,7 @@ static inline pte_t pte_mkyoung(pte_t pte)
 	return pte;
 }
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index 9fbdfdbc539f..fbf24d1d1ca6 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -387,7 +387,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 		(((type & 0x1f) << 1) | \
 		 ((offset & 0x3ffff8) << 10) | ((offset & 0x7) << 7)) })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h
index a3f17914dbab..b30185302c07 100644
--- a/arch/loongarch/include/asm/pgtable.h
+++ b/arch/loongarch/include/asm/pgtable.h
@@ -301,7 +301,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
 #define __pmd_to_swp_entry(pmd) ((swp_entry_t) { pmd_val(pmd) })
 #define __swp_entry_to_pmd(x)	((pmd_t) { (x).val | _PAGE_HUGE })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h
index f5c596b211d4..d79fef609194 100644
--- a/arch/m68k/include/asm/mcf_pgtable.h
+++ b/arch/m68k/include/asm/mcf_pgtable.h
@@ -268,7 +268,7 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD];
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	(__pte((x).val))
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h
index 040ac3bad713..14fee64d3e60 100644
--- a/arch/m68k/include/asm/motorola_pgtable.h
+++ b/arch/m68k/include/asm/motorola_pgtable.h
@@ -185,7 +185,7 @@ extern pgd_t kernel_pg_dir[128];
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h
index 73745dc0ec0e..858cbe936f5b 100644
--- a/arch/m68k/include/asm/sun3_pgtable.h
+++ b/arch/m68k/include/asm/sun3_pgtable.h
@@ -169,7 +169,7 @@ extern pgd_t kernel_pg_dir[PTRS_PER_PGD];
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index b1bb2c65dd04..bae1abfa6f6b 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -398,7 +398,7 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) >> 2 })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val << 2 })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4852b005a72d..ae73ecf4c41a 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -534,7 +534,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #endif
 
 #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32)
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte.pte_low & _PAGE_SWP_EXCLUSIVE;
 }
@@ -551,7 +551,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
 	return pte;
 }
 #else
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index e98578e27e26..844dce55569f 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -259,7 +259,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 #define __swp_entry_to_pte(swp)	((pte_t) { (swp).val })
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 71bfb8c8c482..5bd6463bd514 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -411,7 +411,7 @@ static inline void update_mmu_cache_range(struct vm_fault *vmf,
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 80f5e2a28413..1a86a4370b29 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -425,7 +425,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 42c3af90d1f0..92d21c6faf1e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -365,7 +365,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) >> 3 })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val << 3 })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 6ed93e290c2f..a2ddcbb3fcb9 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -693,7 +693,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte)
 	return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE));
 }
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SWP_EXCLUSIVE));
 }
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h
index 8d1f0b7062eb..7d6b9e5b286e 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -286,7 +286,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 	return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index a11816bbf9e7..438ce7df24c3 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -1028,7 +1028,7 @@ static inline pud_t pud_modify(pud_t pud, pgprot_t newprot)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1c661ac62ce8..6d8bc27a366e 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -915,7 +915,7 @@ static inline int pmd_protnone(pmd_t pmd)
 }
 #endif
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h
index 71b18741cc11..5f51af18997b 100644
--- a/arch/sh/include/asm/pgtable_32.h
+++ b/arch/sh/include/asm/pgtable_32.h
@@ -470,7 +470,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 /* In both cases, we borrow bit 6 to store the exclusive marker in swap PTEs. */
 #define _PAGE_SWP_EXCLUSIVE	_PAGE_USER
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte.pte_low & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 1454ebe91539..7c199c003ffe 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -348,7 +348,7 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & SRMMU_SWP_EXCLUSIVE;
 }
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 4af03e3c161b..669cd02469a1 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1023,7 +1023,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 #define __pte_to_swp_entry(pte)		((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index ca2a519d53ab..24fdea6f88c3 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -314,7 +314,7 @@ extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr);
 	((swp_entry_t) { pte_val(pte_mkuptodate(pte)) })
 #define __swp_entry_to_pte(x)		((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_get_bits(pte, _PAGE_SWP_EXCLUSIVE);
 }
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 774430c3abff..97954c936c54 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1561,7 +1561,7 @@ static inline pte_t pte_swp_mkexclusive(pte_t pte)
 	return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
 }
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
 }
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index cb1725c40e36..d62aa1c316fc 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -349,7 +349,7 @@ ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
-static inline int pte_swp_exclusive(pte_t pte)
+static inline bool pte_swp_exclusive(pte_t pte)
 {
 	return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
 }
-- 
cgit v1.2.3


From aa2024c01a9afba6728b362626f868811ca872ee Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Tue, 10 Jun 2025 20:10:18 -0400
Subject: KVM: x86/mmu: Embed direct bits into gpa for KVM_PRE_FAULT_MEMORY

Bug[*] reported for TDX case when enabling KVM_PRE_FAULT_MEMORY in QEMU.

It turns out that @gpa passed to kvm_mmu_do_page_fault() doesn't have
shared bit set when the memory attribute of it is shared, and it leads
to wrong root in tdp_mmu_get_root_for_fault().

Fix it by embedding the direct bits in the gpa that is passed to
kvm_tdp_map_page(), when the memory of the gpa is not private.

[*] https://lore.kernel.org/qemu-devel/4a757796-11c2-47f1-ae0d-335626e818fd@intel.com/

Reported-by: Xiaoyao Li <xiaoyao.li@intel.com>
Closes: https://lore.kernel.org/qemu-devel/4a757796-11c2-47f1-ae0d-335626e818fd@intel.com/
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Xiaoyao Li <xiaoyao.li@intel.com>
Message-ID: <20250611001018.2179964-1-xiaoyao.li@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cbc84c6abc2e..a4040578b537 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4896,6 +4896,7 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 {
 	u64 error_code = PFERR_GUEST_FINAL_MASK;
 	u8 level = PG_LEVEL_4K;
+	u64 direct_bits;
 	u64 end;
 	int r;
 
@@ -4910,15 +4911,18 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 	if (r)
 		return r;
 
+	direct_bits = 0;
 	if (kvm_arch_has_private_mem(vcpu->kvm) &&
 	    kvm_mem_is_private(vcpu->kvm, gpa_to_gfn(range->gpa)))
 		error_code |= PFERR_PRIVATE_ACCESS;
+	else
+		direct_bits = gfn_to_gpa(kvm_gfn_direct_bits(vcpu->kvm));
 
 	/*
 	 * Shadow paging uses GVA for kvm page fault, so restrict to
 	 * two-dimensional paging.
 	 */
-	r = kvm_tdp_map_page(vcpu, range->gpa, error_code, &level);
+	r = kvm_tdp_map_page(vcpu, range->gpa | direct_bits, error_code, &level);
 	if (r < 0)
 		return r;
 
-- 
cgit v1.2.3


From 8046d29dde17002523f94d3e6e0ebe486ce52166 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 12 Jun 2025 00:47:44 -0400
Subject: KVM: x86/mmu: Reject direct bits in gpa passed to
 KVM_PRE_FAULT_MEMORY

Only let userspace pass the same addresses that were used in KVM_SET_USER_MEMORY_REGION
(or KVM_SET_USER_MEMORY_REGION2); gpas in the the upper half of the address space
are an implementation detail of TDX and KVM.

Extracted from a patch by Sean Christopherson <seanjc@google.com>.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a4040578b537..4e06e2e89a8f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4903,6 +4903,9 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
 	if (!vcpu->kvm->arch.pre_fault_allowed)
 		return -EOPNOTSUPP;
 
+	if (kvm_is_gfn_alias(vcpu->kvm, gpa_to_gfn(range->gpa)))
+		return -EINVAL;
+
 	/*
 	 * reload is efficient when called repeatedly, so we can do it on
 	 * every iteration.
-- 
cgit v1.2.3


From 650768c512faba8070bf4cfbb28c95eb5cd203f3 Mon Sep 17 00:00:00 2001
From: Dev Jain <dev.jain@arm.com>
Date: Tue, 27 May 2025 13:56:33 +0530
Subject: arm64: Restrict pagetable teardown to avoid false warning

Commit 9c006972c3fe ("arm64: mmu: drop pXd_present() checks from
pXd_free_pYd_table()") removes the pxd_present() checks because the
caller checks pxd_present(). But, in case of vmap_try_huge_pud(), the
caller only checks pud_present(); pud_free_pmd_page() recurses on each
pmd through pmd_free_pte_page(), wherein the pmd may be none. Thus it is
possible to hit a warning in the latter, since pmd_none => !pmd_table().
Thus, add a pmd_present() check in pud_free_pmd_page().

This problem was found by code inspection.

Fixes: 9c006972c3fe ("arm64: mmu: drop pXd_present() checks from pXd_free_pYd_table()")
Cc: stable@vger.kernel.org
Reported-by: Ryan Roberts <ryan.roberts@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Dev Jain <dev.jain@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
Link: https://lore.kernel.org/r/20250527082633.61073-1-dev.jain@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/mm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8fcf59ba39db..00ab1d648db6 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1305,7 +1305,8 @@ int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
 	next = addr;
 	end = addr + PUD_SIZE;
 	do {
-		pmd_free_pte_page(pmdp, next);
+		if (pmd_present(pmdp_get(pmdp)))
+			pmd_free_pte_page(pmdp, next);
 	} while (pmdp++, next += PMD_SIZE, next != end);
 
 	pud_clear(pudp);
-- 
cgit v1.2.3


From d2be3270f40b1330f8c1c9512c59dd085a758ac0 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Wed, 11 Jun 2025 17:28:13 +0100
Subject: arm64/gcs: Don't call gcs_free() during flush_gcs()

Currently we call gcs_free() during flush_gcs() to reset the thread
state for GCS. This includes unmapping any kernel allocated GCS, but
this is redundant when doing a flush_thread() since we are
reinitialising the thread memory too. Inline the reinitialisation of the
thread struct.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20250611-arm64-gcs-flush-thread-v1-1-cc26feeddabd@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/process.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a5ca15daeb8a..5954cec19660 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -288,7 +288,9 @@ static void flush_gcs(void)
 	if (!system_supports_gcs())
 		return;
 
-	gcs_free(current);
+	current->thread.gcspr_el0 = 0;
+	current->thread.gcs_base = 0;
+	current->thread.gcs_size = 0;
 	current->thread.gcs_el0_mode = 0;
 	write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
 	write_sysreg_s(0, SYS_GCSPR_EL0);
-- 
cgit v1.2.3


From 39dfc971e42d886e7df01371cd1bef505076d84c Mon Sep 17 00:00:00 2001
From: Tengda Wu <wutengda@huaweicloud.com>
Date: Wed, 4 Jun 2025 00:55:33 +0000
Subject: arm64/ptrace: Fix stack-out-of-bounds read in
 regs_get_kernel_stack_nth()

KASAN reports a stack-out-of-bounds read in regs_get_kernel_stack_nth().

Call Trace:
[   97.283505] BUG: KASAN: stack-out-of-bounds in regs_get_kernel_stack_nth+0xa8/0xc8
[   97.284677] Read of size 8 at addr ffff800089277c10 by task 1.sh/2550
[   97.285732]
[   97.286067] CPU: 7 PID: 2550 Comm: 1.sh Not tainted 6.6.0+ #11
[   97.287032] Hardware name: linux,dummy-virt (DT)
[   97.287815] Call trace:
[   97.288279]  dump_backtrace+0xa0/0x128
[   97.288946]  show_stack+0x20/0x38
[   97.289551]  dump_stack_lvl+0x78/0xc8
[   97.290203]  print_address_description.constprop.0+0x84/0x3c8
[   97.291159]  print_report+0xb0/0x280
[   97.291792]  kasan_report+0x84/0xd0
[   97.292421]  __asan_load8+0x9c/0xc0
[   97.293042]  regs_get_kernel_stack_nth+0xa8/0xc8
[   97.293835]  process_fetch_insn+0x770/0xa30
[   97.294562]  kprobe_trace_func+0x254/0x3b0
[   97.295271]  kprobe_dispatcher+0x98/0xe0
[   97.295955]  kprobe_breakpoint_handler+0x1b0/0x210
[   97.296774]  call_break_hook+0xc4/0x100
[   97.297451]  brk_handler+0x24/0x78
[   97.298073]  do_debug_exception+0xac/0x178
[   97.298785]  el1_dbg+0x70/0x90
[   97.299344]  el1h_64_sync_handler+0xcc/0xe8
[   97.300066]  el1h_64_sync+0x78/0x80
[   97.300699]  kernel_clone+0x0/0x500
[   97.301331]  __arm64_sys_clone+0x70/0x90
[   97.302084]  invoke_syscall+0x68/0x198
[   97.302746]  el0_svc_common.constprop.0+0x11c/0x150
[   97.303569]  do_el0_svc+0x38/0x50
[   97.304164]  el0_svc+0x44/0x1d8
[   97.304749]  el0t_64_sync_handler+0x100/0x130
[   97.305500]  el0t_64_sync+0x188/0x190
[   97.306151]
[   97.306475] The buggy address belongs to stack of task 1.sh/2550
[   97.307461]  and is located at offset 0 in frame:
[   97.308257]  __se_sys_clone+0x0/0x138
[   97.308910]
[   97.309241] This frame has 1 object:
[   97.309873]  [48, 184) 'args'
[   97.309876]
[   97.310749] The buggy address belongs to the virtual mapping at
[   97.310749]  [ffff800089270000, ffff800089279000) created by:
[   97.310749]  dup_task_struct+0xc0/0x2e8
[   97.313347]
[   97.313674] The buggy address belongs to the physical page:
[   97.314604] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x14f69a
[   97.315885] flags: 0x15ffffe00000000(node=1|zone=2|lastcpupid=0xfffff)
[   97.316957] raw: 015ffffe00000000 0000000000000000 dead000000000122 0000000000000000
[   97.318207] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
[   97.319445] page dumped because: kasan: bad access detected
[   97.320371]
[   97.320694] Memory state around the buggy address:
[   97.321511]  ffff800089277b00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   97.322681]  ffff800089277b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[   97.323846] >ffff800089277c00: 00 00 f1 f1 f1 f1 f1 f1 00 00 00 00 00 00 00 00
[   97.325023]                          ^
[   97.325683]  ffff800089277c80: 00 00 00 00 00 00 00 00 00 f3 f3 f3 f3 f3 f3 f3
[   97.326856]  ffff800089277d00: f3 f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00

This issue seems to be related to the behavior of some gcc compilers and
was also fixed on the s390 architecture before:

 commit d93a855c31b7 ("s390/ptrace: Avoid KASAN false positives in regs_get_kernel_stack_nth()")

As described in that commit, regs_get_kernel_stack_nth() has confirmed that
`addr` is on the stack, so reading the value at `*addr` should be allowed.
Use READ_ONCE_NOCHECK() helper to silence the KASAN check for this case.

Fixes: 0a8ea52c3eb1 ("arm64: Add HAVE_REGS_AND_STACK_ACCESS_API feature")
Signed-off-by: Tengda Wu <wutengda@huaweicloud.com>
Link: https://lore.kernel.org/r/20250604005533.1278992-1-wutengda@huaweicloud.com
[will: Use '*addr' as the argument to READ_ONCE_NOCHECK()]
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index a360e52db02f..ee94b72bf8fb 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -141,7 +141,7 @@ unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
 
 	addr += n;
 	if (regs_within_kernel_stack(regs, (unsigned long)addr))
-		return *addr;
+		return READ_ONCE_NOCHECK(*addr);
 	else
 		return 0;
 }
-- 
cgit v1.2.3


From b93755f408325170edb2156c6a894ed1cae5f4f6 Mon Sep 17 00:00:00 2001
From: Christophe Leroy <christophe.leroy@csgroup.eu>
Date: Mon, 12 May 2025 20:14:55 +0200
Subject: powerpc/vdso: Fix build of VDSO32 with pcrel

Building vdso32 on power10 with pcrel leads to following errors:

	  VDSO32A arch/powerpc/kernel/vdso/gettimeofday-32.o
	arch/powerpc/kernel/vdso/gettimeofday.S: Assembler messages:
	arch/powerpc/kernel/vdso/gettimeofday.S:40: Error: syntax error; found `@', expected `,'
	arch/powerpc/kernel/vdso/gettimeofday.S:71:  Info: macro invoked from here
	arch/powerpc/kernel/vdso/gettimeofday.S:40: Error: junk at end of line: `@notoc'
	arch/powerpc/kernel/vdso/gettimeofday.S:71:  Info: macro invoked from here
	 ...
	make[2]: *** [arch/powerpc/kernel/vdso/Makefile:85: arch/powerpc/kernel/vdso/gettimeofday-32.o] Error 1
	make[1]: *** [arch/powerpc/Makefile:388: vdso_prepare] Error 2

Once the above is fixed, the following happens:

	  VDSO32C arch/powerpc/kernel/vdso/vgettimeofday-32.o
	cc1: error: '-mpcrel' requires '-mcmodel=medium'
	make[2]: *** [arch/powerpc/kernel/vdso/Makefile:89: arch/powerpc/kernel/vdso/vgettimeofday-32.o] Error 1
	make[1]: *** [arch/powerpc/Makefile:388: vdso_prepare] Error 2
	make: *** [Makefile:251: __sub-make] Error 2

Make sure pcrel version of CFUNC() macro is used only for powerpc64
builds and remove -mpcrel for powerpc32 builds.

Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL addresing")
Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/1fa3453f07d42a50a70114da9905bf7b73304fca.1747073669.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/ppc_asm.h | 2 +-
 arch/powerpc/kernel/vdso/Makefile  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
index 02897f4b0dbf..b891910fce8a 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -183,7 +183,7 @@
 /*
  * Used to name C functions called from asm
  */
-#ifdef CONFIG_PPC_KERNEL_PCREL
+#if defined(__powerpc64__) && defined(CONFIG_PPC_KERNEL_PCREL)
 #define CFUNC(name) name@notoc
 #else
 #define CFUNC(name) name
diff --git a/arch/powerpc/kernel/vdso/Makefile b/arch/powerpc/kernel/vdso/Makefile
index e8824f933326..8834dfe9d727 100644
--- a/arch/powerpc/kernel/vdso/Makefile
+++ b/arch/powerpc/kernel/vdso/Makefile
@@ -53,7 +53,7 @@ ldflags-$(CONFIG_LD_ORPHAN_WARN) += -Wl,--orphan-handling=$(CONFIG_LD_ORPHAN_WAR
 ldflags-y += $(filter-out $(CC_AUTO_VAR_INIT_ZERO_ENABLER) $(CC_FLAGS_FTRACE) -Wa$(comma)%, $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS))
 
 CC32FLAGS := -m32
-CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc
+CC32FLAGSREMOVE := -mcmodel=medium -mabi=elfv1 -mabi=elfv2 -mcall-aixdesc -mpcrel
 ifdef CONFIG_CC_IS_CLANG
 # This flag is supported by clang for 64-bit but not 32-bit so it will cause
 # an unused command line flag warning for this file.
-- 
cgit v1.2.3


From 33bc69cf6655cf60829a803a45275f11a74899e5 Mon Sep 17 00:00:00 2001
From: Narayana Murty N <nnmlinux@linux.ibm.com>
Date: Thu, 8 May 2025 02:29:28 -0400
Subject: powerpc/eeh: Fix missing PE bridge reconfiguration during VFIO EEH
 recovery

VFIO EEH recovery for PCI passthrough devices fails on PowerNV and pseries
platforms due to missing host-side PE bridge reconfiguration. In the
current implementation, eeh_pe_configure() only performs RTAS or OPAL-based
bridge reconfiguration for native host devices, but skips it entirely for
PEs managed through VFIO in guest passthrough scenarios.

This leads to incomplete EEH recovery when a PCI error affects a
passthrough device assigned to a QEMU/KVM guest. Although VFIO triggers the
EEH recovery flow through VFIO_EEH_PE_ENABLE ioctl, the platform-specific
bridge reconfiguration step is silently bypassed. As a result, the PE's
config space is not fully restored, causing subsequent config space access
failures or EEH freeze-on-access errors inside the guest.

This patch fixes the issue by ensuring that eeh_pe_configure() always
invokes the platform's configure_bridge() callback (e.g.,
pseries_eeh_phb_configure_bridge) even for VFIO-managed PEs. This ensures
that RTAS or OPAL calls to reconfigure the PE bridge are correctly issued
on the host side, restoring the PE's configuration space after an EEH
event.

This fix is essential for reliable EEH recovery in QEMU/KVM guests using
VFIO PCI passthrough on PowerNV and pseries systems.

Tested with:
- QEMU/KVM guest using VFIO passthrough (IBM Power9,(lpar)Power11 host)
- Injected EEH errors with pseries EEH errinjct tool on host, recovery
  verified on qemu guest.
- Verified successful config space access and CAP_EXP DevCtl restoration
  after recovery

Fixes: 212d16cdca2d ("powerpc/eeh: EEH support for VFIO PCI device")
Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com>
Reviewed-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Reviewed-by: Ganesh Goudar <ganeshgr@linux.ibm.com>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250508062928.146043-1-nnmlinux@linux.ibm.com
---
 arch/powerpc/kernel/eeh.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 83fe99861eb1..ca7f7bb2b478 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1509,6 +1509,8 @@ int eeh_pe_configure(struct eeh_pe *pe)
 	/* Invalid PE ? */
 	if (!pe)
 		return -ENODEV;
+	else
+		ret = eeh_ops->configure_bridge(pe);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 4e6d080acfda5344ccbc63afe778830e22be4be9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= <j.ne@posteo.net>
Date: Wed, 11 Jun 2025 23:07:49 +0200
Subject: powerpc/microwatt: Fix model property in device tree
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The standard property for the model name is called "model".

Signed-off-by: J. Neuschäfer <j.ne@posteo.net>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250611-microwatt-v2-1-80847bbc5f9c@posteo.net
---
 arch/powerpc/boot/dts/microwatt.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts
index c4e4d2a9b460..b7eac4e56019 100644
--- a/arch/powerpc/boot/dts/microwatt.dts
+++ b/arch/powerpc/boot/dts/microwatt.dts
@@ -4,7 +4,7 @@
 / {
 	#size-cells = <0x02>;
 	#address-cells = <0x02>;
-	model-name = "microwatt";
+	model = "microwatt";
 	compatible = "microwatt-soc";
 
 	aliases {
-- 
cgit v1.2.3


From e75cb6010838f61b9d63e921d1763a8ab177e38e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=2E=20Neusch=C3=A4fer?= <j.ne@posteo.net>
Date: Wed, 11 Jun 2025 21:01:01 +0200
Subject: powerpc: dts: mpc8315erdb: Add GPIO controller node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MPC8315E SoC and variants have a GPIO controller at IMMR + 0xc00.
This node was previously missing from the device tree.

Signed-off-by: J. Neuschäfer <j.ne@posteo.net>
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250611-mpc-gpio-v1-1-02d1f75336e2@posteo.net
---
 arch/powerpc/boot/dts/mpc8315erdb.dts | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch')

diff --git a/arch/powerpc/boot/dts/mpc8315erdb.dts b/arch/powerpc/boot/dts/mpc8315erdb.dts
index e09b37d7489d..a89cb3139ca8 100644
--- a/arch/powerpc/boot/dts/mpc8315erdb.dts
+++ b/arch/powerpc/boot/dts/mpc8315erdb.dts
@@ -6,6 +6,7 @@
  */
 
 /dts-v1/;
+#include <dt-bindings/interrupt-controller/irq.h>
 
 / {
 	compatible = "fsl,mpc8315erdb";
@@ -358,6 +359,15 @@
 			interrupt-parent = <&ipic>;
 			fsl,mpc8313-wakeup-timer = <&gtm1>;
 		};
+
+		gpio: gpio-controller@c00 {
+			compatible = "fsl,mpc8314-gpio";
+			reg = <0xc00 0x100>;
+			interrupts = <74 IRQ_TYPE_LEVEL_LOW>;
+			interrupt-parent = <&ipic>;
+			gpio-controller;
+			#gpio-cells = <2>;
+		};
 	};
 
 	pci0: pci@e0008500 {
-- 
cgit v1.2.3


From b0823d5fbacb1c551d793cbfe7af24e0d1fa45ed Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 12 Jun 2025 07:38:18 -0700
Subject: perf/x86/intel: Fix crash in icl_update_topdown_event()

The perf_fuzzer found a hard-lockup crash on a RaptorLake machine:

  Oops: general protection fault, maybe for address 0xffff89aeceab400: 0000
  CPU: 23 UID: 0 PID: 0 Comm: swapper/23
  Tainted: [W]=WARN
  Hardware name: Dell Inc. Precision 9660/0VJ762
  RIP: 0010:native_read_pmc+0x7/0x40
  Code: cc e8 8d a9 01 00 48 89 03 5b cd cc cc cc cc 0f 1f ...
  RSP: 000:fffb03100273de8 EFLAGS: 00010046
  ....
  Call Trace:
    <TASK>
    icl_update_topdown_event+0x165/0x190
    ? ktime_get+0x38/0xd0
    intel_pmu_read_event+0xf9/0x210
    __perf_event_read+0xf9/0x210

CPUs 16-23 are E-core CPUs that don't support the perf metrics feature.
The icl_update_topdown_event() should not be invoked on these CPUs.

It's a regression of commit:

  f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read")

The bug introduced by that commit is that the is_topdown_event() function
is mistakenly used to replace the is_topdown_count() call to check if the
topdown functions for the perf metrics feature should be invoked.

Fix it.

Fixes: f9bdf1f95339 ("perf/x86/intel: Avoid disable PMU if !cpuc->enabled in sample read")
Closes: https://lore.kernel.org/lkml/352f0709-f026-cd45-e60c-60dfd97f73f3@maine.edu/
Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Vince Weaver <vincent.weaver@maine.edu>
Cc: stable@vger.kernel.org # v6.15+
Link: https://lore.kernel.org/r/20250612143818.2889040-1-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 741b229f0718..c2fb729c270e 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2826,7 +2826,7 @@ static void intel_pmu_read_event(struct perf_event *event)
 		 * If the PEBS counters snapshotting is enabled,
 		 * the topdown event is available in PEBS records.
 		 */
-		if (is_topdown_event(event) && !is_pebs_counter_event_group(event))
+		if (is_topdown_count(event) && !is_pebs_counter_event_group(event))
 			static_call(intel_pmu_update_topdown_event)(event, NULL);
 		else
 			intel_pmu_drain_pebs_buffer();
-- 
cgit v1.2.3


From ab107276607af90b13a5994997e19b7b9731e251 Mon Sep 17 00:00:00 2001
From: Madhavan Srinivasan <maddy@linux.ibm.com>
Date: Sat, 17 May 2025 19:52:37 +0530
Subject: powerpc: Fix struct termio related ioctl macros

Since termio interface is now obsolete, include/uapi/asm/ioctls.h
has some constant macros referring to "struct termio", this caused
build failure at userspace.

In file included from /usr/include/asm/ioctl.h:12,
                 from /usr/include/asm/ioctls.h:5,
                 from tst-ioctls.c:3:
tst-ioctls.c: In function 'get_TCGETA':
tst-ioctls.c:12:10: error: invalid application of 'sizeof' to incomplete type 'struct termio'
   12 |   return TCGETA;
      |          ^~~~~~

Even though termios.h provides "struct termio", trying to juggle definitions around to
make it compile could introduce regressions. So better to open code it.

Reported-by: Tulio Magno <tuliom@ascii.art.br>
Suggested-by: Nicholas Piggin <npiggin@gmail.com>
Tested-by: Justin M. Forbes <jforbes@fedoraproject.org>
Reviewed-by: Michael Ellerman <mpe@ellerman.id.au>
Closes: https://lore.kernel.org/linuxppc-dev/8734dji5wl.fsf@ascii.art.br/
Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com>
Link: https://patch.msgid.link/20250517142237.156665-1-maddy@linux.ibm.com
---
 arch/powerpc/include/uapi/asm/ioctls.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/powerpc/include/uapi/asm/ioctls.h b/arch/powerpc/include/uapi/asm/ioctls.h
index 2c145da3b774..b5211e413829 100644
--- a/arch/powerpc/include/uapi/asm/ioctls.h
+++ b/arch/powerpc/include/uapi/asm/ioctls.h
@@ -23,10 +23,10 @@
 #define TCSETSW		_IOW('t', 21, struct termios)
 #define TCSETSF		_IOW('t', 22, struct termios)
 
-#define TCGETA		_IOR('t', 23, struct termio)
-#define TCSETA		_IOW('t', 24, struct termio)
-#define TCSETAW		_IOW('t', 25, struct termio)
-#define TCSETAF		_IOW('t', 28, struct termio)
+#define TCGETA		0x40147417 /* _IOR('t', 23, struct termio) */
+#define TCSETA		0x80147418 /* _IOW('t', 24, struct termio) */
+#define TCSETAW		0x80147419 /* _IOW('t', 25, struct termio) */
+#define TCSETAF		0x8014741c /* _IOW('t', 28, struct termio) */
 
 #define TCSBRK		_IO('t', 29)
 #define TCXONC		_IO('t', 30)
-- 
cgit v1.2.3


From 594902c986e269660302f09df9ec4bf1cf017b77 Mon Sep 17 00:00:00 2001
From: Qinyun Tan <qinyuntan@linux.alibaba.com>
Date: Sat, 31 May 2025 02:20:53 +0800
Subject: x86,fs/resctrl: Remove inappropriate references to cacheinfo in the
 resctrl subsystem

In the resctrl subsystem's Sub-NUMA Cluster (SNC) mode, the rdt_mon_domain
structure representing a NUMA node relies on the cacheinfo interface
(rdt_mon_domain::ci) to store L3 cache information (e.g., shared_cpu_map)
for monitoring. The L3 cache information of a SNC NUMA node determines
which domains are summed for the "top level" L3-scoped events.

rdt_mon_domain::ci is initialized using the first online CPU of a NUMA
node. When this CPU goes offline, its shared_cpu_map is cleared to contain
only the offline CPU itself. Subsequently, attempting to read counters
via smp_call_on_cpu(offline_cpu) fails (and error ignored), returning
zero values for "top-level events" without any error indication.

Replace the cacheinfo references in struct rdt_mon_domain and struct
rmid_read with the cacheinfo ID (a unique identifier for the L3 cache).

rdt_domain_hdr::cpu_mask contains the online CPUs associated with that
domain. When reading "top-level events", select a CPU from
rdt_domain_hdr::cpu_mask and utilize its L3 shared_cpu_map to determine
valid CPUs for reading RMID counter via the MSR interface.

Considering all CPUs associated with the L3 cache improves the chances
of picking a housekeeping CPU on which the counter reading work can be
queued, avoiding an unnecessary IPI.

Fixes: 328ea68874642 ("x86/resctrl: Prepare for new Sub-NUMA Cluster (SNC) monitor files")
Signed-off-by: Qinyun Tan <qinyuntan@linux.alibaba.com>
Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: https://lore.kernel.org/20250530182053.37502-2-qinyuntan@linux.alibaba.com
---
 arch/x86/kernel/cpu/resctrl/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 7109cbfcad4f..187d527ef73b 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -498,6 +498,7 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 	struct rdt_hw_mon_domain *hw_dom;
 	struct rdt_domain_hdr *hdr;
 	struct rdt_mon_domain *d;
+	struct cacheinfo *ci;
 	int err;
 
 	lockdep_assert_held(&domain_list_lock);
@@ -525,12 +526,13 @@ static void domain_add_cpu_mon(int cpu, struct rdt_resource *r)
 	d = &hw_dom->d_resctrl;
 	d->hdr.id = id;
 	d->hdr.type = RESCTRL_MON_DOMAIN;
-	d->ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
-	if (!d->ci) {
+	ci = get_cpu_cacheinfo_level(cpu, RESCTRL_L3_CACHE);
+	if (!ci) {
 		pr_warn_once("Can't find L3 cache for CPU:%d resource %s\n", cpu, r->name);
 		mon_domain_free(hw_dom);
 		return;
 	}
+	d->ci_id = ci->id;
 	cpumask_set_cpu(cpu, &d->hdr.cpu_mask);
 
 	arch_mon_domain_online(r, d);
-- 
cgit v1.2.3


From 9d4204a8106fe7dc80e3f2e440c8f2ba1ba47319 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sun, 15 Jun 2025 18:06:54 -0700
Subject: lib/crypto/poly1305: Fix arm64's poly1305_blocks_arch()

For some reason arm64's Poly1305 code got changed to ignore the padbit
argument.  As a result, the output is incorrect when the message length
is not a multiple of 16 (which is not reached with the standard
ChaCha20Poly1305, but bcachefs could reach this).  Fix this.

Fixes: a59e5468a921 ("crypto: arm64/poly1305 - Add block-only interface")
Reported-by: Kent Overstreet <kent.overstreet@linux.dev>
Tested-by: Kent Overstreet <kent.overstreet@linux.dev>
Link: https://lore.kernel.org/r/20250616010654.367302-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@google.com>
---
 arch/arm64/lib/crypto/poly1305-glue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/lib/crypto/poly1305-glue.c b/arch/arm64/lib/crypto/poly1305-glue.c
index 6a661cf04821..c9a74766785b 100644
--- a/arch/arm64/lib/crypto/poly1305-glue.c
+++ b/arch/arm64/lib/crypto/poly1305-glue.c
@@ -38,14 +38,14 @@ void poly1305_blocks_arch(struct poly1305_block_state *state, const u8 *src,
 			unsigned int todo = min_t(unsigned int, len, SZ_4K);
 
 			kernel_neon_begin();
-			poly1305_blocks_neon(state, src, todo, 1);
+			poly1305_blocks_neon(state, src, todo, padbit);
 			kernel_neon_end();
 
 			len -= todo;
 			src += todo;
 		} while (len);
 	} else
-		poly1305_blocks(state, src, len, 1);
+		poly1305_blocks(state, src, len, padbit);
 }
 EXPORT_SYMBOL_GPL(poly1305_blocks_arch);
 
-- 
cgit v1.2.3


From 6aba0cb5bba6141158d5449f2cf53187b7f755f9 Mon Sep 17 00:00:00 2001
From: Anup Patel <apatel@ventanamicro.com>
Date: Thu, 5 Jun 2025 11:44:46 +0530
Subject: RISC-V: KVM: Fix the size parameter check in SBI SFENCE calls

As-per the SBI specification, an SBI remote fence operation applies
to the entire address space if either:
1) start_addr and size are both 0
2) size is equal to 2^XLEN-1

>From the above, only #1 is checked by SBI SFENCE calls so fix the
size parameter check in SBI SFENCE calls to cover #2 as well.

Fixes: 13acfec2dbcc ("RISC-V: KVM: Add remote HFENCE functions based on VCPU requests")
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Link: https://lore.kernel.org/r/20250605061458.196003-2-apatel@ventanamicro.com
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index 5fbf3f94f1e8..9752d2ffff68 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -103,7 +103,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
 		kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT);
 		break;
 	case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
-		if (cp->a2 == 0 && cp->a3 == 0)
+		if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
 			kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask);
 		else
 			kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask,
@@ -111,7 +111,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
 		kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT);
 		break;
 	case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
-		if (cp->a2 == 0 && cp->a3 == 0)
+		if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL)
 			kvm_riscv_hfence_vvma_asid_all(vcpu->kvm,
 						       hbase, hmask, cp->a4);
 		else
-- 
cgit v1.2.3


From 2e7be162996640bbe3b6da694cc064c511b8a5d9 Mon Sep 17 00:00:00 2001
From: Anup Patel <apatel@ventanamicro.com>
Date: Thu, 5 Jun 2025 11:44:47 +0530
Subject: RISC-V: KVM: Don't treat SBI HFENCE calls as NOPs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The SBI specification clearly states that SBI HFENCE calls should
return SBI_ERR_NOT_SUPPORTED when one of the target hart doesn’t
support hypervisor extension (aka nested virtualization in-case
of KVM RISC-V).

Fixes: c7fa3c48de86 ("RISC-V: KVM: Treat SBI HFENCE calls as NOPs")
Reviewed-by: Atish Patra <atishp@rivosinc.com>
Signed-off-by: Anup Patel <apatel@ventanamicro.com>
Link: https://lore.kernel.org/r/20250605061458.196003-3-apatel@ventanamicro.com
Signed-off-by: Anup Patel <anup@brainfault.org>
---
 arch/riscv/kvm/vcpu_sbi_replace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index 9752d2ffff68..b17fad091bab 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -127,9 +127,9 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
 	case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
 		/*
 		 * Until nested virtualization is implemented, the
-		 * SBI HFENCE calls should be treated as NOPs
+		 * SBI HFENCE calls should return not supported
+		 * hence fallthrough.
 		 */
-		break;
 	default:
 		retdata->err_val = SBI_ERR_NOT_SUPPORTED;
 	}
-- 
cgit v1.2.3


From 94a17f2dc90bc7eae36c0f478515d4bd1c23e877 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Tue, 10 Jun 2025 15:24:20 -0700
Subject: x86/mm: Disable INVLPGB when PTI is enabled

PTI uses separate ASIDs (aka. PCIDs) for kernel and user address
spaces. When the kernel needs to flush the user address space, it
just sets a bit in a bitmap and then flushes the entire PCID on
the next switch to userspace.

This bitmap is a single 'unsigned long' which is plenty for all 6
dynamic ASIDs. But, unfortunately, the INVLPGB support brings along a
bunch more user ASIDs, as many as ~2k more. The bitmap can't address
that many.

Fortunately, the bitmap is only needed for PTI and all the CPUs
with INVLPGB are AMD CPUs that aren't vulnerable to Meltdown and
don't need PTI. The only way someone can run into an issue in
practice is by booting with pti=on on a newer AMD CPU.

Disable INVLPGB if PTI is enabled. Avoid overrunning the small
bitmap.

Note: this will be fixed up properly by making the bitmap bigger.
For now, just avoid the mostly theoretical bug.

Fixes: 4afeb0ed1753 ("x86/mm: Enable broadcast TLB invalidation for multi-threaded processes")
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Rik van Riel <riel@surriel.com>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20250610222420.E8CBF472%40davehans-spike.ostc.intel.com
---
 arch/x86/mm/pti.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 190299834011..c0c40b67524e 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -98,6 +98,11 @@ void __init pti_check_boottime_disable(void)
 		return;
 
 	setup_force_cpu_cap(X86_FEATURE_PTI);
+
+	if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) {
+		pr_debug("PTI enabled, disabling INVLPGB\n");
+		setup_clear_cpu_cap(X86_FEATURE_INVLPGB);
+	}
 }
 
 static int __init pti_parse_cmdline(char *arg)
-- 
cgit v1.2.3


From 3c902383f2da91cba3821b73aa6edd49f4db6023 Mon Sep 17 00:00:00 2001
From: Lukas Bulwahn <lukas.bulwahn@redhat.com>
Date: Mon, 16 Jun 2025 12:04:32 +0200
Subject: x86/its: Fix an ifdef typo in its_alloc()

Commit a82b26451de1 ("x86/its: explicitly manage permissions for ITS
pages") reworks its_alloc() and introduces a typo in an ifdef
conditional, referring to CONFIG_MODULE instead of CONFIG_MODULES.

Fix this typo in its_alloc().

Fixes: a82b26451de1 ("x86/its: explicitly manage permissions for ITS pages")
Signed-off-by: Lukas Bulwahn <lukas.bulwahn@redhat.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/all/20250616100432.22941-1-lukas.bulwahn%40redhat.com
---
 arch/x86/kernel/alternative.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6455f7f751b3..9ae80fa904a2 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -228,7 +228,7 @@ static void *its_alloc(void)
 	struct its_array *pages = &its_pages;
 	void *page;
 
-#ifdef CONFIG_MODULE
+#ifdef CONFIG_MODULES
 	if (its_mod)
 		pages = &its_mod->arch.its_pages;
 #endif
-- 
cgit v1.2.3


From cb6075bc62dc6a9cd7ab3572758685fdf78e3e20 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@surriel.com>
Date: Fri, 6 Jun 2025 13:10:34 -0400
Subject: x86/mm: Fix early boot use of INVPLGB

The INVLPGB instruction has limits on how many pages it can invalidate
at once. That limit is enumerated in CPUID, read by the kernel, and
stored in 'invpgb_count_max'. Ranged invalidation, like
invlpgb_kernel_range_flush() break up their invalidations so
that they do not exceed the limit.

However, early boot code currently attempts to do ranged
invalidation before populating 'invlpgb_count_max'. There is a
for loop which is basically:

	for (...; addr < end; addr += invlpgb_count_max*PAGE_SIZE)

If invlpgb_kernel_range_flush is called before the kernel has read
the value of invlpgb_count_max from the hardware, the normally
bounded loop can become an infinite loop if invlpgb_count_max is
initialized to zero.

Fix that issue by initializing invlpgb_count_max to 1.

This way INVPLGB at early boot time will be a little bit slower
than normal (with initialized invplgb_count_max), and not an
instant hang at bootup time.

Fixes: b7aa05cbdc52 ("x86/mm: Add INVLPGB support code")
Signed-off-by: Rik van Riel <riel@surriel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lore.kernel.org/all/20250606171112.4013261-3-riel%40surriel.com
---
 arch/x86/kernel/cpu/amd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 93da466dfe2c..b2ad8d13211a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -31,7 +31,7 @@
 
 #include "cpu.h"
 
-u16 invlpgb_count_max __ro_after_init;
+u16 invlpgb_count_max __ro_after_init = 1;
 
 static inline int rdmsrq_amd_safe(unsigned msr, u64 *p)
 {
-- 
cgit v1.2.3


From 2aebf5ee43bf0ed225a09a30cf515d9f2813b759 Mon Sep 17 00:00:00 2001
From: "Masami Hiramatsu (Google)" <mhiramat@kernel.org>
Date: Wed, 18 Jun 2025 09:05:23 +0900
Subject: x86/alternatives: Fix int3 handling failure from broken text_poke
 array

Since smp_text_poke_single() does not expect there is another
text_poke request is queued, it can make text_poke_array not
sorted or cause a buffer overflow on the text_poke_array.vec[].
This will cause an Oops in int3 because of bsearch failing;

   CPU 0                        CPU 1                      CPU 2
   -----                        -----                      -----

 smp_text_poke_batch_add()

			    smp_text_poke_single() <<-- Adds out of order

							<int3>
                                                	[Fails o find address
                                                        in text_poke_array ]
                                                        OOPS!

Or unhandled page fault because of a buffer overflow;

   CPU 0                        CPU 1
   -----                        -----

 smp_text_poke_batch_add() <<+
 ...                         |
 smp_text_poke_batch_add() <<-- Adds TEXT_POKE_ARRAY_MAX times.

			     smp_text_poke_single() {
			     	__smp_text_poke_batch_add() <<-- Adds entry at
								TEXT_POKE_ARRAY_MAX + 1

                		smp_text_poke_batch_finish()
                        	  [Unhandled page fault because
				   text_poke_array.nr_entries is
				   overwritten]
				   BUG!
			     }

Use smp_text_poke_batch_add() instead of __smp_text_poke_batch_add()
so that it correctly flush the queue if needed.

Closes: https://lore.kernel.org/all/CA+G9fYsLu0roY3DV=tKyqP7FEKbOEETRvTDhnpPxJGbA=Cg+4w@mail.gmail.com/
Fixes: c8976ade0c1b ("x86/alternatives: Simplify smp_text_poke_single() by using tp_vec and existing APIs")
Reported-by: Linux Kernel Functional Testing <lkft@linaro.org>
Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Linux Kernel Functional Testing <lkft@linaro.org>
Link: https://lkml.kernel.org/r/\ 175020512308.3582717.13631440385506146631.stgit@mhiramat.tok.corp.google.com
---
 arch/x86/kernel/alternative.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9ae80fa904a2..ea1d984166cd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -3138,6 +3138,6 @@ void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, c
  */
 void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate)
 {
-	__smp_text_poke_batch_add(addr, opcode, len, emulate);
+	smp_text_poke_batch_add(addr, opcode, len, emulate);
 	smp_text_poke_batch_finish();
 }
-- 
cgit v1.2.3


From 8a8ff069c7ad9a359c54683329883e2432cff191 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sun, 15 Jun 2025 16:11:38 +0100
Subject: KVM: arm64: nv: Fix tracking of shadow list registers

Wei-Lin reports that the tracking of shadow list registers is
majorly broken when resync'ing the L2 state after a run, as
we confuse the guest's LR index with the host's, potentially
losing the interrupt state.

While this could be fixed by adding yet another side index to
track it (Wei-Lin's fix), it may be better to refactor this
code to avoid having a side index altogether, limiting the
risk to introduce this class of bugs.

A key observation is that the shadow index is always the number
of bits in the lr_map bitmap. With that, the parallel indexing
scheme can be completely dropped.

While doing this, introduce a couple of helpers that abstract
the index conversion and some of the LR repainting, making the
whole exercise much simpler.

Reported-by: Wei-Lin Chang <r09922117@csie.ntu.edu.tw>
Reviewed-by: Wei-Lin Chang <r09922117@csie.ntu.edu.tw>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20250614145721.2504524-1-r09922117@csie.ntu.edu.tw
Link: https://lore.kernel.org/r/86qzzkc5xa.wl-maz@kernel.org
---
 arch/arm64/kvm/vgic/vgic-v3-nested.c | 81 +++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 39 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index d22a8ad7bcc5..a50fb7e6841f 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -36,6 +36,11 @@ struct shadow_if {
 
 static DEFINE_PER_CPU(struct shadow_if, shadow_if);
 
+static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
+{
+	return hweight16(shadow_if->lr_map & (BIT(idx) - 1));
+}
+
 /*
  * Nesting GICv3 support
  *
@@ -209,6 +214,29 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
 	return reg;
 }
 
+static u64 translate_lr_pintid(struct kvm_vcpu *vcpu, u64 lr)
+{
+	struct vgic_irq *irq;
+
+	if (!(lr & ICH_LR_HW))
+		return lr;
+
+	/* We have the HW bit set, check for validity of pINTID */
+	irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+	/* If there was no real mapping, nuke the HW bit */
+	if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI)
+		lr &= ~ICH_LR_HW;
+
+	/* Translate the virtual mapping to the real one, even if invalid */
+	if (irq) {
+		lr &= ~ICH_LR_PHYS_ID_MASK;
+		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+		vgic_put_irq(vcpu->kvm, irq);
+	}
+
+	return lr;
+}
+
 /*
  * For LRs which have HW bit set such as timer interrupts, we modify them to
  * have the host hardware interrupt number instead of the virtual one programmed
@@ -217,58 +245,37 @@ u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
 static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
 				     struct vgic_v3_cpu_if *s_cpu_if)
 {
-	unsigned long lr_map = 0;
-	int index = 0;
+	struct shadow_if *shadow_if;
+
+	shadow_if = container_of(s_cpu_if, struct shadow_if, cpuif);
+	shadow_if->lr_map = 0;
 
 	for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
 		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
-		struct vgic_irq *irq;
 
 		if (!(lr & ICH_LR_STATE))
-			lr = 0;
-
-		if (!(lr & ICH_LR_HW))
-			goto next;
-
-		/* We have the HW bit set, check for validity of pINTID */
-		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
-		if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) {
-			/* There was no real mapping, so nuke the HW bit */
-			lr &= ~ICH_LR_HW;
-			if (irq)
-				vgic_put_irq(vcpu->kvm, irq);
-			goto next;
-		}
-
-		/* Translate the virtual mapping to the real one */
-		lr &= ~ICH_LR_PHYS_ID_MASK;
-		lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);
+			continue;
 
-		vgic_put_irq(vcpu->kvm, irq);
+		lr = translate_lr_pintid(vcpu, lr);
 
-next:
-		s_cpu_if->vgic_lr[index] = lr;
-		if (lr) {
-			lr_map |= BIT(i);
-			index++;
-		}
+		s_cpu_if->vgic_lr[hweight16(shadow_if->lr_map)] = lr;
+		shadow_if->lr_map |= BIT(i);
 	}
 
-	container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
-	s_cpu_if->used_lrs = index;
+	s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
 }
 
 void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
 {
 	struct shadow_if *shadow_if = get_shadow_if();
-	int i, index = 0;
+	int i;
 
 	for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
 		u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
 		struct vgic_irq *irq;
 
 		if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
-			goto next;
+			continue;
 
 		/*
 		 * If we had a HW lr programmed by the guest hypervisor, we
@@ -277,15 +284,13 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
 		 */
 		irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
 		if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
-			goto next;
+			continue;
 
-		lr = __gic_v3_get_lr(index);
+		lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
 		if (!(lr & ICH_LR_STATE))
 			irq->active = false;
 
 		vgic_put_irq(vcpu->kvm, irq);
-	next:
-		index++;
 	}
 }
 
@@ -368,13 +373,11 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
 		val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
 
 		val &= ~ICH_LR_STATE;
-		val |= s_cpu_if->vgic_lr[i] & ICH_LR_STATE;
+		val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
 
 		__vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
-		s_cpu_if->vgic_lr[i] = 0;
 	}
 
-	shadow_if->lr_map = 0;
 	vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
 }
 
-- 
cgit v1.2.3


From 1fbe6861a6d9a942fb8ab8677ddf1ecb86b1af60 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Wed, 11 Jun 2025 15:45:04 -0700
Subject: KVM: arm64: Explicitly treat routing entry type changes as changes

Explicitly treat type differences as GSI routing changes, as comparing MSI
data between two entries could get a false negative, e.g. if userspace
changed the type but left the type-specific data as-

Note, the same bug was fixed in x86 by commit bcda70c56f3e ("KVM: x86:
Explicitly treat routing entry type changes as changes").

Fixes: 4bf3693d36af ("KVM: arm64: Unmap vLPIs affected by changes to GSI routing information")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
Link: https://lore.kernel.org/r/20250611224604.313496-3-seanjc@google.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/arm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index de2b4e9c9f9f..38a91bb5d4c7 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -2764,7 +2764,8 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
 				  struct kvm_kernel_irq_routing_entry *new)
 {
-	if (new->type != KVM_IRQ_ROUTING_MSI)
+	if (old->type != KVM_IRQ_ROUTING_MSI ||
+	    new->type != KVM_IRQ_ROUTING_MSI)
 		return true;
 
 	return memcmp(&old->msi, &new->msi, sizeof(new->msi));
-- 
cgit v1.2.3


From cade3d57e456e69f67aa9894bf89dc8678796bb7 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:12 +0100
Subject: KVM: arm64: VHE: Synchronize restore of host debug registers

When KVM runs in non-protected VHE mode, there's no context
synchronization event between __debug_switch_to_host() restoring the
host debug registers and __kvm_vcpu_run() unmasking debug exceptions.
Due to this, it's theoretically possible for the host to take an
unexpected debug exception due to the stale guest configuration.

This cannot happen in NVHE/HVHE mode as debug exceptions are masked in
the hyp code, and the exception return to the host will provide the
necessary context synchronization before debug exceptions can be taken.

For now, avoid the problem by adding an ISB after VHE hyp code restores
the host debug registers.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20250617133718.4014181-2-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 502a5b73ee70..73881e1dc267 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -167,6 +167,9 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
 
 	__debug_save_state(guest_dbg, guest_ctxt);
 	__debug_restore_state(host_dbg, host_ctxt);
+
+	if (has_vhe())
+		isb();
 }
 
 #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */
-- 
cgit v1.2.3


From 257d0aa8e2502754bc758faceceb6ff59318af60 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:13 +0100
Subject: KVM: arm64: VHE: Synchronize CPTR trap deactivation

Currently there is no ISB between __deactivate_cptr_traps() disabling
traps that affect EL2 and fpsimd_lazy_switch_to_host() manipulating
registers potentially affected by CPTR traps.

When NV is not in use, this is safe because the relevant registers are
only accessed when guest_owns_fp_regs() && vcpu_has_sve(vcpu), and this
also implies that SVE traps affecting EL2 have been deactivated prior to
__guest_entry().

When NV is in use, a guest hypervisor may have configured SVE traps for
a nested context, and so it is necessary to have an ISB between
__deactivate_cptr_traps() and fpsimd_lazy_switch_to_host().

Due to the current lack of an ISB, when a guest hypervisor enables SVE
traps in CPTR, the host can take an unexpected SVE trap from within
fpsimd_lazy_switch_to_host(), e.g.

| Unhandled 64-bit el1h sync exception on CPU1, ESR 0x0000000066000000 -- SVE
| CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT
| Hardware name: FVP Base RevC (DT)
| pstate: 604023c9 (nZCv DAIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
| pc : __kvm_vcpu_run+0x6f4/0x844
| lr : __kvm_vcpu_run+0x150/0x844
| sp : ffff800083903a60
| x29: ffff800083903a90 x28: ffff000801f4a300 x27: 0000000000000000
| x26: 0000000000000000 x25: ffff000801f90000 x24: ffff000801f900f0
| x23: ffff800081ff7720 x22: 0002433c807d623f x21: ffff000801f90000
| x20: ffff00087f730730 x19: 0000000000000000 x18: 0000000000000000
| x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000
| x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
| x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000
| x8 : 0000000000000000 x7 : 0000000000000000 x6 : ffff000801f90d70
| x5 : 0000000000001000 x4 : ffff8007fd739000 x3 : ffff000801f90000
| x2 : 0000000000000000 x1 : 00000000000003cc x0 : ffff800082f9d000
| Kernel panic - not syncing: Unhandled exception
| CPU: 1 UID: 0 PID: 164 Comm: kvm-vcpu-0 Not tainted 6.15.0-rc4-00138-ga05e0f012c05 #3 PREEMPT
| Hardware name: FVP Base RevC (DT)
| Call trace:
|  show_stack+0x18/0x24 (C)
|  dump_stack_lvl+0x60/0x80
|  dump_stack+0x18/0x24
|  panic+0x168/0x360
|  __panic_unhandled+0x68/0x74
|  el1h_64_irq_handler+0x0/0x24
|  el1h_64_sync+0x6c/0x70
|  __kvm_vcpu_run+0x6f4/0x844 (P)
|  kvm_arm_vcpu_enter_exit+0x64/0xa0
|  kvm_arch_vcpu_ioctl_run+0x21c/0x870
|  kvm_vcpu_ioctl+0x1a8/0x9d0
|  __arm64_sys_ioctl+0xb4/0xf4
|  invoke_syscall+0x48/0x104
|  el0_svc_common.constprop.0+0x40/0xe0
|  do_el0_svc+0x1c/0x28
|  el0_svc+0x30/0xcc
|  el0t_64_sync_handler+0x10c/0x138
|  el0t_64_sync+0x198/0x19c
| SMP: stopping secondary CPUs
| Kernel Offset: disabled
| CPU features: 0x0000,000002c0,02df4fb9,97ee773f
| Memory Limit: none
| ---[ end Kernel panic - not syncing: Unhandled exception ]---

Fix this by adding an ISB between __deactivate_traps() and
fpsimd_lazy_switch_to_host().

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250617133718.4014181-3-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/vhe/switch.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch')

diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 09df2b42bc1b..20df44b49ceb 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -667,6 +667,9 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 
 	__deactivate_traps(vcpu);
 
+	/* Ensure CPTR trap deactivation has taken effect */
+	isb();
+
 	fpsimd_lazy_switch_to_host(vcpu);
 
 	sysreg_restore_host_state_vhe(host_ctxt);
-- 
cgit v1.2.3


From e62dd507844fa47f0fdc29f3be5a90a83f297820 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:14 +0100
Subject: KVM: arm64: Reorganise CPTR trap manipulation

The NVHE/HVHE and VHE modes have separate implementations of
__activate_cptr_traps() and __deactivate_cptr_traps() in their
respective switch.c files. There's some duplication of logic, and it's
not currently possible to reuse this logic elsewhere.

Move the logic into the common switch.h header so that it can be reused,
and de-duplicate the common logic.

This rework changes the way SVE traps are deactivated in VHE mode,
aligning it with NVHE/HVHE modes:

* Before this patch, VHE's __deactivate_cptr_traps() would
  unconditionally enable SVE for host EL2 (but not EL0), regardless of
  whether the ARM64_SVE cpucap was set.

* After this patch, VHE's __deactivate_cptr_traps() will take the
  ARM64_SVE cpucap into account. When ARM64_SVE is not set, SVE will be
  trapped from EL2 and below.

The old and new behaviour are both benign:

* When ARM64_SVE is not set, the host will not touch SVE state, and will
  not reconfigure SVE traps. Host EL0 access to SVE will be trapped as
  expected.

* When ARM64_SVE is set, the host will configure EL0 SVE traps before
  returning to EL0 as part of reloading the EL0 FPSIMD/SVE/SME state.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250617133718.4014181-4-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/include/hyp/switch.h | 130 ++++++++++++++++++++++++++++++++
 arch/arm64/kvm/hyp/nvhe/switch.c        |  59 ---------------
 arch/arm64/kvm/hyp/vhe/switch.c         |  81 --------------------
 3 files changed, 130 insertions(+), 140 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 76dfda116e56..8a77fcccbcf6 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -65,6 +65,136 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
 	}
 }
 
+static inline void __activate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+	u64 val = CPTR_NVHE_EL2_RES1 | CPTR_EL2_TAM | CPTR_EL2_TTA;
+
+	/*
+	 * Always trap SME since it's not supported in KVM.
+	 * TSM is RES1 if SME isn't implemented.
+	 */
+	val |= CPTR_EL2_TSM;
+
+	if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
+		val |= CPTR_EL2_TZ;
+
+	if (!guest_owns_fp_regs())
+		val |= CPTR_EL2_TFP;
+
+	write_sysreg(val, cptr_el2);
+}
+
+static inline void __activate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
+	 * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
+	 * except for some missing controls, such as TAM.
+	 * In this case, CPTR_EL2.TAM has the same position with or without
+	 * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
+	 * shift value for trapping the AMU accesses.
+	 */
+	u64 val = CPTR_EL2_TAM | CPACR_EL1_TTA;
+	u64 cptr;
+
+	if (guest_owns_fp_regs()) {
+		val |= CPACR_EL1_FPEN;
+		if (vcpu_has_sve(vcpu))
+			val |= CPACR_EL1_ZEN;
+	}
+
+	if (!vcpu_has_nv(vcpu))
+		goto write;
+
+	/*
+	 * The architecture is a bit crap (what a surprise): an EL2 guest
+	 * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
+	 * as they are RES0 in the guest's view. To work around it, trap the
+	 * sucker using the very same bit it can't set...
+	 */
+	if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
+		val |= CPTR_EL2_TCPAC;
+
+	/*
+	 * Layer the guest hypervisor's trap configuration on top of our own if
+	 * we're in a nested context.
+	 */
+	if (is_hyp_ctxt(vcpu))
+		goto write;
+
+	cptr = vcpu_sanitised_cptr_el2(vcpu);
+
+	/*
+	 * Pay attention, there's some interesting detail here.
+	 *
+	 * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
+	 * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
+	 *
+	 *  - CPTR_EL2.xEN = x0, traps are enabled
+	 *  - CPTR_EL2.xEN = x1, traps are disabled
+	 *
+	 * In other words, bit[0] determines if guest accesses trap or not. In
+	 * the interest of simplicity, clear the entire field if the guest
+	 * hypervisor has traps enabled to dispel any illusion of something more
+	 * complicated taking place.
+	 */
+	if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
+		val &= ~CPACR_EL1_FPEN;
+	if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
+		val &= ~CPACR_EL1_ZEN;
+
+	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
+		val |= cptr & CPACR_EL1_E0POE;
+
+	val |= cptr & CPTR_EL2_TCPAC;
+
+write:
+	write_sysreg(val, cpacr_el1);
+}
+
+static inline void __activate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+	if (!guest_owns_fp_regs())
+		__activate_traps_fpsimd32(vcpu);
+
+	if (has_vhe() || has_hvhe())
+		__activate_cptr_traps_vhe(vcpu);
+	else
+		__activate_cptr_traps_nvhe(vcpu);
+}
+
+static inline void __deactivate_cptr_traps_nvhe(struct kvm_vcpu *vcpu)
+{
+	u64 val = CPTR_NVHE_EL2_RES1;
+
+	if (!cpus_have_final_cap(ARM64_SVE))
+		val |= CPTR_EL2_TZ;
+	if (!cpus_have_final_cap(ARM64_SME))
+		val |= CPTR_EL2_TSM;
+
+	write_sysreg(val, cptr_el2);
+}
+
+static inline void __deactivate_cptr_traps_vhe(struct kvm_vcpu *vcpu)
+{
+	u64 val = CPACR_EL1_FPEN;
+
+	if (cpus_have_final_cap(ARM64_SVE))
+		val |= CPACR_EL1_ZEN;
+	if (cpus_have_final_cap(ARM64_SME))
+		val |= CPACR_EL1_SMEN;
+
+	write_sysreg(val, cpacr_el1);
+}
+
+static inline void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
+{
+	if (has_vhe() || has_hvhe())
+		__deactivate_cptr_traps_vhe(vcpu);
+	else
+		__deactivate_cptr_traps_nvhe(vcpu);
+}
+
 #define reg_to_fgt_masks(reg)						\
 	({								\
 		struct fgt_masks *m;					\
diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c
index 73affe1333a4..0e752b515d0f 100644
--- a/arch/arm64/kvm/hyp/nvhe/switch.c
+++ b/arch/arm64/kvm/hyp/nvhe/switch.c
@@ -47,65 +47,6 @@ struct fgt_masks hdfgwtr2_masks;
 
 extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc);
 
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
-	u64 val = CPTR_EL2_TAM;	/* Same bit irrespective of E2H */
-
-	if (!guest_owns_fp_regs())
-		__activate_traps_fpsimd32(vcpu);
-
-	if (has_hvhe()) {
-		val |= CPACR_EL1_TTA;
-
-		if (guest_owns_fp_regs()) {
-			val |= CPACR_EL1_FPEN;
-			if (vcpu_has_sve(vcpu))
-				val |= CPACR_EL1_ZEN;
-		}
-
-		write_sysreg(val, cpacr_el1);
-	} else {
-		val |= CPTR_EL2_TTA | CPTR_NVHE_EL2_RES1;
-
-		/*
-		 * Always trap SME since it's not supported in KVM.
-		 * TSM is RES1 if SME isn't implemented.
-		 */
-		val |= CPTR_EL2_TSM;
-
-		if (!vcpu_has_sve(vcpu) || !guest_owns_fp_regs())
-			val |= CPTR_EL2_TZ;
-
-		if (!guest_owns_fp_regs())
-			val |= CPTR_EL2_TFP;
-
-		write_sysreg(val, cptr_el2);
-	}
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
-	if (has_hvhe()) {
-		u64 val = CPACR_EL1_FPEN;
-
-		if (cpus_have_final_cap(ARM64_SVE))
-			val |= CPACR_EL1_ZEN;
-		if (cpus_have_final_cap(ARM64_SME))
-			val |= CPACR_EL1_SMEN;
-
-		write_sysreg(val, cpacr_el1);
-	} else {
-		u64 val = CPTR_NVHE_EL2_RES1;
-
-		if (!cpus_have_final_cap(ARM64_SVE))
-			val |= CPTR_EL2_TZ;
-		if (!cpus_have_final_cap(ARM64_SME))
-			val |= CPTR_EL2_TSM;
-
-		write_sysreg(val, cptr_el2);
-	}
-}
-
 static void __activate_traps(struct kvm_vcpu *vcpu)
 {
 	___activate_traps(vcpu, vcpu->arch.hcr_el2);
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 20df44b49ceb..32b4814eb2b7 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -90,87 +90,6 @@ static u64 __compute_hcr(struct kvm_vcpu *vcpu)
 	return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
 }
 
-static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
-{
-	u64 cptr;
-
-	/*
-	 * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
-	 * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
-	 * except for some missing controls, such as TAM.
-	 * In this case, CPTR_EL2.TAM has the same position with or without
-	 * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
-	 * shift value for trapping the AMU accesses.
-	 */
-	u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM;
-
-	if (guest_owns_fp_regs()) {
-		val |= CPACR_EL1_FPEN;
-		if (vcpu_has_sve(vcpu))
-			val |= CPACR_EL1_ZEN;
-	} else {
-		__activate_traps_fpsimd32(vcpu);
-	}
-
-	if (!vcpu_has_nv(vcpu))
-		goto write;
-
-	/*
-	 * The architecture is a bit crap (what a surprise): an EL2 guest
-	 * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
-	 * as they are RES0 in the guest's view. To work around it, trap the
-	 * sucker using the very same bit it can't set...
-	 */
-	if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
-		val |= CPTR_EL2_TCPAC;
-
-	/*
-	 * Layer the guest hypervisor's trap configuration on top of our own if
-	 * we're in a nested context.
-	 */
-	if (is_hyp_ctxt(vcpu))
-		goto write;
-
-	cptr = vcpu_sanitised_cptr_el2(vcpu);
-
-	/*
-	 * Pay attention, there's some interesting detail here.
-	 *
-	 * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
-	 * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
-	 *
-	 *  - CPTR_EL2.xEN = x0, traps are enabled
-	 *  - CPTR_EL2.xEN = x1, traps are disabled
-	 *
-	 * In other words, bit[0] determines if guest accesses trap or not. In
-	 * the interest of simplicity, clear the entire field if the guest
-	 * hypervisor has traps enabled to dispel any illusion of something more
-	 * complicated taking place.
-	 */
-	if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
-		val &= ~CPACR_EL1_FPEN;
-	if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
-		val &= ~CPACR_EL1_ZEN;
-
-	if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
-		val |= cptr & CPACR_EL1_E0POE;
-
-	val |= cptr & CPTR_EL2_TCPAC;
-
-write:
-	write_sysreg(val, cpacr_el1);
-}
-
-static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
-{
-	u64 val = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN;
-
-	if (cpus_have_final_cap(ARM64_SME))
-		val |= CPACR_EL1_SMEN_EL1EN;
-
-	write_sysreg(val, cpacr_el1);
-}
-
 static void __activate_traps(struct kvm_vcpu *vcpu)
 {
 	u64 val;
-- 
cgit v1.2.3


From 59e6e101a6fa542a365dd5858affd18ba3e84cb8 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:15 +0100
Subject: KVM: arm64: Remove ad-hoc CPTR manipulation from fpsimd_sve_sync()

There's no need for fpsimd_sve_sync() to write to CPTR/CPACR. All
relevant traps are always disabled earlier within __kvm_vcpu_run(), when
__deactivate_cptr_traps() configures CPTR/CPACR.

With irrelevant details elided, the flow is:

handle___kvm_vcpu_run(...)
{
	flush_hyp_vcpu(...) {
		fpsimd_sve_flush(...);
	}

	__kvm_vcpu_run(...) {
		__activate_traps(...) {
			__activate_cptr_traps(...);
		}

		do {
			__guest_enter(...);
		} while (...);

		__deactivate_traps(....) {
			__deactivate_cptr_traps(...);
		}
	}

	sync_hyp_vcpu(...) {
		fpsimd_sve_sync(...);
	}
}

Remove the unnecessary write to CPTR/CPACR. An ISB is still necessary,
so a comment is added to describe this requirement.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250617133718.4014181-5-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/nvhe/hyp-main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index e9198e56e784..3206b2c07f82 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -69,7 +69,10 @@ static void fpsimd_sve_sync(struct kvm_vcpu *vcpu)
 	if (!guest_owns_fp_regs())
 		return;
 
-	cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
+	/*
+	 * Traps have been disabled by __deactivate_cptr_traps(), but there
+	 * hasn't necessarily been a context synchronization event yet.
+	 */
 	isb();
 
 	if (vcpu_has_sve(vcpu))
-- 
cgit v1.2.3


From 186b58bacd74d9b7892869f7c7d20cf865a3c237 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:16 +0100
Subject: KVM: arm64: Remove ad-hoc CPTR manipulation from
 kvm_hyp_handle_fpsimd()

The hyp code FPSIMD/SVE/SME trap handling logic has some rather messy
open-coded manipulation of CPTR/CPACR. This is benign for non-nested
guests, but broken for nested guests, as the guest hypervisor's CPTR
configuration is not taken into account.

Consider the case where L0 provides FPSIMD+SVE to an L1 guest
hypervisor, and the L1 guest hypervisor only provides FPSIMD to an L2
guest (with L1 configuring CPTR/CPACR to trap SVE usage from L2). If the
L2 guest triggers an FPSIMD trap to the L0 hypervisor,
kvm_hyp_handle_fpsimd() will see that the vCPU supports FPSIMD+SVE, and
will configure CPTR/CPACR to NOT trap FPSIMD+SVE before returning to the
L2 guest. Consequently the L2 guest would be able to manipulate SVE
state even though the L1 hypervisor had configured CPTR/CPACR to forbid
this.

Clean this up, and fix the nested virt issue by always using
__deactivate_cptr_traps() and __activate_cptr_traps() to manage the CPTR
traps. This removes the need for the ad-hoc fixup in
kvm_hyp_save_fpsimd_host(), and ensures that any guest hypervisor
configuration of CPTR/CPACR is taken into account.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250617133718.4014181-6-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/kvm/hyp/include/hyp/switch.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h
index 8a77fcccbcf6..2ad57b117385 100644
--- a/arch/arm64/kvm/hyp/include/hyp/switch.h
+++ b/arch/arm64/kvm/hyp/include/hyp/switch.h
@@ -616,11 +616,6 @@ static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
 	 */
 	if (system_supports_sve()) {
 		__hyp_sve_save_host();
-
-		/* Re-enable SVE traps if not supported for the guest vcpu. */
-		if (!vcpu_has_sve(vcpu))
-			cpacr_clear_set(CPACR_EL1_ZEN, 0);
-
 	} else {
 		__fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
 	}
@@ -671,10 +666,7 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
 	/* Valid trap.  Switch the context: */
 
 	/* First disable enough traps to allow us to update the registers */
-	if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve()))
-		cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
-	else
-		cpacr_clear_set(0, CPACR_EL1_FPEN);
+	__deactivate_cptr_traps(vcpu);
 	isb();
 
 	/* Write out the host state if it's in the registers */
@@ -696,6 +688,13 @@ static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
 
 	*host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;
 
+	/*
+	 * Re-enable traps necessary for the current state of the guest, e.g.
+	 * those enabled by a guest hypervisor. The ERET to the guest will
+	 * provide the necessary context synchronization.
+	 */
+	__activate_cptr_traps(vcpu);
+
 	return true;
 }
 
-- 
cgit v1.2.3


From 3a300a33e4063fb44c7887bec3aecd2fd6966df8 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:17 +0100
Subject: KVM: arm64: Remove cpacr_clear_set()

We no longer use cpacr_clear_set().

Remove cpacr_clear_set() and its helper functions.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250617133718.4014181-7-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_emulate.h | 62 ------------------------------------
 1 file changed, 62 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
index bd020fc28aa9..0720898f563e 100644
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@@ -561,68 +561,6 @@ static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
 		vcpu_set_flag((v), e);					\
 	} while (0)
 
-#define __build_check_all_or_none(r, bits)				\
-	BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits))
-
-#define __cpacr_to_cptr_clr(clr, set)					\
-	({								\
-		u64 cptr = 0;						\
-									\
-		if ((set) & CPACR_EL1_FPEN)				\
-			cptr |= CPTR_EL2_TFP;				\
-		if ((set) & CPACR_EL1_ZEN)				\
-			cptr |= CPTR_EL2_TZ;				\
-		if ((set) & CPACR_EL1_SMEN)				\
-			cptr |= CPTR_EL2_TSM;				\
-		if ((clr) & CPACR_EL1_TTA)				\
-			cptr |= CPTR_EL2_TTA;				\
-		if ((clr) & CPTR_EL2_TAM)				\
-			cptr |= CPTR_EL2_TAM;				\
-		if ((clr) & CPTR_EL2_TCPAC)				\
-			cptr |= CPTR_EL2_TCPAC;				\
-									\
-		cptr;							\
-	})
-
-#define __cpacr_to_cptr_set(clr, set)					\
-	({								\
-		u64 cptr = 0;						\
-									\
-		if ((clr) & CPACR_EL1_FPEN)				\
-			cptr |= CPTR_EL2_TFP;				\
-		if ((clr) & CPACR_EL1_ZEN)				\
-			cptr |= CPTR_EL2_TZ;				\
-		if ((clr) & CPACR_EL1_SMEN)				\
-			cptr |= CPTR_EL2_TSM;				\
-		if ((set) & CPACR_EL1_TTA)				\
-			cptr |= CPTR_EL2_TTA;				\
-		if ((set) & CPTR_EL2_TAM)				\
-			cptr |= CPTR_EL2_TAM;				\
-		if ((set) & CPTR_EL2_TCPAC)				\
-			cptr |= CPTR_EL2_TCPAC;				\
-									\
-		cptr;							\
-	})
-
-#define cpacr_clear_set(clr, set)					\
-	do {								\
-		BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0);		\
-		BUILD_BUG_ON((clr) & CPACR_EL1_E0POE);			\
-		__build_check_all_or_none((clr), CPACR_EL1_FPEN);	\
-		__build_check_all_or_none((set), CPACR_EL1_FPEN);	\
-		__build_check_all_or_none((clr), CPACR_EL1_ZEN);	\
-		__build_check_all_or_none((set), CPACR_EL1_ZEN);	\
-		__build_check_all_or_none((clr), CPACR_EL1_SMEN);	\
-		__build_check_all_or_none((set), CPACR_EL1_SMEN);	\
-									\
-		if (has_vhe() || has_hvhe())				\
-			sysreg_clear_set(cpacr_el1, clr, set);		\
-		else							\
-			sysreg_clear_set(cptr_el2,			\
-					 __cpacr_to_cptr_clr(clr, set),	\
-					 __cpacr_to_cptr_set(clr, set));\
-	} while (0)
-
 /*
  * Returns a 'sanitised' view of CPTR_EL2, translating from nVHE to the VHE
  * format if E2H isn't set.
-- 
cgit v1.2.3


From 04c5355b2a94ff3191ce63ab035fb7f04d036869 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 17 Jun 2025 14:37:18 +0100
Subject: KVM: arm64: VHE: Centralize ISBs when returning to host

The VHE hyp code has recently gained a few ISBs. Simplify this to one
unconditional ISB in __kvm_vcpu_run_vhe(), and remove the unnecessary
ISB from the kvm_call_hyp_ret() macro.

While kvm_call_hyp_ret() is also used to invoke
__vgic_v3_get_gic_config(), but no ISB is necessary in that case either.

For the moment, an ISB is left in kvm_call_hyp(), as there are many more
users, and removing the ISB would require a more thorough audit.

Suggested-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Brown <broonie@kernel.org>
Cc: Oliver Upton <oliver.upton@linux.dev>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20250617133718.4014181-8-mark.rutland@arm.com
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/kvm_host.h         |  6 ++----
 arch/arm64/kvm/hyp/include/hyp/debug-sr.h |  3 ---
 arch/arm64/kvm/hyp/vhe/switch.c           | 25 ++++++++++++-------------
 3 files changed, 14 insertions(+), 20 deletions(-)

(limited to 'arch')

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 5ccca509dff1..d27079968341 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -1289,9 +1289,8 @@ void kvm_arm_resume_guest(struct kvm *kvm);
 	})
 
 /*
- * The couple of isb() below are there to guarantee the same behaviour
- * on VHE as on !VHE, where the eret to EL1 acts as a context
- * synchronization event.
+ * The isb() below is there to guarantee the same behaviour on VHE as on !VHE,
+ * where the eret to EL1 acts as a context synchronization event.
  */
 #define kvm_call_hyp(f, ...)						\
 	do {								\
@@ -1309,7 +1308,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
 									\
 		if (has_vhe()) {					\
 			ret = f(__VA_ARGS__);				\
-			isb();						\
 		} else {						\
 			ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__);	\
 		}							\
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
index 73881e1dc267..502a5b73ee70 100644
--- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
+++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h
@@ -167,9 +167,6 @@ static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
 
 	__debug_save_state(guest_dbg, guest_ctxt);
 	__debug_restore_state(host_dbg, host_ctxt);
-
-	if (has_vhe())
-		isb();
 }
 
 #endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */
diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c
index 32b4814eb2b7..477f1580ffea 100644
--- a/arch/arm64/kvm/hyp/vhe/switch.c
+++ b/arch/arm64/kvm/hyp/vhe/switch.c
@@ -558,10 +558,10 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 	host_ctxt = host_data_ptr(host_ctxt);
 	guest_ctxt = &vcpu->arch.ctxt;
 
-	sysreg_save_host_state_vhe(host_ctxt);
-
 	fpsimd_lazy_switch_to_guest(vcpu);
 
+	sysreg_save_host_state_vhe(host_ctxt);
+
 	/*
 	 * Note that ARM erratum 1165522 requires us to configure both stage 1
 	 * and stage 2 translation for the guest context before we clear
@@ -586,18 +586,23 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
 
 	__deactivate_traps(vcpu);
 
-	/* Ensure CPTR trap deactivation has taken effect */
+	sysreg_restore_host_state_vhe(host_ctxt);
+
+	__debug_switch_to_host(vcpu);
+
+	/*
+	 * Ensure that all system register writes above have taken effect
+	 * before returning to the host. In VHE mode, CPTR traps for
+	 * FPSIMD/SVE/SME also apply to EL2, so FPSIMD/SVE/SME state must be
+	 * manipulated after the ISB.
+	 */
 	isb();
 
 	fpsimd_lazy_switch_to_host(vcpu);
 
-	sysreg_restore_host_state_vhe(host_ctxt);
-
 	if (guest_owns_fp_regs())
 		__fpsimd_save_fpexc32(vcpu);
 
-	__debug_switch_to_host(vcpu);
-
 	return exit_code;
 }
 NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);
@@ -627,12 +632,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
 	 */
 	local_daif_restore(DAIF_PROCCTX_NOIRQ);
 
-	/*
-	 * When we exit from the guest we change a number of CPU configuration
-	 * parameters, such as traps.  We rely on the isb() in kvm_call_hyp*()
-	 * to make sure these changes take effect before running the host or
-	 * additional guests.
-	 */
 	return ret;
 }
 
-- 
cgit v1.2.3


From b5aafcb4efd2bdacbc37753cf807d69faa6a7304 Mon Sep 17 00:00:00 2001
From: Binbin Wu <binbin.wu@linux.intel.com>
Date: Tue, 10 Jun 2025 10:14:19 +0800
Subject: KVM: TDX: Add new TDVMCALL status code for unsupported subfuncs

Add the new TDVMCALL status code TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED and
return it for unimplemented TDVMCALL subfunctions.

Returning TDVMCALL_STATUS_INVALID_OPERAND when a subfunction is not
implemented is vague because TDX guests can't tell the error is due to
the subfunction is not supported or an invalid input of the subfunction.
New GHCI spec adds TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED to avoid the
ambiguity. Use it instead of TDVMCALL_STATUS_INVALID_OPERAND.

Before the change, for common guest implementations, when a TDX guest
receives TDVMCALL_STATUS_INVALID_OPERAND, it has two cases:
1. Some operand is invalid. It could change the operand to another value
   retry.
2. The subfunction is not supported.

For case 1, an invalid operand usually means the guest implementation bug.
Since the TDX guest can't tell which case is, the best practice for
handling TDVMCALL_STATUS_INVALID_OPERAND is stopping calling such leaf,
treating the failure as fatal if the TDVMCALL is essential or ignoring
it if the TDVMCALL is optional.

With this change, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED could be sent to
old TDX guest that do not know about it, but it is expected that the
guest will make the same action as TDVMCALL_STATUS_INVALID_OPERAND.
Currently, no known TDX guest checks TDVMCALL_STATUS_INVALID_OPERAND
specifically; for example Linux just checks for success.

Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com>
[Return it for untrapped KVM_HC_MAP_GPA_RANGE. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/shared/tdx.h |  1 +
 arch/x86/kvm/vmx/tdx.c            | 10 ++++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h
index 2f3820342598..d8525e6ef50a 100644
--- a/arch/x86/include/asm/shared/tdx.h
+++ b/arch/x86/include/asm/shared/tdx.h
@@ -80,6 +80,7 @@
 #define TDVMCALL_STATUS_RETRY		0x0000000000000001ULL
 #define TDVMCALL_STATUS_INVALID_OPERAND	0x8000000000000000ULL
 #define TDVMCALL_STATUS_ALIGN_ERROR	0x8000000000000002ULL
+#define TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED	0x8000000000000003ULL
 
 /*
  * Bitmasks of exposed registers (with VMM).
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b952bc673271..5d100c240ab3 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1212,11 +1212,13 @@ static int tdx_map_gpa(struct kvm_vcpu *vcpu)
 	/*
 	 * Converting TDVMCALL_MAP_GPA to KVM_HC_MAP_GPA_RANGE requires
 	 * userspace to enable KVM_CAP_EXIT_HYPERCALL with KVM_HC_MAP_GPA_RANGE
-	 * bit set.  If not, the error code is not defined in GHCI for TDX, use
-	 * TDVMCALL_STATUS_INVALID_OPERAND for this case.
+	 * bit set.  This is a base call so it should always be supported, but
+	 * KVM has no way to ensure that userspace implements the GHCI correctly.
+	 * So if KVM_HC_MAP_GPA_RANGE does not cause a VMEXIT, return an error
+	 * to the guest.
 	 */
 	if (!user_exit_on_hypercall(vcpu->kvm, KVM_HC_MAP_GPA_RANGE)) {
-		ret = TDVMCALL_STATUS_INVALID_OPERAND;
+		ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
 		goto error;
 	}
 
@@ -1476,7 +1478,7 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 		break;
 	}
 
-	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+	tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED);
 	return 1;
 }
 
-- 
cgit v1.2.3


From cf207eac06f661fb692f405d5ab8230df884ee52 Mon Sep 17 00:00:00 2001
From: Binbin Wu <binbin.wu@linux.intel.com>
Date: Tue, 10 Jun 2025 10:14:20 +0800
Subject: KVM: TDX: Handle TDG.VP.VMCALL<GetQuote>

Handle TDVMCALL for GetQuote to generate a TD-Quote.

GetQuote is a doorbell-like interface used by TDX guests to request VMM
to generate a TD-Quote signed by a service hosting TD-Quoting Enclave
operating on the host.  A TDX guest passes a TD Report (TDREPORT_STRUCT) in
a shared-memory area as parameter.  Host VMM can access it and queue the
operation for a service hosting TD-Quoting enclave.  When completed, the
Quote is returned via the same shared-memory area.

KVM only checks the GPA from the TDX guest has the shared-bit set and drops
the shared-bit before exiting to userspace to avoid bleeding the shared-bit
into KVM's exit ABI.  KVM forwards the request to userspace VMM (e.g. QEMU)
and userspace VMM queues the operation asynchronously.  KVM sets the return
code according to the 'ret' field set by userspace to notify the TDX guest
whether the request has been queued successfully or not.  When the request
has been queued successfully, the TDX guest can poll the status field in
the shared-memory area to check whether the Quote generation is completed
or not.  When completed, the generated Quote is returned via the same
buffer.

Add KVM_EXIT_TDX as a new exit reason to userspace. Userspace is
required to handle the KVM exit reason as the initial support for TDX,
by reentering KVM to ensure that the TDVMCALL is complete.  While at it,
add a note that KVM_EXIT_HYPERCALL also requires reentry with KVM_RUN.

Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com>
Tested-by: Mikko Ylinen <mikko.ylinen@linux.intel.com>
Acked-by: Kai Huang <kai.huang@intel.com>
[Adjust userspace API. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/tdx.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 5d100c240ab3..b619a3478983 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1465,6 +1465,36 @@ static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int tdx_complete_simple(struct kvm_vcpu *vcpu)
+{
+	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.unknown.ret);
+	return 1;
+}
+
+static int tdx_get_quote(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+	u64 gpa = tdx->vp_enter_args.r12;
+	u64 size = tdx->vp_enter_args.r13;
+
+	/* The gpa of buffer must have shared bit set. */
+	if (vt_is_tdx_private_gpa(vcpu->kvm, gpa)) {
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_TDX;
+	vcpu->run->tdx.flags = 0;
+	vcpu->run->tdx.nr = TDVMCALL_GET_QUOTE;
+	vcpu->run->tdx.get_quote.ret = TDVMCALL_STATUS_SUBFUNC_UNSUPPORTED;
+	vcpu->run->tdx.get_quote.gpa = gpa & ~gfn_to_gpa(kvm_gfn_direct_bits(tdx->vcpu.kvm));
+	vcpu->run->tdx.get_quote.size = size;
+
+	vcpu->arch.complete_userspace_io = tdx_complete_simple;
+
+	return 0;
+}
+
 static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 {
 	switch (tdvmcall_leaf(vcpu)) {
@@ -1474,6 +1504,8 @@ static int handle_tdvmcall(struct kvm_vcpu *vcpu)
 		return tdx_report_fatal_error(vcpu);
 	case TDVMCALL_GET_TD_VM_CALL_INFO:
 		return tdx_get_td_vm_call_info(vcpu);
+	case TDVMCALL_GET_QUOTE:
+		return tdx_get_quote(vcpu);
 	default:
 		break;
 	}
-- 
cgit v1.2.3


From 25e8b1dd4883e6c251c3db5b347f3c8ae4ade921 Mon Sep 17 00:00:00 2001
From: Binbin Wu <binbin.wu@linux.intel.com>
Date: Tue, 10 Jun 2025 10:14:21 +0800
Subject: KVM: TDX: Exit to userspace for GetTdVmCallInfo

Exit to userspace for TDG.VP.VMCALL<GetTdVmCallInfo> via KVM_EXIT_TDX,
to allow userspace to provide information about the support of
TDVMCALLs when r12 is 1 for the TDVMCALLs beyond the GHCI base API.

GHCI spec defines the GHCI base TDVMCALLs: <GetTdVmCallInfo>, <MapGPA>,
<ReportFatalError>, <Instruction.CPUID>, <#VE.RequestMMIO>,
<Instruction.HLT>, <Instruction.IO>, <Instruction.RDMSR> and
<Instruction.WRMSR>. They must be supported by VMM to support TDX guests.

For GetTdVmCallInfo
- When leaf (r12) to enumerate TDVMCALL functionality is set to 0,
  successful execution indicates all GHCI base TDVMCALLs listed above are
  supported.

  Update the KVM TDX document with the set of the GHCI base APIs.

- When leaf (r12) to enumerate TDVMCALL functionality is set to 1, it
  indicates the TDX guest is querying the supported TDVMCALLs beyond
  the GHCI base TDVMCALLs.
  Exit to userspace to let userspace set the TDVMCALL sub-function bit(s)
  accordingly to the leaf outputs.  KVM could set the TDVMCALL bit(s)
  supported by itself when the TDVMCALLs don't need support from userspace
  after returning from userspace and before entering guest. Currently, no
  such TDVMCALLs implemented, KVM just sets the values returned from
  userspace.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Binbin Wu <binbin.wu@linux.intel.com>
[Adjust userspace API. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx/tdx.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index b619a3478983..1ad20c273f3b 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -1451,18 +1451,53 @@ error:
 	return 1;
 }
 
+static int tdx_complete_get_td_vm_call_info(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_tdx *tdx = to_tdx(vcpu);
+
+	tdvmcall_set_return_code(vcpu, vcpu->run->tdx.get_tdvmcall_info.ret);
+
+	/*
+	 * For now, there is no TDVMCALL beyond GHCI base API supported by KVM
+	 * directly without the support from userspace, just set the value
+	 * returned from userspace.
+	 */
+	tdx->vp_enter_args.r11 = vcpu->run->tdx.get_tdvmcall_info.r11;
+	tdx->vp_enter_args.r12 = vcpu->run->tdx.get_tdvmcall_info.r12;
+	tdx->vp_enter_args.r13 = vcpu->run->tdx.get_tdvmcall_info.r13;
+	tdx->vp_enter_args.r14 = vcpu->run->tdx.get_tdvmcall_info.r14;
+
+	return 1;
+}
+
 static int tdx_get_td_vm_call_info(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_tdx *tdx = to_tdx(vcpu);
 
-	if (tdx->vp_enter_args.r12)
-		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
-	else {
+	switch (tdx->vp_enter_args.r12) {
+	case 0:
 		tdx->vp_enter_args.r11 = 0;
+		tdx->vp_enter_args.r12 = 0;
 		tdx->vp_enter_args.r13 = 0;
 		tdx->vp_enter_args.r14 = 0;
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_SUCCESS);
+		return 1;
+	case 1:
+		vcpu->run->tdx.get_tdvmcall_info.leaf = tdx->vp_enter_args.r12;
+		vcpu->run->exit_reason = KVM_EXIT_TDX;
+		vcpu->run->tdx.flags = 0;
+		vcpu->run->tdx.nr = TDVMCALL_GET_TD_VM_CALL_INFO;
+		vcpu->run->tdx.get_tdvmcall_info.ret = TDVMCALL_STATUS_SUCCESS;
+		vcpu->run->tdx.get_tdvmcall_info.r11 = 0;
+		vcpu->run->tdx.get_tdvmcall_info.r12 = 0;
+		vcpu->run->tdx.get_tdvmcall_info.r13 = 0;
+		vcpu->run->tdx.get_tdvmcall_info.r14 = 0;
+		vcpu->arch.complete_userspace_io = tdx_complete_get_td_vm_call_info;
+		return 0;
+	default:
+		tdvmcall_set_return_code(vcpu, TDVMCALL_STATUS_INVALID_OPERAND);
+		return 1;
 	}
-	return 1;
 }
 
 static int tdx_complete_simple(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From c55c7a85e02a7bfee20a3ffebdff7cbeb41613ef Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Fri, 6 Jun 2025 20:44:25 +0800
Subject: um: ubd: Add missing error check in start_io_thread()

The subsequent call to os_set_fd_block() overwrites the previous
return value. OR the two return values together to fix it.

Fixes: f88f0bdfc32f ("um: UBD Improvements")
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250606124428.148164-2-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/ubd_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c
index c5e6545f6fcf..8e8a8bf518b6 100644
--- a/arch/um/drivers/ubd_user.c
+++ b/arch/um/drivers/ubd_user.c
@@ -41,7 +41,7 @@ int start_io_thread(struct os_helper_thread **td_out, int *fd_out)
 	*fd_out = fds[1];
 
 	err = os_set_fd_block(*fd_out, 0);
-	err = os_set_fd_block(kernel_fd, 0);
+	err |= os_set_fd_block(kernel_fd, 0);
 	if (err) {
 		printk("start_io_thread - failed to set nonblocking I/O.\n");
 		goto out_close;
-- 
cgit v1.2.3


From bc4e2ae08183d9017f43bf88aa7cfdf84e76f573 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Fri, 6 Jun 2025 20:44:27 +0800
Subject: um: vfio: Prevent duplicate device assignments

Ensure devices are assigned only once. Reject subsequent requests
for duplicate assignments.

Fixes: a0e2cb6a9063 ("um: Add VFIO-based virtual PCI driver")
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250606124428.148164-4-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/vfio_kern.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'arch')

diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c
index b51fc9888ae1..13b971a2bd43 100644
--- a/arch/um/drivers/vfio_kern.c
+++ b/arch/um/drivers/vfio_kern.c
@@ -570,6 +570,17 @@ static void uml_vfio_release_device(struct uml_vfio_device *dev)
 	kfree(dev);
 }
 
+static struct uml_vfio_device *uml_vfio_find_device(const char *device)
+{
+	struct uml_vfio_device *dev;
+
+	list_for_each_entry(dev, &uml_vfio_devices, list) {
+		if (!strcmp(dev->name, device))
+			return dev;
+	}
+	return NULL;
+}
+
 static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *kp)
 {
 	struct uml_vfio_device *dev;
@@ -582,6 +593,9 @@ static int uml_vfio_cmdline_set(const char *device, const struct kernel_param *k
 		uml_vfio_container.fd = fd;
 	}
 
+	if (uml_vfio_find_device(device))
+		return -EEXIST;
+
 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	if (!dev)
 		return -ENOMEM;
-- 
cgit v1.2.3


From 8948941276024fcfb08fdb3d678867dc1f7b1c16 Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Fri, 6 Jun 2025 20:44:28 +0800
Subject: um: Use correct data source in fpregs_legacy_set()

Read from the buffer pointed to by 'from' instead of '&buf', as
'buf' contains no valid data when 'ubuf' is NULL.

Fixes: b1e1bd2e6943 ("um: Add helper functions to get/set state for SECCOMP")
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250606124428.148164-5-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/x86/um/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch')

diff --git a/arch/x86/um/ptrace.c b/arch/x86/um/ptrace.c
index 3275870330fe..fae8aabad10f 100644
--- a/arch/x86/um/ptrace.c
+++ b/arch/x86/um/ptrace.c
@@ -161,7 +161,7 @@ static int fpregs_legacy_set(struct task_struct *target,
 		from = kbuf;
 	}
 
-	return um_fxsr_from_i387(fxsave, &buf);
+	return um_fxsr_from_i387(fxsave, from);
 }
 #endif
 
-- 
cgit v1.2.3


From 2d65fc13be85c336c56af7077f08ccd3a3a15a4a Mon Sep 17 00:00:00 2001
From: Tiwei Bie <tiwei.btw@antgroup.com>
Date: Mon, 23 Jun 2025 19:08:29 +0800
Subject: um: vector: Reduce stack usage in vector_eth_configure()

When compiling with clang (19.1.7), initializing *vp using a compound
literal may result in excessive stack usage. Fix it by initializing the
required fields of *vp individually.

Without this patch:

$ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0
...
0x0000000000000540 vector_eth_configure [vector_kern.o]:1472
...

With this patch:

$ objdump -d arch/um/drivers/vector_kern.o | ./scripts/checkstack.pl x86_64 0
...
0x0000000000000540 vector_eth_configure [vector_kern.o]:208
...

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202506221017.WtB7Usua-lkp@intel.com/
Signed-off-by: Tiwei Bie <tiwei.btw@antgroup.com>
Link: https://patch.msgid.link/20250623110829.314864-1-tiwei.btw@antgroup.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 arch/um/drivers/vector_kern.c | 42 +++++++++++++-----------------------------
 1 file changed, 13 insertions(+), 29 deletions(-)

(limited to 'arch')

diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index f292e0b4ff8b..9bbbddfe866b 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -1625,35 +1625,19 @@ static void vector_eth_configure(
 
 	device->dev = dev;
 
-	*vp = ((struct vector_private)
-		{
-		.list			= LIST_HEAD_INIT(vp->list),
-		.dev			= dev,
-		.unit			= n,
-		.options		= get_transport_options(def),
-		.rx_irq			= 0,
-		.tx_irq			= 0,
-		.parsed			= def,
-		.max_packet		= get_mtu(def) + ETH_HEADER_OTHER,
-		/* TODO - we need to calculate headroom so that ip header
-		 * is 16 byte aligned all the time
-		 */
-		.headroom		= get_headroom(def),
-		.form_header		= NULL,
-		.verify_header		= NULL,
-		.header_rxbuffer	= NULL,
-		.header_txbuffer	= NULL,
-		.header_size		= 0,
-		.rx_header_size		= 0,
-		.rexmit_scheduled	= false,
-		.opened			= false,
-		.transport_data		= NULL,
-		.in_write_poll		= false,
-		.coalesce		= 2,
-		.req_size		= get_req_size(def),
-		.in_error		= false,
-		.bpf			= NULL
-	});
+	INIT_LIST_HEAD(&vp->list);
+	vp->dev		= dev;
+	vp->unit	= n;
+	vp->options	= get_transport_options(def);
+	vp->parsed	= def;
+	vp->max_packet	= get_mtu(def) + ETH_HEADER_OTHER;
+	/*
+	 * TODO - we need to calculate headroom so that ip header
+	 * is 16 byte aligned all the time
+	 */
+	vp->headroom	= get_headroom(def);
+	vp->coalesce	= 2;
+	vp->req_size	= get_req_size(def);
 
 	dev->features = dev->hw_features = (NETIF_F_SG | NETIF_F_FRAGLIST);
 	INIT_WORK(&vp->reset_tx, vector_reset_tx);
-- 
cgit v1.2.3