From c8f3c9fa75ff3822b56b47d5cfa0aaa484040ea8 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Date: Mon, 15 Dec 2025 12:10:35 +0200
Subject: ASoC: soc-acpi / SOF: Add best_effort flag to get_function_tplg_files
 op

When there is no fallback possibility available for the function topology
use it is better to try to create  a profile for the card in best effort
manner, leaving out non supported links for example.

As an example: some laptops present SSPx-BT link but we don't have fragment
yet to support this. If we only have support for functional topology
without monolithic fallback then we would fail the card creation.
The reason why the monolithic topology works on the same device is that it
does not have the SSPx-BT link handled, it is ignored.

In case when there is no fallback possibility we should try to create the
card with links that we support as best effort instead of failing and
leaving the user without a card.

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
Reviewed-by: Kai Vehmanen <kai.vehmanen@linux.intel.com>
Reviewed-by: Bard Liao <yung-chuan.liao@linux.intel.com>
Link: https://patch.msgid.link/20251215101036.9370-2-peter.ujfalusi@linux.intel.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/sound/soc-acpi.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h
index 90d73b9bddab..0519afd7217f 100644
--- a/include/sound/soc-acpi.h
+++ b/include/sound/soc-acpi.h
@@ -203,6 +203,8 @@ struct snd_soc_acpi_link_adr {
  *	@mach: the pointer of the machine driver
  *	@prefix: the prefix of the topology file name. Typically, it is the path.
  *	@tplg_files: the pointer of the array of the topology file names.
+ *	@best_effort: ignore non supported links and try to build the card in best effort
+ *		      with supported links
  */
 /* Descriptor for SST ASoC machine driver */
 struct snd_soc_acpi_mach {
@@ -224,7 +226,8 @@ struct snd_soc_acpi_mach {
 	const u32 tplg_quirk_mask;
 	int (*get_function_tplg_files)(struct snd_soc_card *card,
 				       const struct snd_soc_acpi_mach *mach,
-				       const char *prefix, const char ***tplg_files);
+				       const char *prefix, const char ***tplg_files,
+				       bool best_effort);
 };
 
 #define SND_SOC_ACPI_MAX_CODECS 3
-- 
cgit v1.2.3


From 2dc675f614850b80deab7cf6d12902636ed8a7f4 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 8 Dec 2025 14:33:05 +0100
Subject: RDMA/ucma: Fix rdma_ucm_query_ib_service_resp struct padding

On a few 32-bit architectures, the newly added ib_user_service_rec
structure is not 64-bit aligned the way it is on most regular ones.

Add explicit padding into the rdma_ucm_query_ib_service_resp and
rdma_ucm_resolve_ib_service structures that embed it, so that the layout
is compatible across all of them.

This is an ABI change on i386, aligning it with x86_64 and the other
64-bit architectures to avoid having to use a compat ioctl handler.

Fixes: 810f874eda8e ("RDMA/ucma: Support query resolved service records")
Link: https://patch.msgid.link/r/20251208133311.313977-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/uapi/rdma/rdma_user_cm.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h
index 5ded174687ee..838f8d460256 100644
--- a/include/uapi/rdma/rdma_user_cm.h
+++ b/include/uapi/rdma/rdma_user_cm.h
@@ -192,6 +192,7 @@ struct rdma_ucm_query_path_resp {
 
 struct rdma_ucm_query_ib_service_resp {
 	__u32 num_service_recs;
+	__u32 reserved;
 	struct ib_user_service_rec recs[];
 };
 
@@ -354,7 +355,7 @@ enum {
 
 #define RDMA_USER_CM_IB_SERVICE_NAME_SIZE 64
 struct rdma_ucm_ib_service {
-	__u64 service_id;
+	__aligned_u64 service_id;
 	__u8  service_name[RDMA_USER_CM_IB_SERVICE_NAME_SIZE];
 	__u32 flags;
 	__u32 reserved;
@@ -362,6 +363,7 @@ struct rdma_ucm_ib_service {
 
 struct rdma_ucm_resolve_ib_service {
 	__u32 id;
+	__u32 reserved;
 	struct rdma_ucm_ib_service ibs;
 };
 
-- 
cgit v1.2.3


From d95e99a74eaf35c070f5939295331e5d7857c723 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 8 Dec 2025 14:38:44 +0100
Subject: RDMA/irdma: Fix irdma_alloc_ucontext_resp padding

A recent commit modified struct irdma_alloc_ucontext_resp by adding a
member with implicit padding in front of it, though this does not change
the offset of the data members other than m68k. Reported by
scripts/check-uapi.sh:

==== ABI differences detected in include/rdma/irdma-abi.h from 1dd7bde2e91c -> HEAD ====
    [C] 'struct irdma_alloc_ucontext_resp' changed:
      type size changed from 704 to 640 (in bits)
      1 data member deletion:
        '__u8 rsvd3[2]', at offset 640 (in bits) at irdma-abi.h:61:1
      1 data member insertion:
        '__u8 revd3[2]', at offset 592 (in bits) at irdma-abi.h:60:1

Change the size back to the previous version, and remove the implicit
padding by making it explicit and matching what x86-64 would do by placing
max_hw_srq_quanta member into a naturally aligned location.

Fixes: 563e1feb5f6e ("RDMA/irdma: Add SRQ support")
Link: https://patch.msgid.link/r/20251208133849.315451-1-arnd@kernel.org
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>
Tested-by: Jacob Moroni <jmoroni@google.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 include/uapi/rdma/irdma-abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h
index f7788d33376b..36f20802bcc8 100644
--- a/include/uapi/rdma/irdma-abi.h
+++ b/include/uapi/rdma/irdma-abi.h
@@ -57,8 +57,8 @@ struct irdma_alloc_ucontext_resp {
 	__u8 rsvd2;
 	__aligned_u64 comp_mask;
 	__u16 min_hw_wq_size;
+	__u8 revd3[2];
 	__u32 max_hw_srq_quanta;
-	__u8 rsvd3[2];
 };
 
 struct irdma_alloc_pd_resp {
-- 
cgit v1.2.3


From 4a824c3128998158a093eaadd776a79abe3a601a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 4 Dec 2025 15:31:27 +0000
Subject: entry: Always inline local_irq_{enable,disable}_exit_to_user()

clang needs __always_inline instead of inline, even for tiny helpers.

This saves some cycles in system call fast path, and saves 195 bytes
on x86_64 build:

$ size vmlinux.before vmlinux.after
   text	   data	    bss	    dec	    hex	filename
34652814	22291961	5875180	62819955	3be8e73	vmlinux.before
34652619	22291961	5875180	62819760	3be8db0	vmlinux.after

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251204153127.1321824-1-edumazet@google.com
---
 include/linux/irq-entry-common.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h
index 6ab913e57da0..d26d1b1bcbfb 100644
--- a/include/linux/irq-entry-common.h
+++ b/include/linux/irq-entry-common.h
@@ -110,7 +110,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
 static inline void local_irq_enable_exit_to_user(unsigned long ti_work);
 
 #ifndef local_irq_enable_exit_to_user
-static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
+static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work)
 {
 	local_irq_enable();
 }
@@ -125,7 +125,7 @@ static inline void local_irq_enable_exit_to_user(unsigned long ti_work)
 static inline void local_irq_disable_exit_to_user(void);
 
 #ifndef local_irq_disable_exit_to_user
-static inline void local_irq_disable_exit_to_user(void)
+static __always_inline void local_irq_disable_exit_to_user(void)
 {
 	local_irq_disable();
 }
-- 
cgit v1.2.3


From b61104e7a6349bd2c2b3e2fb3260d87f15eda8f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= <thomas.weissschuh@linutronix.de>
Date: Mon, 22 Dec 2025 08:45:48 +0100
Subject: regulator: uapi: Use UAPI integer type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Using libc types and headers from the UAPI headers is problematic as it
introduces a dependency on a full C toolchain.

Use the fixed-width integer type provided by the UAPI headers instead.

Signed-off-by: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
Link: https://patch.msgid.link/20251222-uapi-regulator-v1-1-a71c66eb1a94@linutronix.de
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 include/uapi/regulator/regulator.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/uapi/regulator/regulator.h b/include/uapi/regulator/regulator.h
index 71bf71a22e7f..c4f2d1c19828 100644
--- a/include/uapi/regulator/regulator.h
+++ b/include/uapi/regulator/regulator.h
@@ -8,11 +8,7 @@
 #ifndef _UAPI_REGULATOR_H
 #define _UAPI_REGULATOR_H
 
-#ifdef __KERNEL__
 #include <linux/types.h>
-#else
-#include <stdint.h>
-#endif
 
 /*
  * Regulator notifier events.
@@ -62,7 +58,7 @@
 
 struct reg_genl_event {
 	char reg_name[32];
-	uint64_t event;
+	__u64 event;
 };
 
 /* attributes of reg_genl_family */
-- 
cgit v1.2.3


From 87e7f6019097746d1d06f98874a9f179b7a68f3e Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Fri, 19 Dec 2025 10:36:38 +0200
Subject: software node: Also support referencing non-constant software nodes

Fwnode references are be implemented differently if referenced node is a
software node. _Generic() is used to differentiate between the two cases
but only const software nodes were present in the selection. Also add
non-const software nodes.

Reported-by: Kenneth Crudup <kenny@panix.com>
Closes: https://lore.kernel.org/all/af773b82-bef2-4209-baaf-526d4661b7fc@panix.com/
Fixes: d7cdbbc93c56 ("software node: allow referencing firmware nodes")
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Tested-By: Kenneth R. Crudup <kenny@panix.com>
Tested-by: Mehdi Djait <mehdi.djait@linux.intel.com> # Dell XPS 9315
Reviewed-by: Mehdi Djait <mehdi.djait@linux.intel.com>
Link: https://patch.msgid.link/20251219083638.2454138-1-sakari.ailus@linux.intel.com
Signed-off-by: Danilo Krummrich <dakr@kernel.org>
---
 include/linux/property.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/property.h b/include/linux/property.h
index 272bfbdea7bf..e30ef23a9af3 100644
--- a/include/linux/property.h
+++ b/include/linux/property.h
@@ -371,6 +371,7 @@ struct software_node_ref_args {
 (const struct software_node_ref_args) {				\
 	.swnode = _Generic(_ref_,				\
 			   const struct software_node *: _ref_,	\
+			   struct software_node *: _ref_,	\
 			   default: NULL),			\
 	.fwnode = _Generic(_ref_,				\
 			   struct fwnode_handle *: _ref_,	\
-- 
cgit v1.2.3


From 20e20b147cf7cb6780a5b95da2a0e37c52cd1015 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 15 Dec 2025 22:38:00 -0800
Subject: platform/x86/intel/vsec: correct kernel-doc comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix kernel-doc warnings in intel_vsec.h to eliminate all kernel-doc
warnings:

Warning: include/linux/intel_vsec.h:92 struct member 'read_telem' not
 described in 'pmt_callbacks'
Warning: include/linux/intel_vsec.h:146 expecting prototype for struct
 intel_sec_device.  Prototype was for struct intel_vsec_device instead
Warning: include/linux/intel_vsec.h:146 struct member 'priv_data_size'
 not described in 'intel_vsec_device'

In struct pmt_callbacks, correct the kernel-doc for @read_telem.
kernel-doc doesn't support documenting callback function parameters,
so drop the '@' signs on those and use "* *" to make them somewhat
readable in the produced documentation output.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Link: https://patch.msgid.link/20251216063801.2896495-1-rdunlap@infradead.org
Reviewed-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>
---
 include/linux/intel_vsec.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h
index 53f6fe88e369..1a0f357c2427 100644
--- a/include/linux/intel_vsec.h
+++ b/include/linux/intel_vsec.h
@@ -80,13 +80,13 @@ enum intel_vsec_quirks {
 
 /**
  * struct pmt_callbacks - Callback infrastructure for PMT devices
- * ->read_telem() when specified, called by client driver to access PMT data (instead
- * of direct copy).
- * @pdev:  PCI device reference for the callback's use
- * @guid:  ID of data to acccss
- * @data:  buffer for the data to be copied
- * @off:   offset into the requested buffer
- * @count: size of buffer
+ * @read_telem: when specified, called by client driver to access PMT
+ * data (instead of direct copy).
+ * * pdev:  PCI device reference for the callback's use
+ * * guid:  ID of data to acccss
+ * * data:  buffer for the data to be copied
+ * * off:   offset into the requested buffer
+ * * count: size of buffer
  */
 struct pmt_callbacks {
 	int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count);
@@ -120,7 +120,7 @@ struct intel_vsec_platform_info {
 };
 
 /**
- * struct intel_sec_device - Auxbus specific device information
+ * struct intel_vsec_device - Auxbus specific device information
  * @auxdev:        auxbus device struct for auxbus access
  * @pcidev:        pci device associated with the device
  * @resource:      any resources shared by the parent
@@ -128,6 +128,7 @@ struct intel_vsec_platform_info {
  * @num_resources: number of resources
  * @id:            xarray id
  * @priv_data:     any private data needed
+ * @priv_data_size: size of private data area
  * @quirks:        specified quirks
  * @base_addr:     base address of entries (if specified)
  * @cap_id:        the enumerated id of the vsec feature
-- 
cgit v1.2.3


From c31f4aa8fed048fa70e742c4bb49bb48dc489ab3 Mon Sep 17 00:00:00 2001
From: David Gow <davidgow@google.com>
Date: Fri, 19 Dec 2025 16:52:58 +0800
Subject: kunit: Enforce task execution in {soft,hard}irq contexts

The kunit_run_irq_test() helper allows a function to be run in hardirq
and softirq contexts (in addition to the task context). It does this by
running the user-provided function concurrently in the three contexts,
until either a timeout has expired or a number of iterations have
completed in the normal task context.

However, on setups where the initialisation of the hardirq and softirq
contexts (or, indeed, the scheduling of those tasks) is significantly
slower than the function execution, it's possible for that number of
iterations to be exceeded before any runs in irq contexts actually
occur. This occurs with the polyval.test_polyval_preparekey_in_irqs
test, which runs 20000 iterations of the relatively fast preparekey
function, and therefore fails often under many UML, 32-bit arm, m68k and
other environments.

Instead, ensure that the max_iterations limit counts executions in all
three contexts, and requires at least one of each. This will cause the
test to continue iterating until at least the irq contexts have been
tested, or the 1s wall-clock limit has been exceeded. This causes the
test to pass in all of my environments.

In so doing, we also update the task counters to atomic ints, to better
match both the 'int' max_iterations input, and to ensure they are
correctly updated across contexts.

Finally, we also fix a few potential assertion messages to be
less-specific to the original crypto usecases.

Fixes: 950a81224e8b ("lib/crypto: tests: Add hash-test-template.h and gen-hash-testvecs.py")
Signed-off-by: David Gow <davidgow@google.com>
Link: https://lore.kernel.org/r/20251219085259.1163048-1-davidgow@google.com
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 include/kunit/run-in-irq-context.h | 53 ++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h
index 108e96433ea4..c89b1b1b12dd 100644
--- a/include/kunit/run-in-irq-context.h
+++ b/include/kunit/run-in-irq-context.h
@@ -20,8 +20,8 @@ struct kunit_irq_test_state {
 	bool task_func_reported_failure;
 	bool hardirq_func_reported_failure;
 	bool softirq_func_reported_failure;
-	unsigned long hardirq_func_calls;
-	unsigned long softirq_func_calls;
+	atomic_t hardirq_func_calls;
+	atomic_t softirq_func_calls;
 	struct hrtimer timer;
 	struct work_struct bh_work;
 };
@@ -32,7 +32,7 @@ static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer)
 		container_of(timer, typeof(*state), timer);
 
 	WARN_ON_ONCE(!in_hardirq());
-	state->hardirq_func_calls++;
+	atomic_inc(&state->hardirq_func_calls);
 
 	if (!state->func(state->test_specific_state))
 		state->hardirq_func_reported_failure = true;
@@ -48,7 +48,7 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work)
 		container_of(work, typeof(*state), bh_work);
 
 	WARN_ON_ONCE(!in_serving_softirq());
-	state->softirq_func_calls++;
+	atomic_inc(&state->softirq_func_calls);
 
 	if (!state->func(state->test_specific_state))
 		state->softirq_func_reported_failure = true;
@@ -59,7 +59,10 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work)
  * hardirq context concurrently, and reports a failure to KUnit if any
  * invocation of @func in any context returns false.  @func is passed
  * @test_specific_state as its argument.  At most 3 invocations of @func will
- * run concurrently: one in each of task, softirq, and hardirq context.
+ * run concurrently: one in each of task, softirq, and hardirq context.  @func
+ * will continue running until either @max_iterations calls have been made (so
+ * long as at least one each runs in task, softirq, and hardirq contexts), or
+ * one second has passed.
  *
  * The main purpose of this interrupt context testing is to validate fallback
  * code paths that run in contexts where the normal code path cannot be used,
@@ -85,6 +88,8 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
 		.test_specific_state = test_specific_state,
 	};
 	unsigned long end_jiffies;
+	int hardirq_calls, softirq_calls;
+	bool allctx = false;
 
 	/*
 	 * Set up a hrtimer (the way we access hardirq context) and a work
@@ -94,14 +99,25 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
 			       CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
 	INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func);
 
-	/* Run for up to max_iterations or 1 second, whichever comes first. */
+	/*
+	 * Run for up to max_iterations (including at least one task, softirq,
+	 * and hardirq), or 1 second, whichever comes first.
+	 */
 	end_jiffies = jiffies + HZ;
 	hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL,
 		      HRTIMER_MODE_REL_HARD);
-	for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies);
-	     i++) {
+	for (int task_calls = 0, calls = 0;
+	     ((calls < max_iterations) || !allctx) &&
+	     !time_after(jiffies, end_jiffies);
+	     task_calls++) {
 		if (!func(test_specific_state))
 			state.task_func_reported_failure = true;
+
+		hardirq_calls = atomic_read(&state.hardirq_func_calls);
+		softirq_calls = atomic_read(&state.softirq_func_calls);
+		calls = task_calls + hardirq_calls + softirq_calls;
+		allctx = (task_calls > 0) && (hardirq_calls > 0) &&
+			 (softirq_calls > 0);
 	}
 
 	/* Cancel the timer and work. */
@@ -109,21 +125,18 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *),
 	flush_work(&state.bh_work);
 
 	/* Sanity check: the timer and BH functions should have been run. */
-	KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0,
+	KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.hardirq_func_calls), 0,
 			    "Timer function was not called");
-	KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0,
+	KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.softirq_func_calls), 0,
 			    "BH work function was not called");
 
-	/* Check for incorrect hash values reported from any context. */
-	KUNIT_EXPECT_FALSE_MSG(
-		test, state.task_func_reported_failure,
-		"Incorrect hash values reported from task context");
-	KUNIT_EXPECT_FALSE_MSG(
-		test, state.hardirq_func_reported_failure,
-		"Incorrect hash values reported from hardirq context");
-	KUNIT_EXPECT_FALSE_MSG(
-		test, state.softirq_func_reported_failure,
-		"Incorrect hash values reported from softirq context");
+	/* Check for failure reported from any context. */
+	KUNIT_EXPECT_FALSE_MSG(test, state.task_func_reported_failure,
+			       "Failure reported from task context");
+	KUNIT_EXPECT_FALSE_MSG(test, state.hardirq_func_reported_failure,
+			       "Failure reported from hardirq context");
+	KUNIT_EXPECT_FALSE_MSG(test, state.softirq_func_reported_failure,
+			       "Failure reported from softirq context");
 }
 
 #endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */
-- 
cgit v1.2.3


From 754c23238438600e9236719f7e67aff2c4d02093 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= <thomas.hellstrom@linux.intel.com>
Date: Fri, 19 Dec 2025 12:32:59 +0100
Subject: drm/pagemap, drm/xe: Ensure that the devmem allocation is idle before
 use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In situations where no system memory is migrated to devmem, and in
upcoming patches where another GPU is performing the migration to
the newly allocated devmem buffer, there is nothing to ensure any
ongoing clear to the devmem allocation or async eviction from the
devmem allocation is complete.

Address that by passing a struct dma_fence down to the copy
functions, and ensure it is waited for before migration is marked
complete.

v3:
- New patch.
v4:
- Update the logic used for determining when to wait for the
  pre_migrate_fence.
- Update the logic used for determining when to warn for the
  pre_migrate_fence since the scheduler fences apparently
  can signal out-of-order.
v5:
- Fix a UAF (CI)
- Remove references to source P2P migration (Himal)
- Put the pre_migrate_fence after migration.
v6:
- Pipeline the pre_migrate_fence dependency (Matt Brost)

Fixes: c5b3eb5a906c ("drm/xe: Add GPUSVM device memory copy vfunc functions")
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <stable@vger.kernel.org> # v6.15+
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> # For merging through drm-xe.
Link: https://patch.msgid.link/20251219113320.183860-4-thomas.hellstrom@linux.intel.com
(cherry picked from commit 16b5ad31952476fb925c401897fc171cd37f536b)
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/drm/drm_pagemap.h | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
index f6e7e234c089..70a7991f784f 100644
--- a/include/drm/drm_pagemap.h
+++ b/include/drm/drm_pagemap.h
@@ -8,6 +8,7 @@
 
 #define NR_PAGES(order) (1U << (order))
 
+struct dma_fence;
 struct drm_pagemap;
 struct drm_pagemap_zdd;
 struct device;
@@ -174,6 +175,8 @@ struct drm_pagemap_devmem_ops {
 	 * @pages: Pointer to array of device memory pages (destination)
 	 * @pagemap_addr: Pointer to array of DMA information (source)
 	 * @npages: Number of pages to copy
+	 * @pre_migrate_fence: dma-fence to wait for before migration start.
+	 * May be NULL.
 	 *
 	 * Copy pages to device memory. If the order of a @pagemap_addr entry
 	 * is greater than 0, the entry is populated but subsequent entries
@@ -183,13 +186,16 @@ struct drm_pagemap_devmem_ops {
 	 */
 	int (*copy_to_devmem)(struct page **pages,
 			      struct drm_pagemap_addr *pagemap_addr,
-			      unsigned long npages);
+			      unsigned long npages,
+			      struct dma_fence *pre_migrate_fence);
 
 	/**
 	 * @copy_to_ram: Copy to system RAM (required for migration)
 	 * @pages: Pointer to array of device memory pages (source)
 	 * @pagemap_addr: Pointer to array of DMA information (destination)
 	 * @npages: Number of pages to copy
+	 * @pre_migrate_fence: dma-fence to wait for before migration start.
+	 * May be NULL.
 	 *
 	 * Copy pages to system RAM. If the order of a @pagemap_addr entry
 	 * is greater than 0, the entry is populated but subsequent entries
@@ -199,7 +205,8 @@ struct drm_pagemap_devmem_ops {
 	 */
 	int (*copy_to_ram)(struct page **pages,
 			   struct drm_pagemap_addr *pagemap_addr,
-			   unsigned long npages);
+			   unsigned long npages,
+			   struct dma_fence *pre_migrate_fence);
 };
 
 /**
@@ -212,6 +219,8 @@ struct drm_pagemap_devmem_ops {
  * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to.
  * @size: Size of device memory allocation
  * @timeslice_expiration: Timeslice expiration in jiffies
+ * @pre_migrate_fence: Fence to wait for or pipeline behind before migration starts.
+ * (May be NULL).
  */
 struct drm_pagemap_devmem {
 	struct device *dev;
@@ -221,6 +230,7 @@ struct drm_pagemap_devmem {
 	struct drm_pagemap *dpagemap;
 	size_t size;
 	u64 timeslice_expiration;
+	struct dma_fence *pre_migrate_fence;
 };
 
 int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
@@ -238,7 +248,8 @@ struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page);
 void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation,
 			     struct device *dev, struct mm_struct *mm,
 			     const struct drm_pagemap_devmem_ops *ops,
-			     struct drm_pagemap *dpagemap, size_t size);
+			     struct drm_pagemap *dpagemap, size_t size,
+			     struct dma_fence *pre_migrate_fence);
 
 int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
 			    unsigned long start, unsigned long end,
-- 
cgit v1.2.3


From 06e219f6a706c367c93051f408ac61417643d2f9 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Mon, 15 Dec 2025 17:02:35 +0200
Subject: net: dsa: properly keep track of conduit reference

Problem description
-------------------

DSA has a mumbo-jumbo of reference handling of the conduit net device
and its kobject which, sadly, is just wrong and doesn't make sense.

There are two distinct problems.

1. The OF path, which uses of_find_net_device_by_node(), never releases
   the elevated refcount on the conduit's kobject. Nominally, the OF and
   non-OF paths should result in objects having identical reference
   counts taken, and it is already suspicious that
   dsa_dev_to_net_device() has a put_device() call which is missing in
   dsa_port_parse_of(), but we can actually even verify that an issue
   exists. With CONFIG_DEBUG_KOBJECT_RELEASE=y, if we run this command
   "before" and "after" applying this patch:

(unbind the conduit driver for net device eno2)
echo 0000:00:00.2 > /sys/bus/pci/drivers/fsl_enetc/unbind

we see these lines in the output diff which appear only with the patch
applied:

kobject: 'eno2' (ffff002009a3a6b8): kobject_release, parent 0000000000000000 (delayed 1000)
kobject: '109' (ffff0020099d59a0): kobject_release, parent 0000000000000000 (delayed 1000)

2. After we find the conduit interface one way (OF) or another (non-OF),
   it can get unregistered at any time, and DSA remains with a long-lived,
   but in this case stale, cpu_dp->conduit pointer. Holding the net
   device's underlying kobject isn't actually of much help, it just
   prevents it from being freed (but we never need that kobject
   directly). What helps us to prevent the net device from being
   unregistered is the parallel netdev reference mechanism (dev_hold()
   and dev_put()).

Actually we actually use that netdev tracker mechanism implicitly on
user ports since commit 2f1e8ea726e9 ("net: dsa: link interfaces with
the DSA master to get rid of lockdep warnings"), via netdev_upper_dev_link().
But time still passes at DSA switch probe time between the initial
of_find_net_device_by_node() code and the user port creation time, time
during which the conduit could unregister itself and DSA wouldn't know
about it.

So we have to run of_find_net_device_by_node() under rtnl_lock() to
prevent that from happening, and release the lock only with the netdev
tracker having acquired the reference.

Do we need to keep the reference until dsa_unregister_switch() /
dsa_switch_shutdown()?
1: Maybe yes. A switch device will still be registered even if all user
   ports failed to probe, see commit 86f8b1c01a0a ("net: dsa: Do not
   make user port errors fatal"), and the cpu_dp->conduit pointers
   remain valid.  I haven't audited all call paths to see whether they
   will actually use the conduit in lack of any user port, but if they
   do, it seems safer to not rely on user ports for that reference.
2. Definitely yes. We support changing the conduit which a user port is
   associated to, and we can get into a situation where we've moved all
   user ports away from a conduit, thus no longer hold any reference to
   it via the net device tracker. But we shouldn't let it go nonetheless
   - see the next change in relation to dsa_tree_find_first_conduit()
   and LAG conduits which disappear.
   We have to be prepared to return to the physical conduit, so the CPU
   port must explicitly keep another reference to it. This is also to
   say: the user ports and their CPU ports may not always keep a
   reference to the same conduit net device, and both are needed.

As for the conduit's kobject for the /sys/class/net/ entry, we don't
care about it, we can release it as soon as we hold the net device
object itself.

History and blame attribution
-----------------------------

The code has been refactored so many times, it is very difficult to
follow and properly attribute a blame, but I'll try to make a short
history which I hope to be correct.

We have two distinct probing paths:
- one for OF, introduced in 2016 in commit 83c0afaec7b7 ("net: dsa: Add
  new binding implementation")
- one for non-OF, introduced in 2017 in commit 71e0bbde0d88 ("net: dsa:
  Add support for platform data")

These are both complete rewrites of the original probing paths (which
used struct dsa_switch_driver and other weird stuff, instead of regular
devices on their respective buses for register access, like MDIO, SPI,
I2C etc):
- one for OF, introduced in 2013 in commit 5e95329b701c ("dsa: add
  device tree bindings to register DSA switches")
- one for non-OF, introduced in 2008 in commit 91da11f870f0 ("net:
  Distributed Switch Architecture protocol support")

except for tiny bits and pieces like dsa_dev_to_net_device() which were
seemingly carried over since the original commit, and used to this day.

The point is that the original probing paths received a fix in 2015 in
the form of commit 679fb46c5785 ("net: dsa: Add missing master netdev
dev_put() calls"), but the fix never made it into the "new" (dsa2)
probing paths that can still be traced to today, and the fixed probing
path was later deleted in 2019 in commit 93e86b3bc842 ("net: dsa: Remove
legacy probing support").

That is to say, the new probing paths were never quite correct in this
area.

The existence of the legacy probing support which was deleted in 2019
explains why dsa_dev_to_net_device() returns a conduit with elevated
refcount (because it was supposed to be released during
dsa_remove_dst()). After the removal of the legacy code, the only user
of dsa_dev_to_net_device() calls dev_put(conduit) immediately after this
function returns. This pattern makes no sense today, and can only be
interpreted historically to understand why dev_hold() was there in the
first place.

Change details
--------------

Today we have a better netdev tracking infrastructure which we should
use. Logically netdev_hold() belongs in common code
(dsa_port_parse_cpu(), where dp->conduit is assigned), but there is a
tradeoff to be made with the rtnl_lock() section which would become a
bit too long if we did that - dsa_port_parse_cpu() also calls
request_module(). So we duplicate a bit of logic in order for the
callers of dsa_port_parse_cpu() to be the ones responsible of holding
the conduit reference and releasing it on error. This shortens the
rtnl_lock() section significantly.

In the dsa_switch_probe() error path, dsa_switch_release_ports() will be
called in a number of situations, one being where dsa_port_parse_cpu()
maybe didn't get the chance to run at all (a different port failed
earlier, etc). So we have to test for the conduit being NULL prior to
calling netdev_put().

There have still been so many transformations to the code since the
blamed commits (rename master -> conduit, commit 0650bf52b31f ("net:
dsa: be compatible with masters which unregister on shutdown")), that it
only makes sense to fix the code using the best methods available today
and see how it can be backported to stable later. I suspect the fix
cannot even be backported to kernels which lack dsa_switch_shutdown(),
and I suspect this is also maybe why the long-lived conduit reference
didn't make it into the new DSA probing paths at the time (problems
during shutdown).

Because dsa_dev_to_net_device() has a single call site and has to be
changed anyway, the logic was just absorbed into the non-OF
dsa_port_parse().

Tested on the ocelot/felix switch and on dsa_loop, both on the NXP
LS1028A with CONFIG_DEBUG_KOBJECT_RELEASE=y.

Reported-by: Ma Ke <make24@iscas.ac.cn>
Closes: https://lore.kernel.org/netdev/20251214131204.4684-1-make24@iscas.ac.cn/
Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation")
Fixes: 71e0bbde0d88 ("net: dsa: Add support for platform data")
Reviewed-by: Jonas Gorski <jonas.gorski@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Link: https://patch.msgid.link/20251215150236.3931670-1-vladimir.oltean@nxp.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/net/dsa.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/net/dsa.h b/include/net/dsa.h
index cced1a866757..6b2b5ed64ea4 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -302,6 +302,7 @@ struct dsa_port {
 	struct devlink_port	devlink_port;
 	struct phylink		*pl;
 	struct phylink_config	pl_config;
+	netdevice_tracker	conduit_tracker;
 	struct dsa_lag		*lag;
 	struct net_device	*hsr_dev;
 
-- 
cgit v1.2.3


From 5393802c94e0ab1295c04c94c57bcb00222d4674 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 27 Nov 2025 10:39:24 -0800
Subject: genalloc.h: fix htmldocs warning

WARNING: include/linux/genalloc.h:52 function parameter 'start_addr' not described in 'genpool_algo_t'

Fixes: 52fbf1134d47 ("lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Closes: https://lkml.kernel.org/r/20251127130624.563597e3@canb.auug.org.au
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Tested-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Alexey Skidanov <alexey.skidanov@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/genalloc.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h
index 0bd581003cd5..60de63e46b33 100644
--- a/include/linux/genalloc.h
+++ b/include/linux/genalloc.h
@@ -44,6 +44,7 @@ struct gen_pool;
  * @nr: The number of zeroed bits we're looking for
  * @data: optional additional data used by the callback
  * @pool: the pool being allocated from
+ * @start_addr: start address of memory chunk
  */
 typedef unsigned long (*genpool_algo_t)(unsigned long *map,
 			unsigned long size,
-- 
cgit v1.2.3


From 007f5da43b3d0ecff972e2616062b8da1f862f5e Mon Sep 17 00:00:00 2001
From: Jiayuan Chen <jiayuan.chen@linux.dev>
Date: Thu, 4 Dec 2025 18:59:55 +0000
Subject: mm/kasan: fix incorrect unpoisoning in vrealloc for KASAN

Patch series "kasan: vmalloc: Fixes for the percpu allocator and
vrealloc", v3.

Patches fix two issues related to KASAN and vmalloc.

The first one, a KASAN tag mismatch, possibly resulting in a kernel panic,
can be observed on systems with a tag-based KASAN enabled and with
multiple NUMA nodes.  Initially it was only noticed on x86 [1] but later a
similar issue was also reported on arm64 [2].

Specifically the problem is related to how vm_structs interact with
pcpu_chunks - both when they are allocated, assigned and when pcpu_chunk
addresses are derived.

When vm_structs are allocated they are unpoisoned, each with a different
random tag, if vmalloc support is enabled along the KASAN mode.  Later
when first pcpu chunk is allocated it gets its 'base_addr' field set to
the first allocated vm_struct.  With that it inherits that vm_struct's
tag.

When pcpu_chunk addresses are later derived (by pcpu_chunk_addr(), for
example in pcpu_alloc_noprof()) the base_addr field is used and offsets
are added to it.  If the initial conditions are satisfied then some of the
offsets will point into memory allocated with a different vm_struct.  So
while the lower bits will get accurately derived the tag bits in the top
of the pointer won't match the shadow memory contents.

The solution (proposed at v2 of the x86 KASAN series [3]) is to unpoison
the vm_structs with the same tag when allocating them for the per cpu
allocator (in pcpu_get_vm_areas()).

The second one reported by syzkaller [4] is related to vrealloc and
happens because of random tag generation when unpoisoning memory without
allocating new pages.  This breaks shadow memory tracking and needs to
reuse the existing tag instead of generating a new one.  At the same time
an inconsistency in used flags is corrected.


This patch (of 3):

Syzkaller reported a memory out-of-bounds bug [4].  This patch fixes two
issues:

1. In vrealloc the KASAN_VMALLOC_VM_ALLOC flag is missing when
   unpoisoning the extended region. This flag is required to correctly
   associate the allocation with KASAN's vmalloc tracking.

   Note: In contrast, vzalloc (via __vmalloc_node_range_noprof)
   explicitly sets KASAN_VMALLOC_VM_ALLOC and calls
   kasan_unpoison_vmalloc() with it.  vrealloc must behave consistently --
   especially when reusing existing vmalloc regions -- to ensure KASAN can
   track allocations correctly.

2. When vrealloc reuses an existing vmalloc region (without allocating
   new pages) KASAN generates a new tag, which breaks tag-based memory
   access tracking.

Introduce KASAN_VMALLOC_KEEP_TAG, a new KASAN flag that allows reusing the
tag already attached to the pointer, ensuring consistent tag behavior
during reallocation.

Pass KASAN_VMALLOC_KEEP_TAG and KASAN_VMALLOC_VM_ALLOC to the
kasan_unpoison_vmalloc inside vrealloc_node_align_noprof().

Link: https://lkml.kernel.org/r/cover.1765978969.git.m.wieczorretman@pm.me
Link: https://lkml.kernel.org/r/38dece0a4074c43e48150d1e242f8242c73bf1a5.1764874575.git.m.wieczorretman@pm.me
Link: https://lore.kernel.org/all/e7e04692866d02e6d3b32bb43b998e5d17092ba4.1738686764.git.maciej.wieczor-retman@intel.com/ [1]
Link: https://lore.kernel.org/all/aMUrW1Znp1GEj7St@MiWiFi-R3L-srv/ [2]
Link: https://lore.kernel.org/all/CAPAsAGxDRv_uFeMYu9TwhBVWHCCtkSxoWY4xmFB_vowMbi8raw@mail.gmail.com/ [3]
Link: https://syzkaller.appspot.com/bug?extid=997752115a851cb0cf36 [4]
Fixes: a0309faf1cb0 ("mm: vmalloc: support more granular vrealloc() sizing")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Co-developed-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Reported-by: syzbot+997752115a851cb0cf36@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/68e243a2.050a0220.1696c6.007d.GAE@google.com/T/
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Kees Cook <kees@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index f335c1d7b61d..df3d8567dde9 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -28,6 +28,7 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t;
 #define KASAN_VMALLOC_INIT		((__force kasan_vmalloc_flags_t)0x01u)
 #define KASAN_VMALLOC_VM_ALLOC		((__force kasan_vmalloc_flags_t)0x02u)
 #define KASAN_VMALLOC_PROT_NORMAL	((__force kasan_vmalloc_flags_t)0x04u)
+#define KASAN_VMALLOC_KEEP_TAG		((__force kasan_vmalloc_flags_t)0x08u)
 
 #define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */
 #define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
-- 
cgit v1.2.3


From 6f13db031e27e88213381039032a9cc061578ea6 Mon Sep 17 00:00:00 2001
From: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Date: Thu, 4 Dec 2025 19:00:04 +0000
Subject: kasan: refactor pcpu kasan vmalloc unpoison

A KASAN tag mismatch, possibly causing a kernel panic, can be observed
on systems with a tag-based KASAN enabled and with multiple NUMA nodes.
It was reported on arm64 and reproduced on x86. It can be explained in
the following points:

1. There can be more than one virtual memory chunk.
2. Chunk's base address has a tag.
3. The base address points at the first chunk and thus inherits
   the tag of the first chunk.
4. The subsequent chunks will be accessed with the tag from the
   first chunk.
5. Thus, the subsequent chunks need to have their tag set to
   match that of the first chunk.

Refactor code by reusing __kasan_unpoison_vmalloc in a new helper in
preparation for the actual fix.

Link: https://lkml.kernel.org/r/eb61d93b907e262eefcaa130261a08bcb6c5ce51.1764874575.git.m.wieczorretman@pm.me
Fixes: 1d96320f8d53 ("kasan, vmalloc: add vmalloc tagging for SW_TAGS")
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: Dmitriy Vyukov <dvyukov@google.com>
Cc: Jiayuan Chen <jiayuan.chen@linux.dev>
Cc: Kees Cook <kees@kernel.org>
Cc: Marco Elver <elver@google.com>
Cc: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
Cc: <stable@vger.kernel.org>	[6.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kasan.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index df3d8567dde9..9c6ac4b62eb9 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -631,6 +631,16 @@ static __always_inline void kasan_poison_vmalloc(const void *start,
 		__kasan_poison_vmalloc(start, size);
 }
 
+void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+				 kasan_vmalloc_flags_t flags);
+static __always_inline void
+kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+			  kasan_vmalloc_flags_t flags)
+{
+	if (kasan_enabled())
+		__kasan_unpoison_vmap_areas(vms, nr_vms, flags);
+}
+
 #else /* CONFIG_KASAN_VMALLOC */
 
 static inline void kasan_populate_early_vm_area_shadow(void *start,
@@ -655,6 +665,11 @@ static inline void *kasan_unpoison_vmalloc(const void *start,
 static inline void kasan_poison_vmalloc(const void *start, unsigned long size)
 { }
 
+static __always_inline void
+kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms,
+			  kasan_vmalloc_flags_t flags)
+{ }
+
 #endif /* CONFIG_KASAN_VMALLOC */
 
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
-- 
cgit v1.2.3


From 6ba776b533ca902631fa106b8a90811b3f40b08d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 14 Dec 2025 12:15:17 -0800
Subject: mm: leafops.h: correct kernel-doc function param. names

Modify the kernel-doc function parameter names to prevent kernel-doc
warnings:

Warning: include/linux/leafops.h:135 function parameter 'entry' not
 described in 'leafent_type'
Warning: include/linux/leafops.h:540 function parameter 'pte' not
 described in 'pte_is_uffd_marker'

Link: https://lkml.kernel.org/r/20251214201517.2187051-1-rdunlap@infradead.org
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/leafops.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/leafops.h b/include/linux/leafops.h
index cfafe7a5e7b1..a9ff94b744f2 100644
--- a/include/linux/leafops.h
+++ b/include/linux/leafops.h
@@ -133,7 +133,7 @@ static inline bool softleaf_is_none(softleaf_t entry)
 
 /**
  * softleaf_type() - Identify the type of leaf entry.
- * @enntry: Leaf entry.
+ * @entry: Leaf entry.
  *
  * Returns: the leaf entry type associated with @entry.
  */
@@ -534,7 +534,7 @@ static inline bool pte_is_uffd_wp_marker(pte_t pte)
 /**
  * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker
  * leaf entry?
- * @entry: Leaf entry.
+ * @pte: PTE entry.
  *
  * It's useful to be able to determine which leaf entries encode UFFD-specific
  * markers so we can handle these correctly.
-- 
cgit v1.2.3


From fe55ea85939efcbf0e6baa234f0d70acb79e7b58 Mon Sep 17 00:00:00 2001
From: Pingfan Liu <piliu@redhat.com>
Date: Tue, 16 Dec 2025 09:48:51 +0800
Subject: kernel/kexec: change the prototype of kimage_map_segment()

The kexec segment index will be required to extract the corresponding
information for that segment in kimage_map_segment().  Additionally,
kexec_segment already holds the kexec relocation destination address and
size.  Therefore, the prototype of kimage_map_segment() can be changed.

Link: https://lkml.kernel.org/r/20251216014852.8737-1-piliu@redhat.com
Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation")
Signed-off-by: Pingfan Liu <piliu@redhat.com>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Mimi Zohar <zohar@linux.ibm.com>
Cc: Roberto Sassu <roberto.sassu@huawei.com>
Cc: Alexander Graf <graf@amazon.com>
Cc: Steven Chen <chenste@linux.microsoft.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/kexec.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index ff7e231b0485..8a22bc9b8c6c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -530,7 +530,7 @@ extern bool kexec_file_dbg_print;
 #define kexec_dprintk(fmt, arg...) \
         do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0)
 
-extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size);
+extern void *kimage_map_segment(struct kimage *image, int idx);
 extern void kimage_unmap_segment(void *buffer);
 #else /* !CONFIG_KEXEC_CORE */
 struct pt_regs;
@@ -540,7 +540,7 @@ static inline void __crash_kexec(struct pt_regs *regs) { }
 static inline void crash_kexec(struct pt_regs *regs) { }
 static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 static inline int kexec_crash_loaded(void) { return 0; }
-static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size)
+static inline void *kimage_map_segment(struct kimage *image, int idx)
 { return NULL; }
 static inline void kimage_unmap_segment(void *buffer) { }
 #define kexec_in_progress false
-- 
cgit v1.2.3


From e6dbcb7c0e7b508d443a9aa6f77f63a2f83b1ae4 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal <ankita@nvidia.com>
Date: Thu, 11 Dec 2025 07:06:01 +0000
Subject: mm: fixup pfnmap memory failure handling to use pgoff

The memory failure handling implementation for the PFNMAP memory with no
struct pages is faulty.  The VA of the mapping is determined based on the
the PFN.  It should instead be based on the file mapping offset.

At the occurrence of poison, the memory_failure_pfn is triggered on the
poisoned PFN.  Introduce a callback function that allows mm to translate
the PFN to the corresponding file page offset.  The kernel module using
the registration API must implement the callback function and provide the
translation.  The translated value is then used to determine the VA
information and sending the SIGBUS to the usermode process mapped to the
poisoned PFN.

The callback is also useful for the driver to be notified of the poisoned
PFN, which may then track it.

Link: https://lkml.kernel.org/r/20251211070603.338701-2-ankita@nvidia.com
Fixes: 2ec41967189c ("mm: handle poisoning of pfn without struct pages")
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Cc: Matthew R. Ochs <mochs@nvidia.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Cc: Neo Jia <cjia@nvidia.com>
Cc: Vikram Sethi <vsethi@nvidia.com>
Cc: Yishai Hadas <yishaih@nvidia.com>
Cc: Zhi Wang <zhiw@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memory-failure.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
index bc326503d2d2..7b5e11cf905f 100644
--- a/include/linux/memory-failure.h
+++ b/include/linux/memory-failure.h
@@ -9,6 +9,8 @@ struct pfn_address_space;
 struct pfn_address_space {
 	struct interval_tree_node node;
 	struct address_space *mapping;
+	int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma,
+				unsigned long pfn, pgoff_t *pgoff);
 };
 
 int register_pfn_address_space(struct pfn_address_space *pfn_space);
-- 
cgit v1.2.3


From f183663901f21fe0fba8bd31ae894bc529709ee0 Mon Sep 17 00:00:00 2001
From: Bijan Tabatabai <bijan311@gmail.com>
Date: Tue, 16 Dec 2025 14:07:27 -0600
Subject: mm: consider non-anon swap cache folios in folio_expected_ref_count()

Currently, folio_expected_ref_count() only adds references for the swap
cache if the folio is anonymous.  However, according to the comment above
the definition of PG_swapcache in enum pageflags, shmem folios can also
have PG_swapcache set.  This patch makes sure references for the swap
cache are added if folio_test_swapcache(folio) is true.

This issue was found when trying to hot-unplug memory in a QEMU/KVM
virtual machine.  When initiating hot-unplug when most of the guest memory
is allocated, hot-unplug hangs partway through removal due to migration
failures.  The following message would be printed several times, and would
be printed again about every five seconds:

[   49.641309] migrating pfn b12f25 failed ret:7
[   49.641310] page: refcount:2 mapcount:0 mapping:0000000033bd8fe2 index:0x7f404d925 pfn:0xb12f25
[   49.641311] aops:swap_aops
[   49.641313] flags: 0x300000000030508(uptodate|active|owner_priv_1|reclaim|swapbacked|node=0|zone=3)
[   49.641314] raw: 0300000000030508 ffffed312c4bc908 ffffed312c4bc9c8 0000000000000000
[   49.641315] raw: 00000007f404d925 00000000000c823b 00000002ffffffff 0000000000000000
[   49.641315] page dumped because: migration failure

When debugging this, I found that these migration failures were due to
__migrate_folio() returning -EAGAIN for a small set of folios because the
expected reference count it calculates via folio_expected_ref_count() is
one less than the actual reference count of the folios.  Furthermore, all
of the affected folios were not anonymous, but had the PG_swapcache flag
set, inspiring this patch.  After applying this patch, the memory
hot-unplug behaves as expected.

I tested this on a machine running Ubuntu 24.04 with kernel version
6.8.0-90-generic and 64GB of memory.  The guest VM is managed by libvirt
and runs Ubuntu 24.04 with kernel version 6.18 (though the head of the
mm-unstable branch as a Dec 16, 2025 was also tested and behaves the same)
and 48GB of memory.  The libvirt XML definition for the VM can be found at
[1].  CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is set in the guest
kernel so the hot-pluggable memory is automatically onlined.

Below are the steps to reproduce this behavior:

1) Define and start and virtual machine
  host$ virsh -c qemu:///system define ./test_vm.xml # test_vm.xml from [1]
  host$ virsh -c qemu:///system start test_vm

2) Setup swap in the guest
  guest$ sudo fallocate -l 32G /swapfile
  guest$ sudo chmod 0600 /swapfile
  guest$ sudo mkswap /swapfile
  guest$ sudo swapon /swapfile

3) Use alloc_data [2] to allocate most of the remaining guest memory
  guest$ ./alloc_data 45

4) In a separate guest terminal, monitor the amount of used memory
  guest$ watch -n1 free -h

5) When alloc_data has finished allocating, initiate the memory
hot-unplug using the provided xml file [3]
  host$ virsh -c qemu:///system detach-device test_vm ./remove.xml --live

After initiating the memory hot-unplug, you should see the amount of
available memory in the guest decrease, and the amount of used swap data
increase.  If everything works as expected, when all of the memory is
unplugged, there should be around 8.5-9GB of data in swap.  If the
unplugging is unsuccessful, the amount of used swap data will settle below
that.  If that happens, you should be able to see log messages in dmesg
similar to the one posted above.

Link: https://lkml.kernel.org/r/20251216200727.2360228-1-bijan311@gmail.com
Link: https://github.com/BijanT/linux_patch_files/blob/main/test_vm.xml [1]
Link: https://github.com/BijanT/linux_patch_files/blob/main/alloc_data.c [2]
Link: https://github.com/BijanT/linux_patch_files/blob/main/remove.xml [3]
Fixes: 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation")
Signed-off-by: Bijan Tabatabai <bijan311@gmail.com>
Acked-by: David Hildenbrand (Red Hat) <david@kernel.org>
Acked-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Liam Howlett <liam.howlett@oracle.com>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Shivank Garg <shivankg@amd.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Kairui Song <ryncsn@gmail.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 15076261d0c2..6f959d8ca4b4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2459,10 +2459,10 @@ static inline int folio_expected_ref_count(const struct folio *folio)
 	if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio)))
 		return 0;
 
-	if (folio_test_anon(folio)) {
-		/* One reference per page from the swapcache. */
-		ref_count += folio_test_swapcache(folio) << order;
-	} else {
+	/* One reference per page from the swapcache. */
+	ref_count += folio_test_swapcache(folio) << order;
+
+	if (!folio_test_anon(folio)) {
 		/* One reference per page from the pagecache. */
 		ref_count += !!folio->mapping << order;
 		/* One reference from PG_private. */
-- 
cgit v1.2.3


From dc85a46928c41423ad89869baf05a589e2975575 Mon Sep 17 00:00:00 2001
From: Kevin Tian <kevin.tian@intel.com>
Date: Thu, 18 Dec 2025 08:16:49 +0000
Subject: vfio/pci: Disable qword access to the PCI ROM bar

Commit 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio
pci") enables qword access to the PCI bar resources. However certain
devices (e.g. Intel X710) are observed with problem upon qword accesses
to the rom bar, e.g. triggering PCI aer errors.

This is triggered by Qemu which caches the rom content by simply does a
pread() of the remaining size until it gets the full contents. The other
bars would only perform operations at the same access width as their
guest drivers.

Instead of trying to identify all broken devices, universally disable
qword access to the rom bar i.e. going back to the old way which worked
reliably for years.

Reported-by: Farrah Chen <farrah.chen@intel.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220740
Fixes: 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio pci")
Cc: stable@vger.kernel.org
Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Tested-by: Farrah Chen <farrah.chen@intel.com>
Link: https://lore.kernel.org/r/20251218081650.555015-2-kevin.tian@intel.com
Signed-off-by: Alex Williamson <alex@shazbot.org>
---
 include/linux/vfio_pci_core.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 706877f998ff..1ac86896875c 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -145,6 +145,13 @@ struct vfio_pci_core_device {
 	struct list_head	dmabufs;
 };
 
+enum vfio_pci_io_width {
+	VFIO_PCI_IO_WIDTH_1 = 1,
+	VFIO_PCI_IO_WIDTH_2 = 2,
+	VFIO_PCI_IO_WIDTH_4 = 4,
+	VFIO_PCI_IO_WIDTH_8 = 8,
+};
+
 /* Will be exported for vfio pci drivers usage */
 int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
 				      unsigned int type, unsigned int subtype,
@@ -188,7 +195,8 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
 			       void __iomem *io, char __user *buf,
 			       loff_t off, size_t count, size_t x_start,
-			       size_t x_end, bool iswrite);
+			       size_t x_end, bool iswrite,
+			       enum vfio_pci_io_width max_width);
 bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev);
 bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt,
 					 loff_t reg_start, size_t reg_cnt,
-- 
cgit v1.2.3


From f059588c552746e0fe299214f35c58effa715b74 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 4 Dec 2025 13:31:52 -0500
Subject: virtio: make it self-contained

virtio.h uses struct module, add a forward declaration to
make the header self-contained.

Message-ID: <9171b5cac60793eb59ab044c96ee038bf1363bee.1764873799.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 132a474e5914..3626eb694728 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -13,6 +13,8 @@
 #include <linux/completion.h>
 #include <linux/virtio_features.h>
 
+struct module;
+
 /**
  * struct virtqueue - a queue to register buffers for sending or receiving.
  * @list: the chain of virtqueues for this device
-- 
cgit v1.2.3


From e88dfb93311c81359b00c12e0b396bd0ea13ad6c Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Thu, 4 Dec 2025 12:49:34 -0500
Subject: virtio_features: make it self-contained

virtio_features.h uses WARN_ON_ONCE and memset so it must
include linux/bug.h and linux/string.h

Message-ID: <579986aa9b8d023844990d2a0e267382f8ad85d5.1764873799.git.mst@redhat.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/linux/virtio_features.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h
index ea2ad8717882..ce59ea91f474 100644
--- a/include/linux/virtio_features.h
+++ b/include/linux/virtio_features.h
@@ -3,6 +3,8 @@
 #define _LINUX_VIRTIO_FEATURES_H
 
 #include <linux/bits.h>
+#include <linux/bug.h>
+#include <linux/string.h>
 
 #define VIRTIO_FEATURES_U64S	2
 #define VIRTIO_FEATURES_BITS	(VIRTIO_FEATURES_U64S * 64)
-- 
cgit v1.2.3


From 5623eb1ed035f01dfa620366a82b667545b10c82 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Dec 2025 08:12:46 -0700
Subject: io_uring/tctx: add separate lock for list of tctx's in ctx

ctx->tcxt_list holds the tasks using this ring, and it's currently
protected by the normal ctx->uring_lock. However, this can cause a
circular locking issue, as reported by syzbot, where cancelations off
exec end up needing to remove an entry from this list:

======================================================
WARNING: possible circular locking dependency detected
syzkaller #0 Tainted: G             L
------------------------------------------------------
syz.0.9999/12287 is trying to acquire lock:
ffff88805851c0a8 (&ctx->uring_lock){+.+.}-{4:4}, at: io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179

but task is already holding lock:
ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline]
ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&sig->cred_guard_mutex){+.+.}-{4:4}:
       __mutex_lock_common kernel/locking/mutex.c:614 [inline]
       __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
       proc_pid_attr_write+0x547/0x630 fs/proc/base.c:2837
       vfs_write+0x27e/0xb30 fs/read_write.c:684
       ksys_write+0x145/0x250 fs/read_write.c:738
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #1 (sb_writers#3){.+.+}-{0:0}:
       percpu_down_read_internal include/linux/percpu-rwsem.h:53 [inline]
       percpu_down_read_freezable include/linux/percpu-rwsem.h:83 [inline]
       __sb_start_write include/linux/fs/super.h:19 [inline]
       sb_start_write+0x4d/0x1c0 include/linux/fs/super.h:125
       mnt_want_write+0x41/0x90 fs/namespace.c:499
       open_last_lookups fs/namei.c:4529 [inline]
       path_openat+0xadd/0x3dd0 fs/namei.c:4784
       do_filp_open+0x1fa/0x410 fs/namei.c:4814
       io_openat2+0x3e0/0x5c0 io_uring/openclose.c:143
       __io_issue_sqe+0x181/0x4b0 io_uring/io_uring.c:1792
       io_issue_sqe+0x165/0x1060 io_uring/io_uring.c:1815
       io_queue_sqe io_uring/io_uring.c:2042 [inline]
       io_submit_sqe io_uring/io_uring.c:2320 [inline]
       io_submit_sqes+0xbf4/0x2140 io_uring/io_uring.c:2434
       __do_sys_io_uring_enter io_uring/io_uring.c:3280 [inline]
       __se_sys_io_uring_enter+0x2e0/0x2b60 io_uring/io_uring.c:3219
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&ctx->uring_lock){+.+.}-{4:4}:
       check_prev_add kernel/locking/lockdep.c:3165 [inline]
       check_prevs_add kernel/locking/lockdep.c:3284 [inline]
       validate_chain kernel/locking/lockdep.c:3908 [inline]
       __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237
       lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868
       __mutex_lock_common kernel/locking/mutex.c:614 [inline]
       __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
       io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179
       io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195
       io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646
       io_uring_task_cancel include/linux/io_uring.h:24 [inline]
       begin_new_exec+0x10ed/0x2440 fs/exec.c:1131
       load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010
       search_binary_handler fs/exec.c:1669 [inline]
       exec_binprm fs/exec.c:1701 [inline]
       bprm_execve+0x92e/0x1400 fs/exec.c:1753
       do_execveat_common+0x510/0x6a0 fs/exec.c:1859
       do_execve fs/exec.c:1933 [inline]
       __do_sys_execve fs/exec.c:2009 [inline]
       __se_sys_execve fs/exec.c:2004 [inline]
       __x64_sys_execve+0x94/0xb0 fs/exec.c:2004
       do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
       do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

other info that might help us debug this:

Chain exists of:
  &ctx->uring_lock --> sb_writers#3 --> &sig->cred_guard_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&sig->cred_guard_mutex);
                               lock(sb_writers#3);
                               lock(&sig->cred_guard_mutex);
  lock(&ctx->uring_lock);

 *** DEADLOCK ***

1 lock held by syz.0.9999/12287:
 #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline]
 #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733

stack backtrace:
CPU: 0 UID: 0 PID: 12287 Comm: syz.0.9999 Tainted: G             L      syzkaller #0 PREEMPT(full)
Tainted: [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 print_circular_bug+0x2e2/0x300 kernel/locking/lockdep.c:2043
 check_noncircular+0x12e/0x150 kernel/locking/lockdep.c:2175
 check_prev_add kernel/locking/lockdep.c:3165 [inline]
 check_prevs_add kernel/locking/lockdep.c:3284 [inline]
 validate_chain kernel/locking/lockdep.c:3908 [inline]
 __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237
 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868
 __mutex_lock_common kernel/locking/mutex.c:614 [inline]
 __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776
 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179
 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195
 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646
 io_uring_task_cancel include/linux/io_uring.h:24 [inline]
 begin_new_exec+0x10ed/0x2440 fs/exec.c:1131
 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010
 search_binary_handler fs/exec.c:1669 [inline]
 exec_binprm fs/exec.c:1701 [inline]
 bprm_execve+0x92e/0x1400 fs/exec.c:1753
 do_execveat_common+0x510/0x6a0 fs/exec.c:1859
 do_execve fs/exec.c:1933 [inline]
 __do_sys_execve fs/exec.c:2009 [inline]
 __se_sys_execve fs/exec.c:2004 [inline]
 __x64_sys_execve+0x94/0xb0 fs/exec.c:2004
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7ff3a8b8f749
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ff3a9a97038 EFLAGS: 00000246 ORIG_RAX: 000000000000003b
RAX: ffffffffffffffda RBX: 00007ff3a8de5fa0 RCX: 00007ff3a8b8f749
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000400
RBP: 00007ff3a8c13f91 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ff3a8de6038 R14: 00007ff3a8de5fa0 R15: 00007ff3a8f0fa28
 </TASK>

Add a separate lock just for the tctx_list, tctx_lock. This can nest
under ->uring_lock, where necessary, and be used separately for list
manipulation. For the cancelation off exec side, this removes the
need to grab ->uring_lock, hence fixing the circular locking
dependency.

Reported-by: syzbot+b0e3b77ffaa8a4067ce5@syzkaller.appspotmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1adb0d20a0a..a3e8ddc9b380 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -424,11 +424,17 @@ struct io_ring_ctx {
 	struct user_struct		*user;
 	struct mm_struct		*mm_account;
 
+	/*
+	 * List of tctx nodes for this ctx, protected by tctx_lock. For
+	 * cancelation purposes, nests under uring_lock.
+	 */
+	struct list_head		tctx_list;
+	struct mutex			tctx_lock;
+
 	/* ctx exit and cancelation */
 	struct llist_head		fallback_llist;
 	struct delayed_work		fallback_work;
 	struct work_struct		exit_work;
-	struct list_head		tctx_list;
 	struct completion		ref_comp;
 
 	/* io-wq management, e.g. thread count */
-- 
cgit v1.2.3