From c8f3c9fa75ff3822b56b47d5cfa0aaa484040ea8 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 15 Dec 2025 12:10:35 +0200 Subject: ASoC: soc-acpi / SOF: Add best_effort flag to get_function_tplg_files op When there is no fallback possibility available for the function topology use it is better to try to create a profile for the card in best effort manner, leaving out non supported links for example. As an example: some laptops present SSPx-BT link but we don't have fragment yet to support this. If we only have support for functional topology without monolithic fallback then we would fail the card creation. The reason why the monolithic topology works on the same device is that it does not have the SSPx-BT link handled, it is ignored. In case when there is no fallback possibility we should try to create the card with links that we support as best effort instead of failing and leaving the user without a card. Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Link: https://patch.msgid.link/20251215101036.9370-2-peter.ujfalusi@linux.intel.com Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index 90d73b9bddab..0519afd7217f 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -203,6 +203,8 @@ struct snd_soc_acpi_link_adr { * @mach: the pointer of the machine driver * @prefix: the prefix of the topology file name. Typically, it is the path. * @tplg_files: the pointer of the array of the topology file names. + * @best_effort: ignore non supported links and try to build the card in best effort + * with supported links */ /* Descriptor for SST ASoC machine driver */ struct snd_soc_acpi_mach { @@ -224,7 +226,8 @@ struct snd_soc_acpi_mach { const u32 tplg_quirk_mask; int (*get_function_tplg_files)(struct snd_soc_card *card, const struct snd_soc_acpi_mach *mach, - const char *prefix, const char ***tplg_files); + const char *prefix, const char ***tplg_files, + bool best_effort); }; #define SND_SOC_ACPI_MAX_CODECS 3 -- cgit v1.2.3 From 2dc675f614850b80deab7cf6d12902636ed8a7f4 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Dec 2025 14:33:05 +0100 Subject: RDMA/ucma: Fix rdma_ucm_query_ib_service_resp struct padding On a few 32-bit architectures, the newly added ib_user_service_rec structure is not 64-bit aligned the way it is on most regular ones. Add explicit padding into the rdma_ucm_query_ib_service_resp and rdma_ucm_resolve_ib_service structures that embed it, so that the layout is compatible across all of them. This is an ABI change on i386, aligning it with x86_64 and the other 64-bit architectures to avoid having to use a compat ioctl handler. Fixes: 810f874eda8e ("RDMA/ucma: Support query resolved service records") Link: https://patch.msgid.link/r/20251208133311.313977-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/rdma_user_cm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 5ded174687ee..838f8d460256 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -192,6 +192,7 @@ struct rdma_ucm_query_path_resp { struct rdma_ucm_query_ib_service_resp { __u32 num_service_recs; + __u32 reserved; struct ib_user_service_rec recs[]; }; @@ -354,7 +355,7 @@ enum { #define RDMA_USER_CM_IB_SERVICE_NAME_SIZE 64 struct rdma_ucm_ib_service { - __u64 service_id; + __aligned_u64 service_id; __u8 service_name[RDMA_USER_CM_IB_SERVICE_NAME_SIZE]; __u32 flags; __u32 reserved; @@ -362,6 +363,7 @@ struct rdma_ucm_ib_service { struct rdma_ucm_resolve_ib_service { __u32 id; + __u32 reserved; struct rdma_ucm_ib_service ibs; }; -- cgit v1.2.3 From d95e99a74eaf35c070f5939295331e5d7857c723 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Dec 2025 14:38:44 +0100 Subject: RDMA/irdma: Fix irdma_alloc_ucontext_resp padding A recent commit modified struct irdma_alloc_ucontext_resp by adding a member with implicit padding in front of it, though this does not change the offset of the data members other than m68k. Reported by scripts/check-uapi.sh: ==== ABI differences detected in include/rdma/irdma-abi.h from 1dd7bde2e91c -> HEAD ==== [C] 'struct irdma_alloc_ucontext_resp' changed: type size changed from 704 to 640 (in bits) 1 data member deletion: '__u8 rsvd3[2]', at offset 640 (in bits) at irdma-abi.h:61:1 1 data member insertion: '__u8 revd3[2]', at offset 592 (in bits) at irdma-abi.h:60:1 Change the size back to the previous version, and remove the implicit padding by making it explicit and matching what x86-64 would do by placing max_hw_srq_quanta member into a naturally aligned location. Fixes: 563e1feb5f6e ("RDMA/irdma: Add SRQ support") Link: https://patch.msgid.link/r/20251208133849.315451-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Geert Uytterhoeven Tested-by: Jacob Moroni Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/irdma-abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/irdma-abi.h b/include/uapi/rdma/irdma-abi.h index f7788d33376b..36f20802bcc8 100644 --- a/include/uapi/rdma/irdma-abi.h +++ b/include/uapi/rdma/irdma-abi.h @@ -57,8 +57,8 @@ struct irdma_alloc_ucontext_resp { __u8 rsvd2; __aligned_u64 comp_mask; __u16 min_hw_wq_size; + __u8 revd3[2]; __u32 max_hw_srq_quanta; - __u8 rsvd3[2]; }; struct irdma_alloc_pd_resp { -- cgit v1.2.3 From 4a824c3128998158a093eaadd776a79abe3a601a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Dec 2025 15:31:27 +0000 Subject: entry: Always inline local_irq_{enable,disable}_exit_to_user() clang needs __always_inline instead of inline, even for tiny helpers. This saves some cycles in system call fast path, and saves 195 bytes on x86_64 build: $ size vmlinux.before vmlinux.after text data bss dec hex filename 34652814 22291961 5875180 62819955 3be8e73 vmlinux.before 34652619 22291961 5875180 62819760 3be8db0 vmlinux.after Signed-off-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251204153127.1321824-1-edumazet@google.com --- include/linux/irq-entry-common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 6ab913e57da0..d26d1b1bcbfb 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -110,7 +110,7 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs) static inline void local_irq_enable_exit_to_user(unsigned long ti_work); #ifndef local_irq_enable_exit_to_user -static inline void local_irq_enable_exit_to_user(unsigned long ti_work) +static __always_inline void local_irq_enable_exit_to_user(unsigned long ti_work) { local_irq_enable(); } @@ -125,7 +125,7 @@ static inline void local_irq_enable_exit_to_user(unsigned long ti_work) static inline void local_irq_disable_exit_to_user(void); #ifndef local_irq_disable_exit_to_user -static inline void local_irq_disable_exit_to_user(void) +static __always_inline void local_irq_disable_exit_to_user(void) { local_irq_disable(); } -- cgit v1.2.3 From b61104e7a6349bd2c2b3e2fb3260d87f15eda8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 22 Dec 2025 08:45:48 +0100 Subject: regulator: uapi: Use UAPI integer type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. Use the fixed-width integer type provided by the UAPI headers instead. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251222-uapi-regulator-v1-1-a71c66eb1a94@linutronix.de Signed-off-by: Mark Brown --- include/uapi/regulator/regulator.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/uapi/regulator/regulator.h b/include/uapi/regulator/regulator.h index 71bf71a22e7f..c4f2d1c19828 100644 --- a/include/uapi/regulator/regulator.h +++ b/include/uapi/regulator/regulator.h @@ -8,11 +8,7 @@ #ifndef _UAPI_REGULATOR_H #define _UAPI_REGULATOR_H -#ifdef __KERNEL__ #include -#else -#include -#endif /* * Regulator notifier events. @@ -62,7 +58,7 @@ struct reg_genl_event { char reg_name[32]; - uint64_t event; + __u64 event; }; /* attributes of reg_genl_family */ -- cgit v1.2.3 From 87e7f6019097746d1d06f98874a9f179b7a68f3e Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Fri, 19 Dec 2025 10:36:38 +0200 Subject: software node: Also support referencing non-constant software nodes Fwnode references are be implemented differently if referenced node is a software node. _Generic() is used to differentiate between the two cases but only const software nodes were present in the selection. Also add non-const software nodes. Reported-by: Kenneth Crudup Closes: https://lore.kernel.org/all/af773b82-bef2-4209-baaf-526d4661b7fc@panix.com/ Fixes: d7cdbbc93c56 ("software node: allow referencing firmware nodes") Signed-off-by: Sakari Ailus Tested-By: Kenneth R. Crudup Tested-by: Mehdi Djait # Dell XPS 9315 Reviewed-by: Mehdi Djait Link: https://patch.msgid.link/20251219083638.2454138-1-sakari.ailus@linux.intel.com Signed-off-by: Danilo Krummrich --- include/linux/property.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/property.h b/include/linux/property.h index 272bfbdea7bf..e30ef23a9af3 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -371,6 +371,7 @@ struct software_node_ref_args { (const struct software_node_ref_args) { \ .swnode = _Generic(_ref_, \ const struct software_node *: _ref_, \ + struct software_node *: _ref_, \ default: NULL), \ .fwnode = _Generic(_ref_, \ struct fwnode_handle *: _ref_, \ -- cgit v1.2.3 From 20e20b147cf7cb6780a5b95da2a0e37c52cd1015 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 15 Dec 2025 22:38:00 -0800 Subject: platform/x86/intel/vsec: correct kernel-doc comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix kernel-doc warnings in intel_vsec.h to eliminate all kernel-doc warnings: Warning: include/linux/intel_vsec.h:92 struct member 'read_telem' not described in 'pmt_callbacks' Warning: include/linux/intel_vsec.h:146 expecting prototype for struct intel_sec_device. Prototype was for struct intel_vsec_device instead Warning: include/linux/intel_vsec.h:146 struct member 'priv_data_size' not described in 'intel_vsec_device' In struct pmt_callbacks, correct the kernel-doc for @read_telem. kernel-doc doesn't support documenting callback function parameters, so drop the '@' signs on those and use "* *" to make them somewhat readable in the produced documentation output. Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251216063801.2896495-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/intel_vsec.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/intel_vsec.h b/include/linux/intel_vsec.h index 53f6fe88e369..1a0f357c2427 100644 --- a/include/linux/intel_vsec.h +++ b/include/linux/intel_vsec.h @@ -80,13 +80,13 @@ enum intel_vsec_quirks { /** * struct pmt_callbacks - Callback infrastructure for PMT devices - * ->read_telem() when specified, called by client driver to access PMT data (instead - * of direct copy). - * @pdev: PCI device reference for the callback's use - * @guid: ID of data to acccss - * @data: buffer for the data to be copied - * @off: offset into the requested buffer - * @count: size of buffer + * @read_telem: when specified, called by client driver to access PMT + * data (instead of direct copy). + * * pdev: PCI device reference for the callback's use + * * guid: ID of data to acccss + * * data: buffer for the data to be copied + * * off: offset into the requested buffer + * * count: size of buffer */ struct pmt_callbacks { int (*read_telem)(struct pci_dev *pdev, u32 guid, u64 *data, loff_t off, u32 count); @@ -120,7 +120,7 @@ struct intel_vsec_platform_info { }; /** - * struct intel_sec_device - Auxbus specific device information + * struct intel_vsec_device - Auxbus specific device information * @auxdev: auxbus device struct for auxbus access * @pcidev: pci device associated with the device * @resource: any resources shared by the parent @@ -128,6 +128,7 @@ struct intel_vsec_platform_info { * @num_resources: number of resources * @id: xarray id * @priv_data: any private data needed + * @priv_data_size: size of private data area * @quirks: specified quirks * @base_addr: base address of entries (if specified) * @cap_id: the enumerated id of the vsec feature -- cgit v1.2.3 From c31f4aa8fed048fa70e742c4bb49bb48dc489ab3 Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 19 Dec 2025 16:52:58 +0800 Subject: kunit: Enforce task execution in {soft,hard}irq contexts The kunit_run_irq_test() helper allows a function to be run in hardirq and softirq contexts (in addition to the task context). It does this by running the user-provided function concurrently in the three contexts, until either a timeout has expired or a number of iterations have completed in the normal task context. However, on setups where the initialisation of the hardirq and softirq contexts (or, indeed, the scheduling of those tasks) is significantly slower than the function execution, it's possible for that number of iterations to be exceeded before any runs in irq contexts actually occur. This occurs with the polyval.test_polyval_preparekey_in_irqs test, which runs 20000 iterations of the relatively fast preparekey function, and therefore fails often under many UML, 32-bit arm, m68k and other environments. Instead, ensure that the max_iterations limit counts executions in all three contexts, and requires at least one of each. This will cause the test to continue iterating until at least the irq contexts have been tested, or the 1s wall-clock limit has been exceeded. This causes the test to pass in all of my environments. In so doing, we also update the task counters to atomic ints, to better match both the 'int' max_iterations input, and to ensure they are correctly updated across contexts. Finally, we also fix a few potential assertion messages to be less-specific to the original crypto usecases. Fixes: 950a81224e8b ("lib/crypto: tests: Add hash-test-template.h and gen-hash-testvecs.py") Signed-off-by: David Gow Link: https://lore.kernel.org/r/20251219085259.1163048-1-davidgow@google.com Signed-off-by: Eric Biggers --- include/kunit/run-in-irq-context.h | 53 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/kunit/run-in-irq-context.h b/include/kunit/run-in-irq-context.h index 108e96433ea4..c89b1b1b12dd 100644 --- a/include/kunit/run-in-irq-context.h +++ b/include/kunit/run-in-irq-context.h @@ -20,8 +20,8 @@ struct kunit_irq_test_state { bool task_func_reported_failure; bool hardirq_func_reported_failure; bool softirq_func_reported_failure; - unsigned long hardirq_func_calls; - unsigned long softirq_func_calls; + atomic_t hardirq_func_calls; + atomic_t softirq_func_calls; struct hrtimer timer; struct work_struct bh_work; }; @@ -32,7 +32,7 @@ static enum hrtimer_restart kunit_irq_test_timer_func(struct hrtimer *timer) container_of(timer, typeof(*state), timer); WARN_ON_ONCE(!in_hardirq()); - state->hardirq_func_calls++; + atomic_inc(&state->hardirq_func_calls); if (!state->func(state->test_specific_state)) state->hardirq_func_reported_failure = true; @@ -48,7 +48,7 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work) container_of(work, typeof(*state), bh_work); WARN_ON_ONCE(!in_serving_softirq()); - state->softirq_func_calls++; + atomic_inc(&state->softirq_func_calls); if (!state->func(state->test_specific_state)) state->softirq_func_reported_failure = true; @@ -59,7 +59,10 @@ static void kunit_irq_test_bh_work_func(struct work_struct *work) * hardirq context concurrently, and reports a failure to KUnit if any * invocation of @func in any context returns false. @func is passed * @test_specific_state as its argument. At most 3 invocations of @func will - * run concurrently: one in each of task, softirq, and hardirq context. + * run concurrently: one in each of task, softirq, and hardirq context. @func + * will continue running until either @max_iterations calls have been made (so + * long as at least one each runs in task, softirq, and hardirq contexts), or + * one second has passed. * * The main purpose of this interrupt context testing is to validate fallback * code paths that run in contexts where the normal code path cannot be used, @@ -85,6 +88,8 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *), .test_specific_state = test_specific_state, }; unsigned long end_jiffies; + int hardirq_calls, softirq_calls; + bool allctx = false; /* * Set up a hrtimer (the way we access hardirq context) and a work @@ -94,14 +99,25 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *), CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); INIT_WORK_ONSTACK(&state.bh_work, kunit_irq_test_bh_work_func); - /* Run for up to max_iterations or 1 second, whichever comes first. */ + /* + * Run for up to max_iterations (including at least one task, softirq, + * and hardirq), or 1 second, whichever comes first. + */ end_jiffies = jiffies + HZ; hrtimer_start(&state.timer, KUNIT_IRQ_TEST_HRTIMER_INTERVAL, HRTIMER_MODE_REL_HARD); - for (int i = 0; i < max_iterations && !time_after(jiffies, end_jiffies); - i++) { + for (int task_calls = 0, calls = 0; + ((calls < max_iterations) || !allctx) && + !time_after(jiffies, end_jiffies); + task_calls++) { if (!func(test_specific_state)) state.task_func_reported_failure = true; + + hardirq_calls = atomic_read(&state.hardirq_func_calls); + softirq_calls = atomic_read(&state.softirq_func_calls); + calls = task_calls + hardirq_calls + softirq_calls; + allctx = (task_calls > 0) && (hardirq_calls > 0) && + (softirq_calls > 0); } /* Cancel the timer and work. */ @@ -109,21 +125,18 @@ static inline void kunit_run_irq_test(struct kunit *test, bool (*func)(void *), flush_work(&state.bh_work); /* Sanity check: the timer and BH functions should have been run. */ - KUNIT_EXPECT_GT_MSG(test, state.hardirq_func_calls, 0, + KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.hardirq_func_calls), 0, "Timer function was not called"); - KUNIT_EXPECT_GT_MSG(test, state.softirq_func_calls, 0, + KUNIT_EXPECT_GT_MSG(test, atomic_read(&state.softirq_func_calls), 0, "BH work function was not called"); - /* Check for incorrect hash values reported from any context. */ - KUNIT_EXPECT_FALSE_MSG( - test, state.task_func_reported_failure, - "Incorrect hash values reported from task context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.hardirq_func_reported_failure, - "Incorrect hash values reported from hardirq context"); - KUNIT_EXPECT_FALSE_MSG( - test, state.softirq_func_reported_failure, - "Incorrect hash values reported from softirq context"); + /* Check for failure reported from any context. */ + KUNIT_EXPECT_FALSE_MSG(test, state.task_func_reported_failure, + "Failure reported from task context"); + KUNIT_EXPECT_FALSE_MSG(test, state.hardirq_func_reported_failure, + "Failure reported from hardirq context"); + KUNIT_EXPECT_FALSE_MSG(test, state.softirq_func_reported_failure, + "Failure reported from softirq context"); } #endif /* _KUNIT_RUN_IN_IRQ_CONTEXT_H */ -- cgit v1.2.3 From 754c23238438600e9236719f7e67aff2c4d02093 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Hellstr=C3=B6m?= Date: Fri, 19 Dec 2025 12:32:59 +0100 Subject: drm/pagemap, drm/xe: Ensure that the devmem allocation is idle before use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In situations where no system memory is migrated to devmem, and in upcoming patches where another GPU is performing the migration to the newly allocated devmem buffer, there is nothing to ensure any ongoing clear to the devmem allocation or async eviction from the devmem allocation is complete. Address that by passing a struct dma_fence down to the copy functions, and ensure it is waited for before migration is marked complete. v3: - New patch. v4: - Update the logic used for determining when to wait for the pre_migrate_fence. - Update the logic used for determining when to warn for the pre_migrate_fence since the scheduler fences apparently can signal out-of-order. v5: - Fix a UAF (CI) - Remove references to source P2P migration (Himal) - Put the pre_migrate_fence after migration. v6: - Pipeline the pre_migrate_fence dependency (Matt Brost) Fixes: c5b3eb5a906c ("drm/xe: Add GPUSVM device memory copy vfunc functions") Cc: Matthew Brost Cc: # v6.15+ Signed-off-by: Thomas Hellström Reviewed-by: Matthew Brost Acked-by: Maarten Lankhorst # For merging through drm-xe. Link: https://patch.msgid.link/20251219113320.183860-4-thomas.hellstrom@linux.intel.com (cherry picked from commit 16b5ad31952476fb925c401897fc171cd37f536b) Signed-off-by: Thomas Hellström --- include/drm/drm_pagemap.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h index f6e7e234c089..70a7991f784f 100644 --- a/include/drm/drm_pagemap.h +++ b/include/drm/drm_pagemap.h @@ -8,6 +8,7 @@ #define NR_PAGES(order) (1U << (order)) +struct dma_fence; struct drm_pagemap; struct drm_pagemap_zdd; struct device; @@ -174,6 +175,8 @@ struct drm_pagemap_devmem_ops { * @pages: Pointer to array of device memory pages (destination) * @pagemap_addr: Pointer to array of DMA information (source) * @npages: Number of pages to copy + * @pre_migrate_fence: dma-fence to wait for before migration start. + * May be NULL. * * Copy pages to device memory. If the order of a @pagemap_addr entry * is greater than 0, the entry is populated but subsequent entries @@ -183,13 +186,16 @@ struct drm_pagemap_devmem_ops { */ int (*copy_to_devmem)(struct page **pages, struct drm_pagemap_addr *pagemap_addr, - unsigned long npages); + unsigned long npages, + struct dma_fence *pre_migrate_fence); /** * @copy_to_ram: Copy to system RAM (required for migration) * @pages: Pointer to array of device memory pages (source) * @pagemap_addr: Pointer to array of DMA information (destination) * @npages: Number of pages to copy + * @pre_migrate_fence: dma-fence to wait for before migration start. + * May be NULL. * * Copy pages to system RAM. If the order of a @pagemap_addr entry * is greater than 0, the entry is populated but subsequent entries @@ -199,7 +205,8 @@ struct drm_pagemap_devmem_ops { */ int (*copy_to_ram)(struct page **pages, struct drm_pagemap_addr *pagemap_addr, - unsigned long npages); + unsigned long npages, + struct dma_fence *pre_migrate_fence); }; /** @@ -212,6 +219,8 @@ struct drm_pagemap_devmem_ops { * @dpagemap: The struct drm_pagemap of the pages this allocation belongs to. * @size: Size of device memory allocation * @timeslice_expiration: Timeslice expiration in jiffies + * @pre_migrate_fence: Fence to wait for or pipeline behind before migration starts. + * (May be NULL). */ struct drm_pagemap_devmem { struct device *dev; @@ -221,6 +230,7 @@ struct drm_pagemap_devmem { struct drm_pagemap *dpagemap; size_t size; u64 timeslice_expiration; + struct dma_fence *pre_migrate_fence; }; int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation, @@ -238,7 +248,8 @@ struct drm_pagemap *drm_pagemap_page_to_dpagemap(struct page *page); void drm_pagemap_devmem_init(struct drm_pagemap_devmem *devmem_allocation, struct device *dev, struct mm_struct *mm, const struct drm_pagemap_devmem_ops *ops, - struct drm_pagemap *dpagemap, size_t size); + struct drm_pagemap *dpagemap, size_t size, + struct dma_fence *pre_migrate_fence); int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap, unsigned long start, unsigned long end, -- cgit v1.2.3 From 06e219f6a706c367c93051f408ac61417643d2f9 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 15 Dec 2025 17:02:35 +0200 Subject: net: dsa: properly keep track of conduit reference Problem description ------------------- DSA has a mumbo-jumbo of reference handling of the conduit net device and its kobject which, sadly, is just wrong and doesn't make sense. There are two distinct problems. 1. The OF path, which uses of_find_net_device_by_node(), never releases the elevated refcount on the conduit's kobject. Nominally, the OF and non-OF paths should result in objects having identical reference counts taken, and it is already suspicious that dsa_dev_to_net_device() has a put_device() call which is missing in dsa_port_parse_of(), but we can actually even verify that an issue exists. With CONFIG_DEBUG_KOBJECT_RELEASE=y, if we run this command "before" and "after" applying this patch: (unbind the conduit driver for net device eno2) echo 0000:00:00.2 > /sys/bus/pci/drivers/fsl_enetc/unbind we see these lines in the output diff which appear only with the patch applied: kobject: 'eno2' (ffff002009a3a6b8): kobject_release, parent 0000000000000000 (delayed 1000) kobject: '109' (ffff0020099d59a0): kobject_release, parent 0000000000000000 (delayed 1000) 2. After we find the conduit interface one way (OF) or another (non-OF), it can get unregistered at any time, and DSA remains with a long-lived, but in this case stale, cpu_dp->conduit pointer. Holding the net device's underlying kobject isn't actually of much help, it just prevents it from being freed (but we never need that kobject directly). What helps us to prevent the net device from being unregistered is the parallel netdev reference mechanism (dev_hold() and dev_put()). Actually we actually use that netdev tracker mechanism implicitly on user ports since commit 2f1e8ea726e9 ("net: dsa: link interfaces with the DSA master to get rid of lockdep warnings"), via netdev_upper_dev_link(). But time still passes at DSA switch probe time between the initial of_find_net_device_by_node() code and the user port creation time, time during which the conduit could unregister itself and DSA wouldn't know about it. So we have to run of_find_net_device_by_node() under rtnl_lock() to prevent that from happening, and release the lock only with the netdev tracker having acquired the reference. Do we need to keep the reference until dsa_unregister_switch() / dsa_switch_shutdown()? 1: Maybe yes. A switch device will still be registered even if all user ports failed to probe, see commit 86f8b1c01a0a ("net: dsa: Do not make user port errors fatal"), and the cpu_dp->conduit pointers remain valid. I haven't audited all call paths to see whether they will actually use the conduit in lack of any user port, but if they do, it seems safer to not rely on user ports for that reference. 2. Definitely yes. We support changing the conduit which a user port is associated to, and we can get into a situation where we've moved all user ports away from a conduit, thus no longer hold any reference to it via the net device tracker. But we shouldn't let it go nonetheless - see the next change in relation to dsa_tree_find_first_conduit() and LAG conduits which disappear. We have to be prepared to return to the physical conduit, so the CPU port must explicitly keep another reference to it. This is also to say: the user ports and their CPU ports may not always keep a reference to the same conduit net device, and both are needed. As for the conduit's kobject for the /sys/class/net/ entry, we don't care about it, we can release it as soon as we hold the net device object itself. History and blame attribution ----------------------------- The code has been refactored so many times, it is very difficult to follow and properly attribute a blame, but I'll try to make a short history which I hope to be correct. We have two distinct probing paths: - one for OF, introduced in 2016 in commit 83c0afaec7b7 ("net: dsa: Add new binding implementation") - one for non-OF, introduced in 2017 in commit 71e0bbde0d88 ("net: dsa: Add support for platform data") These are both complete rewrites of the original probing paths (which used struct dsa_switch_driver and other weird stuff, instead of regular devices on their respective buses for register access, like MDIO, SPI, I2C etc): - one for OF, introduced in 2013 in commit 5e95329b701c ("dsa: add device tree bindings to register DSA switches") - one for non-OF, introduced in 2008 in commit 91da11f870f0 ("net: Distributed Switch Architecture protocol support") except for tiny bits and pieces like dsa_dev_to_net_device() which were seemingly carried over since the original commit, and used to this day. The point is that the original probing paths received a fix in 2015 in the form of commit 679fb46c5785 ("net: dsa: Add missing master netdev dev_put() calls"), but the fix never made it into the "new" (dsa2) probing paths that can still be traced to today, and the fixed probing path was later deleted in 2019 in commit 93e86b3bc842 ("net: dsa: Remove legacy probing support"). That is to say, the new probing paths were never quite correct in this area. The existence of the legacy probing support which was deleted in 2019 explains why dsa_dev_to_net_device() returns a conduit with elevated refcount (because it was supposed to be released during dsa_remove_dst()). After the removal of the legacy code, the only user of dsa_dev_to_net_device() calls dev_put(conduit) immediately after this function returns. This pattern makes no sense today, and can only be interpreted historically to understand why dev_hold() was there in the first place. Change details -------------- Today we have a better netdev tracking infrastructure which we should use. Logically netdev_hold() belongs in common code (dsa_port_parse_cpu(), where dp->conduit is assigned), but there is a tradeoff to be made with the rtnl_lock() section which would become a bit too long if we did that - dsa_port_parse_cpu() also calls request_module(). So we duplicate a bit of logic in order for the callers of dsa_port_parse_cpu() to be the ones responsible of holding the conduit reference and releasing it on error. This shortens the rtnl_lock() section significantly. In the dsa_switch_probe() error path, dsa_switch_release_ports() will be called in a number of situations, one being where dsa_port_parse_cpu() maybe didn't get the chance to run at all (a different port failed earlier, etc). So we have to test for the conduit being NULL prior to calling netdev_put(). There have still been so many transformations to the code since the blamed commits (rename master -> conduit, commit 0650bf52b31f ("net: dsa: be compatible with masters which unregister on shutdown")), that it only makes sense to fix the code using the best methods available today and see how it can be backported to stable later. I suspect the fix cannot even be backported to kernels which lack dsa_switch_shutdown(), and I suspect this is also maybe why the long-lived conduit reference didn't make it into the new DSA probing paths at the time (problems during shutdown). Because dsa_dev_to_net_device() has a single call site and has to be changed anyway, the logic was just absorbed into the non-OF dsa_port_parse(). Tested on the ocelot/felix switch and on dsa_loop, both on the NXP LS1028A with CONFIG_DEBUG_KOBJECT_RELEASE=y. Reported-by: Ma Ke Closes: https://lore.kernel.org/netdev/20251214131204.4684-1-make24@iscas.ac.cn/ Fixes: 83c0afaec7b7 ("net: dsa: Add new binding implementation") Fixes: 71e0bbde0d88 ("net: dsa: Add support for platform data") Reviewed-by: Jonas Gorski Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20251215150236.3931670-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- include/net/dsa.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index cced1a866757..6b2b5ed64ea4 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -302,6 +302,7 @@ struct dsa_port { struct devlink_port devlink_port; struct phylink *pl; struct phylink_config pl_config; + netdevice_tracker conduit_tracker; struct dsa_lag *lag; struct net_device *hsr_dev; -- cgit v1.2.3 From 5393802c94e0ab1295c04c94c57bcb00222d4674 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 27 Nov 2025 10:39:24 -0800 Subject: genalloc.h: fix htmldocs warning WARNING: include/linux/genalloc.h:52 function parameter 'start_addr' not described in 'genpool_algo_t' Fixes: 52fbf1134d47 ("lib/genalloc.c: fix allocation of aligned buffer from non-aligned chunk") Reported-by: Stephen Rothwell Closes: https://lkml.kernel.org/r/20251127130624.563597e3@canb.auug.org.au Acked-by: Randy Dunlap Tested-by: Randy Dunlap Cc: Alexey Skidanov Signed-off-by: Andrew Morton --- include/linux/genalloc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 0bd581003cd5..60de63e46b33 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -44,6 +44,7 @@ struct gen_pool; * @nr: The number of zeroed bits we're looking for * @data: optional additional data used by the callback * @pool: the pool being allocated from + * @start_addr: start address of memory chunk */ typedef unsigned long (*genpool_algo_t)(unsigned long *map, unsigned long size, -- cgit v1.2.3 From 007f5da43b3d0ecff972e2616062b8da1f862f5e Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 4 Dec 2025 18:59:55 +0000 Subject: mm/kasan: fix incorrect unpoisoning in vrealloc for KASAN Patch series "kasan: vmalloc: Fixes for the percpu allocator and vrealloc", v3. Patches fix two issues related to KASAN and vmalloc. The first one, a KASAN tag mismatch, possibly resulting in a kernel panic, can be observed on systems with a tag-based KASAN enabled and with multiple NUMA nodes. Initially it was only noticed on x86 [1] but later a similar issue was also reported on arm64 [2]. Specifically the problem is related to how vm_structs interact with pcpu_chunks - both when they are allocated, assigned and when pcpu_chunk addresses are derived. When vm_structs are allocated they are unpoisoned, each with a different random tag, if vmalloc support is enabled along the KASAN mode. Later when first pcpu chunk is allocated it gets its 'base_addr' field set to the first allocated vm_struct. With that it inherits that vm_struct's tag. When pcpu_chunk addresses are later derived (by pcpu_chunk_addr(), for example in pcpu_alloc_noprof()) the base_addr field is used and offsets are added to it. If the initial conditions are satisfied then some of the offsets will point into memory allocated with a different vm_struct. So while the lower bits will get accurately derived the tag bits in the top of the pointer won't match the shadow memory contents. The solution (proposed at v2 of the x86 KASAN series [3]) is to unpoison the vm_structs with the same tag when allocating them for the per cpu allocator (in pcpu_get_vm_areas()). The second one reported by syzkaller [4] is related to vrealloc and happens because of random tag generation when unpoisoning memory without allocating new pages. This breaks shadow memory tracking and needs to reuse the existing tag instead of generating a new one. At the same time an inconsistency in used flags is corrected. This patch (of 3): Syzkaller reported a memory out-of-bounds bug [4]. This patch fixes two issues: 1. In vrealloc the KASAN_VMALLOC_VM_ALLOC flag is missing when unpoisoning the extended region. This flag is required to correctly associate the allocation with KASAN's vmalloc tracking. Note: In contrast, vzalloc (via __vmalloc_node_range_noprof) explicitly sets KASAN_VMALLOC_VM_ALLOC and calls kasan_unpoison_vmalloc() with it. vrealloc must behave consistently -- especially when reusing existing vmalloc regions -- to ensure KASAN can track allocations correctly. 2. When vrealloc reuses an existing vmalloc region (without allocating new pages) KASAN generates a new tag, which breaks tag-based memory access tracking. Introduce KASAN_VMALLOC_KEEP_TAG, a new KASAN flag that allows reusing the tag already attached to the pointer, ensuring consistent tag behavior during reallocation. Pass KASAN_VMALLOC_KEEP_TAG and KASAN_VMALLOC_VM_ALLOC to the kasan_unpoison_vmalloc inside vrealloc_node_align_noprof(). Link: https://lkml.kernel.org/r/cover.1765978969.git.m.wieczorretman@pm.me Link: https://lkml.kernel.org/r/38dece0a4074c43e48150d1e242f8242c73bf1a5.1764874575.git.m.wieczorretman@pm.me Link: https://lore.kernel.org/all/e7e04692866d02e6d3b32bb43b998e5d17092ba4.1738686764.git.maciej.wieczor-retman@intel.com/ [1] Link: https://lore.kernel.org/all/aMUrW1Znp1GEj7St@MiWiFi-R3L-srv/ [2] Link: https://lore.kernel.org/all/CAPAsAGxDRv_uFeMYu9TwhBVWHCCtkSxoWY4xmFB_vowMbi8raw@mail.gmail.com/ [3] Link: https://syzkaller.appspot.com/bug?extid=997752115a851cb0cf36 [4] Fixes: a0309faf1cb0 ("mm: vmalloc: support more granular vrealloc() sizing") Signed-off-by: Jiayuan Chen Co-developed-by: Maciej Wieczor-Retman Signed-off-by: Maciej Wieczor-Retman Reported-by: syzbot+997752115a851cb0cf36@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68e243a2.050a0220.1696c6.007d.GAE@google.com/T/ Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Danilo Krummrich Cc: Dmitriy Vyukov Cc: Kees Cook Cc: Marco Elver Cc: "Uladzislau Rezki (Sony)" Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f335c1d7b61d..df3d8567dde9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -28,6 +28,7 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_INIT ((__force kasan_vmalloc_flags_t)0x01u) #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) +#define KASAN_VMALLOC_KEEP_TAG ((__force kasan_vmalloc_flags_t)0x08u) #define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */ #define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */ -- cgit v1.2.3 From 6f13db031e27e88213381039032a9cc061578ea6 Mon Sep 17 00:00:00 2001 From: Maciej Wieczor-Retman Date: Thu, 4 Dec 2025 19:00:04 +0000 Subject: kasan: refactor pcpu kasan vmalloc unpoison A KASAN tag mismatch, possibly causing a kernel panic, can be observed on systems with a tag-based KASAN enabled and with multiple NUMA nodes. It was reported on arm64 and reproduced on x86. It can be explained in the following points: 1. There can be more than one virtual memory chunk. 2. Chunk's base address has a tag. 3. The base address points at the first chunk and thus inherits the tag of the first chunk. 4. The subsequent chunks will be accessed with the tag from the first chunk. 5. Thus, the subsequent chunks need to have their tag set to match that of the first chunk. Refactor code by reusing __kasan_unpoison_vmalloc in a new helper in preparation for the actual fix. Link: https://lkml.kernel.org/r/eb61d93b907e262eefcaa130261a08bcb6c5ce51.1764874575.git.m.wieczorretman@pm.me Fixes: 1d96320f8d53 ("kasan, vmalloc: add vmalloc tagging for SW_TAGS") Signed-off-by: Maciej Wieczor-Retman Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Danilo Krummrich Cc: Dmitriy Vyukov Cc: Jiayuan Chen Cc: Kees Cook Cc: Marco Elver Cc: "Uladzislau Rezki (Sony)" Cc: Vincenzo Frascino Cc: [6.1+] Signed-off-by: Andrew Morton --- include/linux/kasan.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index df3d8567dde9..9c6ac4b62eb9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -631,6 +631,16 @@ static __always_inline void kasan_poison_vmalloc(const void *start, __kasan_poison_vmalloc(start, size); } +void __kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags); +static __always_inline void +kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags) +{ + if (kasan_enabled()) + __kasan_unpoison_vmap_areas(vms, nr_vms, flags); +} + #else /* CONFIG_KASAN_VMALLOC */ static inline void kasan_populate_early_vm_area_shadow(void *start, @@ -655,6 +665,11 @@ static inline void *kasan_unpoison_vmalloc(const void *start, static inline void kasan_poison_vmalloc(const void *start, unsigned long size) { } +static __always_inline void +kasan_unpoison_vmap_areas(struct vm_struct **vms, int nr_vms, + kasan_vmalloc_flags_t flags) +{ } + #endif /* CONFIG_KASAN_VMALLOC */ #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ -- cgit v1.2.3 From 6ba776b533ca902631fa106b8a90811b3f40b08d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 14 Dec 2025 12:15:17 -0800 Subject: mm: leafops.h: correct kernel-doc function param. names Modify the kernel-doc function parameter names to prevent kernel-doc warnings: Warning: include/linux/leafops.h:135 function parameter 'entry' not described in 'leafent_type' Warning: include/linux/leafops.h:540 function parameter 'pte' not described in 'pte_is_uffd_marker' Link: https://lkml.kernel.org/r/20251214201517.2187051-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/leafops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index cfafe7a5e7b1..a9ff94b744f2 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -133,7 +133,7 @@ static inline bool softleaf_is_none(softleaf_t entry) /** * softleaf_type() - Identify the type of leaf entry. - * @enntry: Leaf entry. + * @entry: Leaf entry. * * Returns: the leaf entry type associated with @entry. */ @@ -534,7 +534,7 @@ static inline bool pte_is_uffd_wp_marker(pte_t pte) /** * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker * leaf entry? - * @entry: Leaf entry. + * @pte: PTE entry. * * It's useful to be able to determine which leaf entries encode UFFD-specific * markers so we can handle these correctly. -- cgit v1.2.3 From fe55ea85939efcbf0e6baa234f0d70acb79e7b58 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Tue, 16 Dec 2025 09:48:51 +0800 Subject: kernel/kexec: change the prototype of kimage_map_segment() The kexec segment index will be required to extract the corresponding information for that segment in kimage_map_segment(). Additionally, kexec_segment already holds the kexec relocation destination address and size. Therefore, the prototype of kimage_map_segment() can be changed. Link: https://lkml.kernel.org/r/20251216014852.8737-1-piliu@redhat.com Fixes: 07d24902977e ("kexec: enable CMA based contiguous allocation") Signed-off-by: Pingfan Liu Acked-by: Baoquan He Cc: Mimi Zohar Cc: Roberto Sassu Cc: Alexander Graf Cc: Steven Chen Cc: Signed-off-by: Andrew Morton --- include/linux/kexec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index ff7e231b0485..8a22bc9b8c6c 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -530,7 +530,7 @@ extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, arg...) \ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0) -extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size); +extern void *kimage_map_segment(struct kimage *image, int idx); extern void kimage_unmap_segment(void *buffer); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; @@ -540,7 +540,7 @@ static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } static inline int kexec_crash_loaded(void) { return 0; } -static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size) +static inline void *kimage_map_segment(struct kimage *image, int idx) { return NULL; } static inline void kimage_unmap_segment(void *buffer) { } #define kexec_in_progress false -- cgit v1.2.3 From e6dbcb7c0e7b508d443a9aa6f77f63a2f83b1ae4 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 11 Dec 2025 07:06:01 +0000 Subject: mm: fixup pfnmap memory failure handling to use pgoff The memory failure handling implementation for the PFNMAP memory with no struct pages is faulty. The VA of the mapping is determined based on the the PFN. It should instead be based on the file mapping offset. At the occurrence of poison, the memory_failure_pfn is triggered on the poisoned PFN. Introduce a callback function that allows mm to translate the PFN to the corresponding file page offset. The kernel module using the registration API must implement the callback function and provide the translation. The translated value is then used to determine the VA information and sending the SIGBUS to the usermode process mapped to the poisoned PFN. The callback is also useful for the driver to be notified of the poisoned PFN, which may then track it. Link: https://lkml.kernel.org/r/20251211070603.338701-2-ankita@nvidia.com Fixes: 2ec41967189c ("mm: handle poisoning of pfn without struct pages") Signed-off-by: Ankit Agrawal Suggested-by: Jason Gunthorpe Cc: Kevin Tian Cc: Matthew R. Ochs Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Neo Jia Cc: Vikram Sethi Cc: Yishai Hadas Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/memory-failure.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h index bc326503d2d2..7b5e11cf905f 100644 --- a/include/linux/memory-failure.h +++ b/include/linux/memory-failure.h @@ -9,6 +9,8 @@ struct pfn_address_space; struct pfn_address_space { struct interval_tree_node node; struct address_space *mapping; + int (*pfn_to_vma_pgoff)(struct vm_area_struct *vma, + unsigned long pfn, pgoff_t *pgoff); }; int register_pfn_address_space(struct pfn_address_space *pfn_space); -- cgit v1.2.3 From f183663901f21fe0fba8bd31ae894bc529709ee0 Mon Sep 17 00:00:00 2001 From: Bijan Tabatabai Date: Tue, 16 Dec 2025 14:07:27 -0600 Subject: mm: consider non-anon swap cache folios in folio_expected_ref_count() Currently, folio_expected_ref_count() only adds references for the swap cache if the folio is anonymous. However, according to the comment above the definition of PG_swapcache in enum pageflags, shmem folios can also have PG_swapcache set. This patch makes sure references for the swap cache are added if folio_test_swapcache(folio) is true. This issue was found when trying to hot-unplug memory in a QEMU/KVM virtual machine. When initiating hot-unplug when most of the guest memory is allocated, hot-unplug hangs partway through removal due to migration failures. The following message would be printed several times, and would be printed again about every five seconds: [ 49.641309] migrating pfn b12f25 failed ret:7 [ 49.641310] page: refcount:2 mapcount:0 mapping:0000000033bd8fe2 index:0x7f404d925 pfn:0xb12f25 [ 49.641311] aops:swap_aops [ 49.641313] flags: 0x300000000030508(uptodate|active|owner_priv_1|reclaim|swapbacked|node=0|zone=3) [ 49.641314] raw: 0300000000030508 ffffed312c4bc908 ffffed312c4bc9c8 0000000000000000 [ 49.641315] raw: 00000007f404d925 00000000000c823b 00000002ffffffff 0000000000000000 [ 49.641315] page dumped because: migration failure When debugging this, I found that these migration failures were due to __migrate_folio() returning -EAGAIN for a small set of folios because the expected reference count it calculates via folio_expected_ref_count() is one less than the actual reference count of the folios. Furthermore, all of the affected folios were not anonymous, but had the PG_swapcache flag set, inspiring this patch. After applying this patch, the memory hot-unplug behaves as expected. I tested this on a machine running Ubuntu 24.04 with kernel version 6.8.0-90-generic and 64GB of memory. The guest VM is managed by libvirt and runs Ubuntu 24.04 with kernel version 6.18 (though the head of the mm-unstable branch as a Dec 16, 2025 was also tested and behaves the same) and 48GB of memory. The libvirt XML definition for the VM can be found at [1]. CONFIG_MHP_DEFAULT_ONLINE_TYPE_ONLINE_MOVABLE is set in the guest kernel so the hot-pluggable memory is automatically onlined. Below are the steps to reproduce this behavior: 1) Define and start and virtual machine host$ virsh -c qemu:///system define ./test_vm.xml # test_vm.xml from [1] host$ virsh -c qemu:///system start test_vm 2) Setup swap in the guest guest$ sudo fallocate -l 32G /swapfile guest$ sudo chmod 0600 /swapfile guest$ sudo mkswap /swapfile guest$ sudo swapon /swapfile 3) Use alloc_data [2] to allocate most of the remaining guest memory guest$ ./alloc_data 45 4) In a separate guest terminal, monitor the amount of used memory guest$ watch -n1 free -h 5) When alloc_data has finished allocating, initiate the memory hot-unplug using the provided xml file [3] host$ virsh -c qemu:///system detach-device test_vm ./remove.xml --live After initiating the memory hot-unplug, you should see the amount of available memory in the guest decrease, and the amount of used swap data increase. If everything works as expected, when all of the memory is unplugged, there should be around 8.5-9GB of data in swap. If the unplugging is unsuccessful, the amount of used swap data will settle below that. If that happens, you should be able to see log messages in dmesg similar to the one posted above. Link: https://lkml.kernel.org/r/20251216200727.2360228-1-bijan311@gmail.com Link: https://github.com/BijanT/linux_patch_files/blob/main/test_vm.xml [1] Link: https://github.com/BijanT/linux_patch_files/blob/main/alloc_data.c [2] Link: https://github.com/BijanT/linux_patch_files/blob/main/remove.xml [3] Fixes: 86ebd50224c0 ("mm: add folio_expected_ref_count() for reference count calculation") Signed-off-by: Bijan Tabatabai Acked-by: David Hildenbrand (Red Hat) Acked-by: Zi Yan Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shivank Garg Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Kairui Song Cc: Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 15076261d0c2..6f959d8ca4b4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2459,10 +2459,10 @@ static inline int folio_expected_ref_count(const struct folio *folio) if (WARN_ON_ONCE(page_has_type(&folio->page) && !folio_test_hugetlb(folio))) return 0; - if (folio_test_anon(folio)) { - /* One reference per page from the swapcache. */ - ref_count += folio_test_swapcache(folio) << order; - } else { + /* One reference per page from the swapcache. */ + ref_count += folio_test_swapcache(folio) << order; + + if (!folio_test_anon(folio)) { /* One reference per page from the pagecache. */ ref_count += !!folio->mapping << order; /* One reference from PG_private. */ -- cgit v1.2.3 From dc85a46928c41423ad89869baf05a589e2975575 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Thu, 18 Dec 2025 08:16:49 +0000 Subject: vfio/pci: Disable qword access to the PCI ROM bar Commit 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio pci") enables qword access to the PCI bar resources. However certain devices (e.g. Intel X710) are observed with problem upon qword accesses to the rom bar, e.g. triggering PCI aer errors. This is triggered by Qemu which caches the rom content by simply does a pread() of the remaining size until it gets the full contents. The other bars would only perform operations at the same access width as their guest drivers. Instead of trying to identify all broken devices, universally disable qword access to the rom bar i.e. going back to the old way which worked reliably for years. Reported-by: Farrah Chen Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220740 Fixes: 2b938e3db335 ("vfio/pci: Enable iowrite64 and ioread64 for vfio pci") Cc: stable@vger.kernel.org Signed-off-by: Kevin Tian Tested-by: Farrah Chen Link: https://lore.kernel.org/r/20251218081650.555015-2-kevin.tian@intel.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 706877f998ff..1ac86896875c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -145,6 +145,13 @@ struct vfio_pci_core_device { struct list_head dmabufs; }; +enum vfio_pci_io_width { + VFIO_PCI_IO_WIDTH_1 = 1, + VFIO_PCI_IO_WIDTH_2 = 2, + VFIO_PCI_IO_WIDTH_4 = 4, + VFIO_PCI_IO_WIDTH_8 = 8, +}; + /* Will be exported for vfio pci drivers usage */ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, @@ -188,7 +195,8 @@ pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, - size_t x_end, bool iswrite); + size_t x_end, bool iswrite, + enum vfio_pci_io_width max_width); bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, -- cgit v1.2.3 From f059588c552746e0fe299214f35c58effa715b74 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 4 Dec 2025 13:31:52 -0500 Subject: virtio: make it self-contained virtio.h uses struct module, add a forward declaration to make the header self-contained. Message-ID: <9171b5cac60793eb59ab044c96ee038bf1363bee.1764873799.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 132a474e5914..3626eb694728 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -13,6 +13,8 @@ #include #include +struct module; + /** * struct virtqueue - a queue to register buffers for sending or receiving. * @list: the chain of virtqueues for this device -- cgit v1.2.3 From e88dfb93311c81359b00c12e0b396bd0ea13ad6c Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 4 Dec 2025 12:49:34 -0500 Subject: virtio_features: make it self-contained virtio_features.h uses WARN_ON_ONCE and memset so it must include linux/bug.h and linux/string.h Message-ID: <579986aa9b8d023844990d2a0e267382f8ad85d5.1764873799.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_features.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h index ea2ad8717882..ce59ea91f474 100644 --- a/include/linux/virtio_features.h +++ b/include/linux/virtio_features.h @@ -3,6 +3,8 @@ #define _LINUX_VIRTIO_FEATURES_H #include +#include +#include #define VIRTIO_FEATURES_U64S 2 #define VIRTIO_FEATURES_BITS (VIRTIO_FEATURES_U64S * 64) -- cgit v1.2.3 From 5623eb1ed035f01dfa620366a82b667545b10c82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 31 Dec 2025 08:12:46 -0700 Subject: io_uring/tctx: add separate lock for list of tctx's in ctx ctx->tcxt_list holds the tasks using this ring, and it's currently protected by the normal ctx->uring_lock. However, this can cause a circular locking issue, as reported by syzbot, where cancelations off exec end up needing to remove an entry from this list: ====================================================== WARNING: possible circular locking dependency detected syzkaller #0 Tainted: G L ------------------------------------------------------ syz.0.9999/12287 is trying to acquire lock: ffff88805851c0a8 (&ctx->uring_lock){+.+.}-{4:4}, at: io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 but task is already holding lock: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline] ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&sig->cred_guard_mutex){+.+.}-{4:4}: __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 proc_pid_attr_write+0x547/0x630 fs/proc/base.c:2837 vfs_write+0x27e/0xb30 fs/read_write.c:684 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #1 (sb_writers#3){.+.+}-{0:0}: percpu_down_read_internal include/linux/percpu-rwsem.h:53 [inline] percpu_down_read_freezable include/linux/percpu-rwsem.h:83 [inline] __sb_start_write include/linux/fs/super.h:19 [inline] sb_start_write+0x4d/0x1c0 include/linux/fs/super.h:125 mnt_want_write+0x41/0x90 fs/namespace.c:499 open_last_lookups fs/namei.c:4529 [inline] path_openat+0xadd/0x3dd0 fs/namei.c:4784 do_filp_open+0x1fa/0x410 fs/namei.c:4814 io_openat2+0x3e0/0x5c0 io_uring/openclose.c:143 __io_issue_sqe+0x181/0x4b0 io_uring/io_uring.c:1792 io_issue_sqe+0x165/0x1060 io_uring/io_uring.c:1815 io_queue_sqe io_uring/io_uring.c:2042 [inline] io_submit_sqe io_uring/io_uring.c:2320 [inline] io_submit_sqes+0xbf4/0x2140 io_uring/io_uring.c:2434 __do_sys_io_uring_enter io_uring/io_uring.c:3280 [inline] __se_sys_io_uring_enter+0x2e0/0x2b60 io_uring/io_uring.c:3219 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f -> #0 (&ctx->uring_lock){+.+.}-{4:4}: check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain kernel/locking/lockdep.c:3908 [inline] __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868 __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646 io_uring_task_cancel include/linux/io_uring.h:24 [inline] begin_new_exec+0x10ed/0x2440 fs/exec.c:1131 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010 search_binary_handler fs/exec.c:1669 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve+0x92e/0x1400 fs/exec.c:1753 do_execveat_common+0x510/0x6a0 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x94/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f other info that might help us debug this: Chain exists of: &ctx->uring_lock --> sb_writers#3 --> &sig->cred_guard_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sig->cred_guard_mutex); lock(sb_writers#3); lock(&sig->cred_guard_mutex); lock(&ctx->uring_lock); *** DEADLOCK *** 1 lock held by syz.0.9999/12287: #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: prepare_bprm_creds fs/exec.c:1360 [inline] #0: ffff88802db5a2e0 (&sig->cred_guard_mutex){+.+.}-{4:4}, at: bprm_execve+0xb9/0x1400 fs/exec.c:1733 stack backtrace: CPU: 0 UID: 0 PID: 12287 Comm: syz.0.9999 Tainted: G L syzkaller #0 PREEMPT(full) Tainted: [L]=SOFTLOCKUP Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/25/2025 Call Trace: dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120 print_circular_bug+0x2e2/0x300 kernel/locking/lockdep.c:2043 check_noncircular+0x12e/0x150 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain kernel/locking/lockdep.c:3908 [inline] __lock_acquire+0x15a6/0x2cf0 kernel/locking/lockdep.c:5237 lock_acquire+0x107/0x340 kernel/locking/lockdep.c:5868 __mutex_lock_common kernel/locking/mutex.c:614 [inline] __mutex_lock+0x187/0x1350 kernel/locking/mutex.c:776 io_uring_del_tctx_node+0xf0/0x2c0 io_uring/tctx.c:179 io_uring_clean_tctx+0xd4/0x1a0 io_uring/tctx.c:195 io_uring_cancel_generic+0x6ca/0x7d0 io_uring/cancel.c:646 io_uring_task_cancel include/linux/io_uring.h:24 [inline] begin_new_exec+0x10ed/0x2440 fs/exec.c:1131 load_elf_binary+0x9f8/0x2d70 fs/binfmt_elf.c:1010 search_binary_handler fs/exec.c:1669 [inline] exec_binprm fs/exec.c:1701 [inline] bprm_execve+0x92e/0x1400 fs/exec.c:1753 do_execveat_common+0x510/0x6a0 fs/exec.c:1859 do_execve fs/exec.c:1933 [inline] __do_sys_execve fs/exec.c:2009 [inline] __se_sys_execve fs/exec.c:2004 [inline] __x64_sys_execve+0x94/0xb0 fs/exec.c:2004 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xec/0xf80 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7ff3a8b8f749 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ff3a9a97038 EFLAGS: 00000246 ORIG_RAX: 000000000000003b RAX: ffffffffffffffda RBX: 00007ff3a8de5fa0 RCX: 00007ff3a8b8f749 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000400 RBP: 00007ff3a8c13f91 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ff3a8de6038 R14: 00007ff3a8de5fa0 R15: 00007ff3a8f0fa28 Add a separate lock just for the tctx_list, tctx_lock. This can nest under ->uring_lock, where necessary, and be used separately for list manipulation. For the cancelation off exec side, this removes the need to grab ->uring_lock, hence fixing the circular locking dependency. Reported-by: syzbot+b0e3b77ffaa8a4067ce5@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e1adb0d20a0a..a3e8ddc9b380 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,11 +424,17 @@ struct io_ring_ctx { struct user_struct *user; struct mm_struct *mm_account; + /* + * List of tctx nodes for this ctx, protected by tctx_lock. For + * cancelation purposes, nests under uring_lock. + */ + struct list_head tctx_list; + struct mutex tctx_lock; + /* ctx exit and cancelation */ struct llist_head fallback_llist; struct delayed_work fallback_work; struct work_struct exit_work; - struct list_head tctx_list; struct completion ref_comp; /* io-wq management, e.g. thread count */ -- cgit v1.2.3