From aec58b48517c911fbdf2beebba46a347e5910072 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:46:30 +0200 Subject: bugs/core: Extend __WARN_FLAGS() with the 'cond_str' parameter Push the new parameter down into every architecture that defines __WARN_FLAGS(): arm64 loongarch parisc powerpc riscv s390 sh x86 Don't pass anything substantial down yet, just propagate the new parameter with empty strings, without generating it or using it. ( The string is never NULL, so it can be concatenated at the preprocessor level. ) Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/r/20250515124644.2958810-2-mingo@kernel.org --- include/asm-generic/bug.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 387720933973..af76e4a04b16 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -100,17 +100,18 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); instrumentation_end(); \ } while (0) #else -#define __WARN() __WARN_FLAGS(BUGFLAG_TAINT(TAINT_WARN)) +#define __WARN() __WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN)) #define __WARN_printf(taint, arg...) 
do { \ instrumentation_begin(); \ __warn_printk(arg); \ - __WARN_FLAGS(BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ + __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ instrumentation_end(); \ } while (0) #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS(BUGFLAG_ONCE | \ + __WARN_FLAGS("", \ + BUGFLAG_ONCE | \ BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ }) -- cgit v1.2.3 From 3bc3c9c3ab6df45a3a3389f74000f8bec1bc96e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:46:31 +0200 Subject: bugs/core: Pass down the condition string of WARN_ON_ONCE(cond) warnings to __WARN_FLAGS() Doing this will allow architecture code to store and print out this information as part of the WARN_ON and BUG_ON facilities. The format of the string is '[condition]', for example: WARN_ON_ONCE(idx < 0 && ptr); Will get the '[idx < 0 && ptr]' string literal passed down as 'cond_str' in __WARN_FLAGS(). 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/r/20250515124644.2958810-3-mingo@kernel.org --- include/asm-generic/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index af76e4a04b16..c8e7126bc26e 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -110,7 +110,7 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS("", \ + __WARN_FLAGS("["#condition"] ", \ BUGFLAG_ONCE | \ BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ -- cgit v1.2.3 From 687fac9d1b00fb10421fdd455d60543cc46e42d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 May 2025 14:46:32 +0200 Subject: bugs/core: Introduce the CONFIG_DEBUG_BUGVERBOSE_DETAILED Kconfig switch Allow configurability of the inclusion of more detailed WARN_ON() strings, to be implemented in subsequent commits. Since the full cost will be around 100K more memory on an x86 defconfig, disable it by default. Provide the WARN_CONDITION_STR() macro to allow the conditional passing of extra strings to lower level BUG/WARN handlers. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: linux-arch@vger.kernel.org Link: https://lore.kernel.org/r/20250515124644.2958810-4-mingo@kernel.org --- include/asm-generic/bug.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c8e7126bc26e..2d9f61346dab 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -17,6 +17,12 @@ #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif +#ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED +# define WARN_CONDITION_STR(cond_str) cond_str +#else +# define WARN_CONDITION_STR(cond_str) +#endif + #ifndef __ASSEMBLY__ #include #include -- cgit v1.2.3 From 548fe51740d0f3294e548f654c099e5aefbf4cb7 Mon Sep 17 00:00:00 2001 From: Madhav Bhatt Date: Thu, 17 Apr 2025 02:45:43 -0700 Subject: firmware: xilinx: Add debugfs support for PM_GET_NODE_STATUS Add new debug interface to support PM_GET_NODE_STATUS to get the node information like requirements and usage. The debugfs firmware driver interface is only meant for testing and debugging EEMI APIs. Hence, it is by-default disabled in production systems. Signed-off-by: Madhav Bhatt Link: https://lore.kernel.org/r/20250417094543.3873507-1-madhav.bhatt@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..4699f50465f2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -3,7 +3,7 @@ * Xilinx Zynq MPSoC Firmware layer * * Copyright (C) 2014-2021 Xilinx - * Copyright (C) 2022 - 2024, Advanced Micro Devices, Inc. + * Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc. 
* * Michal Simek * Davorin Mista @@ -164,6 +164,7 @@ enum pm_api_cb_id { enum pm_api_id { PM_API_FEATURES = 0, PM_GET_API_VERSION = 1, + PM_GET_NODE_STATUS = 3, PM_REGISTER_NOTIFIER = 5, PM_FORCE_POWERDOWN = 8, PM_REQUEST_WAKEUP = 10, @@ -629,6 +630,8 @@ int zynqmp_pm_request_wake(const u32 node, int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode); int zynqmp_pm_set_rpu_mode(u32 node_id, enum rpu_oper_mode rpu_mode); int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mode); +int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, u32 *const usage); int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value); int zynqmp_pm_set_gem_config(u32 node, enum pm_gem_config_type config, u32 value); @@ -931,6 +934,13 @@ static inline int zynqmp_pm_set_tcm_config(u32 node_id, enum rpu_tcm_comb tcm_mo return -ENODEV; } +static inline int zynqmp_pm_get_node_status(const u32 node, u32 *const status, + u32 *const requirements, + u32 *const usage) +{ + return -ENODEV; +} + static inline int zynqmp_pm_set_sd_config(u32 node, enum pm_sd_config_type config, u32 value) -- cgit v1.2.3 From e66f4c35e375346943bfe2a0990e97253f74440f Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Tue, 1 Jul 2025 05:38:50 -0700 Subject: drivers: firmware: xilinx: Add unique family code for all platforms The family code is currently derived from the PMC_TAP_IDCODE register value, but there are issues where Versal, Versal NET, and future platforms share the same family code. Additionally, some platforms have identical subfamily codes, making it challenging to differentiate between platforms based on the family and subfamily codes. To resolve this, a new family code member is added to the platform data, initialized with unique values. This change enables better platform distinction via the compatible string. 
Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20250701123851.1314531-3-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 4699f50465f2..6458ef4e04e2 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -54,6 +54,11 @@ #define ZYNQMP_FAMILY_CODE 0x23 #define VERSAL_FAMILY_CODE 0x26 +/* Family codes */ +#define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ +#define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ +#define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ + /* When all subfamily of platform need to support */ #define ALL_SUB_FAMILY_CODE 0x00 #define VERSAL_SUB_FAMILY_CODE 0x01 -- cgit v1.2.3 From 25e3ae0ce364fa725a6eea68d63d6a2ee09e019f Mon Sep 17 00:00:00 2001 From: Jay Buddhabhatti Date: Tue, 1 Jul 2025 05:38:51 -0700 Subject: drivers: firmware: xilinx: Switch to new family code in zynqmp_pm_get_family_info() Currently, the family code and subfamily code are derived from the PMC_TAP_IDCODE register. Versal, Versal NET share the same family code. Also some platforms share the same subfamily code, making it difficult to distinguish between platforms. Update zynqmp_pm_get_family_info() to use IDs derived from the compatible string instead of silicon ID codes derived from PMC_TAP_IDCODE register. 
Signed-off-by: Jay Buddhabhatti Link: https://lore.kernel.org/r/20250701123851.1314531-4-jay.buddhabhatti@amd.com Signed-off-by: Michal Simek --- include/linux/firmware/xlnx-zynqmp.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index 6458ef4e04e2..be6817ac5120 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -51,22 +51,11 @@ #define PM_PINCTRL_PARAM_SET_VERSION 2 -#define ZYNQMP_FAMILY_CODE 0x23 -#define VERSAL_FAMILY_CODE 0x26 - /* Family codes */ #define PM_ZYNQMP_FAMILY_CODE 0x1 /* ZynqMP family code */ #define PM_VERSAL_FAMILY_CODE 0x2 /* Versal family code */ #define PM_VERSAL_NET_FAMILY_CODE 0x3 /* Versal NET family code */ -/* When all subfamily of platform need to support */ -#define ALL_SUB_FAMILY_CODE 0x00 -#define VERSAL_SUB_FAMILY_CODE 0x01 -#define VERSALNET_SUB_FAMILY_CODE 0x03 - -#define FAMILY_CODE_MASK GENMASK(27, 21) -#define SUB_FAMILY_CODE_MASK GENMASK(20, 19) - #define API_ID_MASK GENMASK(7, 0) #define MODULE_ID_MASK GENMASK(11, 8) #define PLM_MODULE_ID_MASK GENMASK(15, 8) @@ -570,7 +559,7 @@ int zynqmp_pm_invoke_fw_fn(u32 pm_api_id, u32 *ret_payload, u32 num_args, ...); #if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) int zynqmp_pm_get_api_version(u32 *version); int zynqmp_pm_get_chipid(u32 *idcode, u32 *version); -int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily); +int zynqmp_pm_get_family_info(u32 *family); int zynqmp_pm_query_data(struct zynqmp_pm_query_data qdata, u32 *out); int zynqmp_pm_clock_enable(u32 clock_id); int zynqmp_pm_clock_disable(u32 clock_id); @@ -651,7 +640,7 @@ static inline int zynqmp_pm_get_chipid(u32 *idcode, u32 *version) return -ENODEV; } -static inline int zynqmp_pm_get_family_info(u32 *family, u32 *subfamily) +static inline int zynqmp_pm_get_family_info(u32 *family) { return -ENODEV; } -- cgit v1.2.3 From 
f233d4855918547f19c5bff95223706d1c836b7c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 7 Oct 2025 22:03:48 +0000 Subject: bpf: Refactor storage_get_func_atomic to generic non_sleepable flag Rename the storage_get_func_atomic flag to a more generic non_sleepable flag that tracks whether a helper or kfunc may be called from a non-sleepable context. This makes the flag more broadly applicable beyond just storage_get helpers. See [0] for more context. The flag is now set unconditionally for all helpers and kfuncs when: - RCU critical section is active. - Preemption is disabled. - IRQs are disabled. - In a non-sleepable context within a sleepable program (e.g., timer callbacks), which is indicated by !in_sleepable(). Previously, the flag was only set for storage_get helpers in these contexts. With this change, it can be used by any code that needs to differentiate between sleepable and non-sleepable contexts at the per-instruction level. The existing usage in do_misc_fixups() for storage_get helpers is preserved by checking is_storage_get_function() before using the flag. 
[0]: https://lore.kernel.org/bpf/CAP01T76cbaNi4p-y8E0sjE2NXSra2S=Uja8G4hSQDu_SbXxREQ@mail.gmail.com Cc: Mykyta Yatsenko Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: Eduard Zingerman Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251007220349.3852807-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 4c497e839526..b57222a25a4a 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -548,7 +548,7 @@ struct bpf_insn_aux_data { bool nospec_result; /* result is unsafe under speculation, nospec must follow */ bool zext_dst; /* this insn zero extends dst reg */ bool needs_zext; /* alu op needs to clear upper bits */ - bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool non_sleepable; /* helper/kfunc may be called from non-sleepable context */ bool is_iter_next; /* bpf_iter__next() kfunc call */ bool call_with_percpu_alloc_ptr; /* {this,per}_cpu_ptr() with prog percpu alloc */ u8 alu_state; /* used in combination with alu_limit */ -- cgit v1.2.3 From 4c97c4b149a019a3b318dc6ea3dc96efe0ee1f39 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Fri, 10 Oct 2025 17:46:06 +0100 Subject: bpf: Extract internal structs validation logic into helpers The arraymap and hashtab duplicate the logic that checks for and frees internal structs (timer, workqueue, task_work) based on BTF record flags. Centralize this by introducing two helpers: * bpf_map_has_internal_structs(map) Returns true if the map value contains any of internal structs: BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK. * bpf_map_free_internal_structs(map, obj) Frees the internal structs for a single value object. Convert arraymap and both the prealloc/malloc hashtab paths to use the new generic functions. 
This keeps the functionality for when/how to free these special fields in one place and makes it easier to add support for new internal structs in the future without touching every map implementation. Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251010164606.147298-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a98c83346134..f87fb203aaae 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -663,6 +663,13 @@ int map_check_no_btf(const struct bpf_map *map, bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1); +static inline bool bpf_map_has_internal_structs(struct bpf_map *map) +{ + return btf_record_has_field(map->record, BPF_TIMER | BPF_WORKQUEUE | BPF_TASK_WORK); +} + +void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of -- cgit v1.2.3 From 4914c17a76047ccbde24397cf9d406558183d756 Mon Sep 17 00:00:00 2001 From: Raghav Sharma Date: Mon, 15 Sep 2025 15:23:59 +0530 Subject: dt-bindings: clock: exynosautov920: add m2m clock definitions Add device tree clock binding definitions for CMU_M2M Signed-off-by: Raghav Sharma Acked-by: Rob Herring (Arm) Reviewed-by: Alim Akhtar Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/samsung,exynosautov920.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/samsung,exynosautov920.h b/include/dt-bindings/clock/samsung,exynosautov920.h index 93e6233d1358..0342a988565a 100644 --- a/include/dt-bindings/clock/samsung,exynosautov920.h +++ b/include/dt-bindings/clock/samsung,exynosautov920.h @@ -295,4 +295,9 @@ #define CLK_DOUT_HSI2_ETHERNET 6 #define CLK_DOUT_HSI2_ETHERNET_PTP 7 
+/* CMU_M2M */ +#define CLK_MOUT_M2M_JPEG_USER 1 +#define CLK_MOUT_M2M_NOC_USER 2 +#define CLK_DOUT_M2M_NOCP 3 + #endif /* _DT_BINDINGS_CLOCK_EXYNOSAUTOV920_H */ -- cgit v1.2.3 From 0b94201e327471d034d81cf5fd2131a5529eea19 Mon Sep 17 00:00:00 2001 From: Raghav Sharma Date: Thu, 25 Sep 2025 18:34:55 +0530 Subject: dt-bindings: clock: exynosautov920: add mfc clock definitions Add device tree clock binding definitions for CMU_MFC Signed-off-by: Raghav Sharma Reviewed-by: Krzysztof Kozlowski Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/samsung,exynosautov920.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/samsung,exynosautov920.h b/include/dt-bindings/clock/samsung,exynosautov920.h index 0342a988565a..970d05167fc6 100644 --- a/include/dt-bindings/clock/samsung,exynosautov920.h +++ b/include/dt-bindings/clock/samsung,exynosautov920.h @@ -300,4 +300,9 @@ #define CLK_MOUT_M2M_NOC_USER 2 #define CLK_DOUT_M2M_NOCP 3 +/* CMU_MFC */ +#define CLK_MOUT_MFC_MFC_USER 1 +#define CLK_MOUT_MFC_WFD_USER 2 +#define CLK_DOUT_MFC_NOCP 3 + #endif /* _DT_BINDINGS_CLOCK_EXYNOSAUTOV920_H */ -- cgit v1.2.3 From ed4a5c5de56ad4e23c9e5da8981639352b63b8ac Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Tue, 23 Sep 2025 18:16:07 +0000 Subject: usb: typec: class: add typec_get_data_role symbol Alt Mode drivers are responsible for sending Enter Mode through the TCPM, but only a DFP is allowed to send Enter Mode. typec_get_data_role gets the port's data role, which can then be used in altmode drivers via typec_altmode_get_data_role to know if Enter Mode should be sent. 
Signed-off-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250923181606.1583584-5-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec.h | 1 + include/linux/usb/typec_altmode.h | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h index 252af3f77039..309251572e2e 100644 --- a/include/linux/usb/typec.h +++ b/include/linux/usb/typec.h @@ -337,6 +337,7 @@ struct typec_plug *typec_register_plug(struct typec_cable *cable, void typec_unregister_plug(struct typec_plug *plug); void typec_set_data_role(struct typec_port *port, enum typec_data_role role); +enum typec_data_role typec_get_data_role(struct typec_port *port); void typec_set_pwr_role(struct typec_port *port, enum typec_role role); void typec_set_vconn_role(struct typec_port *port, enum typec_role role); void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode); diff --git a/include/linux/usb/typec_altmode.h b/include/linux/usb/typec_altmode.h index b3c0866ea70f..f7db3bd4c90e 100644 --- a/include/linux/usb/typec_altmode.h +++ b/include/linux/usb/typec_altmode.h @@ -172,6 +172,19 @@ typec_altmode_get_svdm_version(struct typec_altmode *altmode) return typec_get_negotiated_svdm_version(typec_altmode2port(altmode)); } +/** + * typec_altmode_get_data_role - Get port data role + * @altmode: Handle to the alternate mode + * + * Alt Mode drivers should only issue Enter Mode through the port if they are + * the DFP. 
+ */ +static inline enum typec_data_role +typec_altmode_get_data_role(struct typec_altmode *altmode) +{ + return typec_get_data_role(typec_altmode2port(altmode)); +} + /** * struct typec_altmode_driver - USB Type-C alternate mode device driver * @id_table: Null terminated array of SVIDs -- cgit v1.2.3 From 536bf30d282a6b2f676c6106587f0e1946449aca Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:53 -0500 Subject: iio: buffer: document iio_push_to_buffers_with_ts() Document the iio_push_to_buffers_with_ts() function. This is copied and slightly cleaned up from iio_push_to_buffers_with_timestamp(). Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index 5c84ec4a9810..e46b818981aa 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -45,6 +45,22 @@ static inline int iio_push_to_buffers_with_timestamp(struct iio_dev *indio_dev, return iio_push_to_buffers(indio_dev, data); } +/** + * iio_push_to_buffers_with_ts() - push data and timestamp to buffers + * @indio_dev: iio_dev structure for device. + * @data: Pointer to sample data buffer. + * @data_total_len: The size of @data in bytes. + * @timestamp: Timestamp for the sample data. + * + * Pushes data to the IIO device's buffers. If timestamps are enabled for the + * device the function will store the supplied timestamp as the last element in + * the sample data buffer before pushing it to the device buffers. The sample + * data buffer needs to be large enough to hold the additional timestamp + * (usually the buffer should be at least indio->scan_bytes bytes large). + * + * Context: Any context. + * Return: 0 on success, a negative error code otherwise. 
+ */ static inline int iio_push_to_buffers_with_ts(struct iio_dev *indio_dev, void *data, size_t data_total_len, s64 timestamp) -- cgit v1.2.3 From 4992ce003b76ee1629ad4e7332a49ea2619e7523 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:54 -0500 Subject: iio: buffer: deprecated iio_push_to_buffers_with_timestamp() Replace the documentation of iio_push_to_buffers_with_timestamp() with a deprecation notice pointing to the preferred alternative. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/iio/buffer.h b/include/linux/iio/buffer.h index e46b818981aa..d37f82678f71 100644 --- a/include/linux/iio/buffer.h +++ b/include/linux/iio/buffer.h @@ -26,11 +26,7 @@ int iio_pop_from_buffer(struct iio_buffer *buffer, void *data); * @data: sample data * @timestamp: timestamp for the sample data * - * Pushes data to the IIO device's buffers. If timestamps are enabled for the - * device the function will store the supplied timestamp as the last element in - * the sample data buffer before pushing it to the device buffers. The sample - * data buffer needs to be large enough to hold the additional timestamp - * (usually the buffer should be indio->scan_bytes bytes large). + * DEPRECATED: Use iio_push_to_buffers_with_ts() instead. * * Returns 0 on success, a negative error code otherwise. */ -- cgit v1.2.3 From 748ed9fc8596015e7e136877465919b89c7d08d6 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:56 -0500 Subject: iio: buffer: document store_to() callback may be called in any context Document that the struct iio_buffer_access_funcs.store_to() callback must be safe to call from any context since it is called from iio_push_to_buffer() which may be called from any context. 
Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..0daff9ff20ce 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -24,7 +24,8 @@ struct sg_table; /** * struct iio_buffer_access_funcs - access functions for buffers. - * @store_to: actually store stuff to the buffer + * @store_to: actually store stuff to the buffer - must be safe to + * call from any context (e.g. must not sleep). * @read: try to get a specified number of bytes (must exist) * @data_available: indicates how much data is available for reading from * the buffer. -- cgit v1.2.3 From 592ae0ccecfac9af8f67444cab11cbb11770f571 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Tue, 16 Sep 2025 16:02:57 -0500 Subject: iio: buffer: document that buffer callback must be context safe Document that the callback registered with iio_channel_get_all_cb() must be safe to call from any context since it is called by iio_push_to_buffer() which can be called in any context. Signed-off-by: David Lechner Signed-off-by: Jonathan Cameron --- include/linux/iio/consumer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iio/consumer.h b/include/linux/iio/consumer.h index a38b277c2c02..5039558267e4 100644 --- a/include/linux/iio/consumer.h +++ b/include/linux/iio/consumer.h @@ -131,7 +131,8 @@ struct iio_cb_buffer; /** * iio_channel_get_all_cb() - register callback for triggered capture * @dev: Pointer to client device. - * @cb: Callback function. + * @cb: Callback function. Must be safe to call from any context + * (e.g. must not sleep). * @private: Private data passed to callback. * * NB right now we have no ability to mux data from multiple devices. 
-- cgit v1.2.3 From a514bb109eada64f798f1c86c17182229cc20fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 7 Oct 2025 10:15:21 +0100 Subject: iio: buffer: support getting dma channel from the buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new buffer accessor .get_dma_dev() in order to get the struct device responsible for actually providing the dma channel. We cannot assume that we can use the parent of the IIO device for mapping the DMA buffer. This becomes important on systems (like the Xilinx/AMD zynqMP Ultrascale) where memory (or part of it) is mapped above the 32 bit range. On such systems and given that a device by default has a dma mask of 32 bits we would then need to rely on bounce buffers (to swiotlb) for mapping memory above the dma mask limit. In the process, add an iio_buffer_get_dma_dev() helper function to get the proper DMA device. Cc: stable@vger.kernel.org Reviewed-by: David Lechner Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer_impl.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/iio/buffer_impl.h b/include/linux/iio/buffer_impl.h index e72552e026f3..8d770ced66b2 100644 --- a/include/linux/iio/buffer_impl.h +++ b/include/linux/iio/buffer_impl.h @@ -50,6 +50,7 @@ struct sg_table; * @enqueue_dmabuf: called from userspace via ioctl to queue this DMABUF * object to this buffer. Requires a valid DMABUF fd, that * was previouly attached to this buffer. + * @get_dma_dev: called to get the DMA channel associated with this buffer. * @lock_queue: called when the core needs to lock the buffer queue; * it is used when enqueueing DMABUF objects. 
* @unlock_queue: used to unlock a previously locked buffer queue @@ -90,6 +91,7 @@ struct iio_buffer_access_funcs { struct iio_dma_buffer_block *block, struct dma_fence *fence, struct sg_table *sgt, size_t size, bool cyclic); + struct device * (*get_dma_dev)(struct iio_buffer *buffer); void (*lock_queue)(struct iio_buffer *buffer); void (*unlock_queue)(struct iio_buffer *buffer); -- cgit v1.2.3 From f9c198c3ccaf90a1a265fb2ffa8d4b093c3b0784 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nuno=20S=C3=A1?= Date: Tue, 7 Oct 2025 10:15:22 +0100 Subject: iio: buffer-dma: support getting the DMA channel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the .get_dma_dev() callback for DMA buffers by returning the device that owns the DMA channel. This allows the core DMABUF infrastructure to properly map DMA buffers using the correct device, avoiding the need for bounce buffers on systems where memory is mapped above the 32-bit range. The function returns the DMA queue's device, which is the actual device responsible for DMA operations in buffer-dma implementations. 
Cc: stable@vger.kernel.org Reviewed-by: David Lechner Signed-off-by: Nuno Sá Signed-off-by: Jonathan Cameron --- include/linux/iio/buffer-dma.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/iio/buffer-dma.h b/include/linux/iio/buffer-dma.h index 5eb66a399002..4f33e6a39797 100644 --- a/include/linux/iio/buffer-dma.h +++ b/include/linux/iio/buffer-dma.h @@ -174,5 +174,6 @@ int iio_dma_buffer_enqueue_dmabuf(struct iio_buffer *buffer, size_t size, bool cyclic); void iio_dma_buffer_lock_queue(struct iio_buffer *buffer); void iio_dma_buffer_unlock_queue(struct iio_buffer *buffer); +struct device *iio_dma_buffer_get_dma_dev(struct iio_buffer *buffer); #endif -- cgit v1.2.3 From d25de16477657f9eddd4be9abd409515edcc3b9e Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Thu, 9 Oct 2025 17:40:16 +0800 Subject: ASoC: soc-acpi: make some variables of acpi adr and link adr non-const Currently, we use predefined snd_soc_acpi_link_adr tables to match the link adr from ACPI table to select the machine driver and the topology. However, with this mechanism, we need to create the snd_soc_acpi_link_adr table for each audio config. The sof_sdw machine driver is used by almost all Intel platforms with SOF and we can load required topology file dynamically today. In other words, we can use sof_sdw machine driver as the default machine driver for Intel SOF SoundWire codecs and there is no need to create a snd_soc_acpi_link_adr table for every new audio config. To achieve it, we need to drop the const for some members and edit the link adr and acpi adr data to match the data from the ACPI table. 
Suggested-by: Charles Keepax Signed-off-by: Bard Liao Reviewed-by: Simon Trimmer Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20251009094023.3474895-3-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/sound/soc-acpi.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/soc-acpi.h b/include/sound/soc-acpi.h index b8af309c2683..90d73b9bddab 100644 --- a/include/sound/soc-acpi.h +++ b/include/sound/soc-acpi.h @@ -114,8 +114,8 @@ struct snd_soc_acpi_endpoint { * @name_prefix: string used for codec controls */ struct snd_soc_acpi_adr_device { - const u64 adr; - const u8 num_endpoints; + u64 adr; + u8 num_endpoints; const struct snd_soc_acpi_endpoint *endpoints; const char *name_prefix; }; @@ -131,8 +131,8 @@ struct snd_soc_acpi_adr_device { */ struct snd_soc_acpi_link_adr { - const u32 mask; - const u32 num_adr; + u32 mask; + u32 num_adr; const struct snd_soc_acpi_adr_device *adr_d; }; -- cgit v1.2.3 From ea97713903784286ef1ce45456f404ed288f19b1 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Thu, 9 Oct 2025 17:40:17 +0800 Subject: ASoC: soc_sdw_utils: add name_prefix to asoc_sdw_codec_info struct Currently, the codec name_prefix of Intel SoundWire machine driver is from the ACPI match table. We can have it in the asoc_sdw_codec_info struct as a default name_prefix of a codec if there is no corresponding audio config found in the ACPI match table. 
Signed-off-by: Bard Liao Reviewed-by: Simon Trimmer Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20251009094023.3474895-4-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 3c5e9b2af7f1..e289b453baba 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -68,6 +68,7 @@ struct asoc_sdw_codec_info { const int part_id; const int version_id; const char *codec_name; + const char *name_prefix; int amp_num; const u8 acpi_id[ACPI_ID_LEN]; const bool ignore_internal_dmic; -- cgit v1.2.3 From 5ed60e45c59d66e61586a10433e2b5527d4d72b5 Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Thu, 9 Oct 2025 17:40:19 +0800 Subject: ASoC: soc_sdw_utils: export asoc_sdw_get_dai_type asoc_sdw_get_dai_type() is quite useful to convert SDCA function types to SDW DAI types. It can be used by other drivers. 
Signed-off-by: Bard Liao Reviewed-by: Simon Trimmer Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20251009094023.3474895-6-yung-chuan.liao@linux.intel.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index e289b453baba..76c64c5245d4 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -169,6 +169,7 @@ int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int * struct asoc_sdw_dailink *asoc_sdw_find_dailink(struct asoc_sdw_dailink *dailinks, const struct snd_soc_acpi_endpoint *new); +int asoc_sdw_get_dai_type(u32 type); int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card, struct asoc_sdw_dailink *soc_dais, -- cgit v1.2.3 From 4ebe64f507ca921c5109eb37eae6058b77413d93 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Fri, 10 Oct 2025 16:53:48 +0800 Subject: ASoC: tas2781: Add TAS5802, TAS5815, and TAS5828 TAS5802/TAS5815/TAS5828 has on-chip DSP without current/voltage feedback. Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20251010085349.52951-1-baojun.xu@ti.com Signed-off-by: Mark Brown --- include/sound/tas2781.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index ddd997ac3216..0fbcdb15c74b 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -120,8 +120,11 @@ enum audio_device { TAS2570, TAS2572, TAS2781, + TAS5802, + TAS5815, TAS5825, TAS5827, + TAS5828, TAS_OTHERS, }; -- cgit v1.2.3 From b83fb1b14c06bdd765903ac852ba20a14e24f227 Mon Sep 17 00:00:00 2001 From: Axel Haslam Date: Mon, 6 Oct 2025 11:25:41 -0300 Subject: spi: offload: Add offset parameter Add an offset parameter that can be passed in the periodic trigger. 
This is useful for example when ADC drivers implement a separate periodic signal to trigger conversion and need offload to read the result with some delay. While at it, add some documentation to offload periodic trigger parameters. Reviewed-by: David Lechner Signed-off-by: Axel Haslam Signed-off-by: Marcelo Schmitt Link: https://patch.msgid.link/cd315e95c0bd8523f00e91c400abcd6a418e5924.1759760519.git.marcelo.schmitt@analog.com Signed-off-by: Mark Brown --- include/linux/spi/offload/types.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/spi/offload/types.h b/include/linux/spi/offload/types.h index 6f7892347871..cd61f8adb7a5 100644 --- a/include/linux/spi/offload/types.h +++ b/include/linux/spi/offload/types.h @@ -57,8 +57,17 @@ enum spi_offload_trigger_type { SPI_OFFLOAD_TRIGGER_PERIODIC, }; +/** + * spi_offload_trigger_periodic - configuration parameters for periodic triggers + * @frequency_hz: The rate that the trigger should fire in Hz. + * @offset_ns: A delay in nanoseconds between when this trigger fires + * compared to another trigger. This requires specialized hardware + * that supports such synchronization with a delay between two or + * more triggers. Set to 0 when not needed. + */ struct spi_offload_trigger_periodic { u64 frequency_hz; + u64 offset_ns; }; struct spi_offload_trigger_config { -- cgit v1.2.3 From 5a43dc9f4ee0a3624d0598ee14e8ef8468914525 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 13 Oct 2025 23:03:10 +0900 Subject: firewire: core: detect device quirk when reading configuration ROM Every time the bus manager runs, the cached configuration ROM content of the IRM device is investigated to detect device-specific quirks. This detection can be performed in advance when reading the configuration ROM. This commit adds device quirk flags to the fw_device structure, and initializes them after reading the bus information block of the configuration ROM. 
The quirk flags are immutable once the configuration ROM has been read. Although they are likely accessed concurrently only by the bus manager, this commit ensures safe access by preventing torn writes and reads using the WRITE_ONCE()/READ_ONCE() macros. Link: https://lore.kernel.org/r/20251013140311.97159-2-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 6d208769d456..161829cfcc00 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -170,6 +170,14 @@ struct fw_attribute_group { struct attribute *attrs[13]; }; +enum fw_device_quirk { + // See afa1282a35d3 ("firewire: core: check for 1394a compliant IRM, fix inaccessibility of Sony camcorder"). + FW_DEVICE_QUIRK_IRM_IS_1394_1995_ONLY = BIT(0), + + // See a509e43ff338 ("firewire: core: fix unstable I/O with Canon camcorder"). + FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER = BIT(1), +}; + enum fw_device_state { FW_DEVICE_INITIALIZING, FW_DEVICE_RUNNING, @@ -203,6 +211,9 @@ struct fw_device { struct fw_card *card; struct device device; + // A set of enum fw_device_quirk. + int quirks; + struct mutex client_list_mutex; struct list_head client_list; -- cgit v1.2.3 From 15f9610fc96ac6fd2844e63f7bf5a0b08e1c31c8 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Mon, 13 Oct 2025 23:03:11 +0900 Subject: firewire: core: handle device quirk of MOTU Audio Express A commit 3a93d082bacf ("ALSA: firewire-motu: add support for MOTU Audio Express") describes a quirk of MOTU Audio Express. The device returns acknowledge packet with 0x10 as the pending state of any types of asynchronous request transaction. It is completely out of specification. This commit implements handling for that device-specific quirk. The quirk is detected after reading the root directory of configuration ROM. 
When processing the acknowledge code in 1394 OHCI AT context event handler, firewire-ohci module seeks the device instance of destination node by traversing device hierarchy. If the device has the quirk, the acknowledge code is replaced with the standard code. The 1394 OHCI AT context events occur for outgoing asynchronous request packets. The device traversal is safe since no new request initiators exist after the fw_card_instance has been invalidated. Link: https://lore.kernel.org/r/20251013140311.97159-3-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 161829cfcc00..f1d8734c0ec6 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -176,6 +176,9 @@ enum fw_device_quirk { // See a509e43ff338 ("firewire: core: fix unstable I/O with Canon camcorder"). FW_DEVICE_QUIRK_IRM_IGNORES_BUS_MANAGER = BIT(1), + + // MOTU Audio Express transfers acknowledge packet with 0x10 for pending state. + FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE = BIT(2), }; enum fw_device_state { -- cgit v1.2.3 From c510368bce39cbaf4cb66f4acf788f5efa8692a6 Mon Sep 17 00:00:00 2001 From: Tommaso Merciai Date: Wed, 1 Oct 2025 23:26:52 +0200 Subject: dt-bindings: clock: renesas,r9a09g047-cpg: Add USB2 PHY core clocks Add definitions for USB2 PHY core clocks in the R9A09G047 CPG DT bindings header file. 
Signed-off-by: Tommaso Merciai Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251001212709.579080-9-tommaso.merciai.xr@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g047-cpg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h index f165df8a6f5a..dab24740de3c 100644 --- a/include/dt-bindings/clock/renesas,r9a09g047-cpg.h +++ b/include/dt-bindings/clock/renesas,r9a09g047-cpg.h @@ -22,5 +22,7 @@ #define R9A09G047_GBETH_1_CLK_PTP_REF_I 11 #define R9A09G047_USB3_0_REF_ALT_CLK_P 12 #define R9A09G047_USB3_0_CLKCORE 13 +#define R9A09G047_USB2_0_CLK_CORE0 14 +#define R9A09G047_USB2_0_CLK_CORE1 15 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G047_CPG_H__ */ -- cgit v1.2.3 From 53615ad26e9789bfcdf3a4dccbcecb15294ea024 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 13 Oct 2025 13:41:33 +0900 Subject: netmem: replace __netmem_clear_lsb() with netmem_to_nmdesc() Now that we have struct netmem_desc, it'd be better to access the pp fields via struct netmem_desc rather than struct net_iov. Introduce netmem_to_nmdesc() for safely converting netmem_ref to netmem_desc regardless of the type underneath i.e. netmem_desc, net_iov. While at it, remove __netmem_clear_lsb() and make netmem_to_nmdesc() used instead.
Suggested-by: Pavel Begunkov Signed-off-by: Byungchul Park Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20251013044133.69472-1-byungchul@sk.com Signed-off-by: Paolo Abeni --- include/net/netmem.h | 66 ++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index f7dacc9e75fd..651e2c62d1dd 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -247,6 +247,23 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) return page_to_pfn(netmem_to_page(netmem)); } +/* XXX: How to extract netmem_desc from page must be changed, once + * netmem_desc no longer overlays on page and will be allocated through + * slab. + */ +#define __pp_page_to_nmdesc(p) (_Generic((p), \ + const struct page * : (const struct netmem_desc *)(p), \ + struct page * : (struct netmem_desc *)(p))) + +/* CAUTION: Check if the page is a pp page before calling this helper or + * know it's a pp page. + */ +#define pp_page_to_nmdesc(p) \ +({ \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + __pp_page_to_nmdesc(p); \ +}) + /** * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing * @netmem @@ -265,42 +282,25 @@ static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem) return (__force struct netmem_desc *)netmem; } -/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to - * common fields. - * @netmem: netmem reference to extract as net_iov. - * - * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic, - * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access - * these fields without a type check to make sure that the underlying mem is - * net_iov or page. +/* netmem_to_nmdesc - convert netmem_ref to struct netmem_desc * for + * access to common fields. + * @netmem: netmem reference to get netmem_desc. 
* - * The resulting value of this function can only be used to access the fields - * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in - * undefined behavior. + * All the sub types of netmem_ref (netmem_desc, net_iov) have the same + * pp, pp_magic, dma_addr, and pp_ref_count fields via netmem_desc. * - * Return: the netmem_ref cast to net_iov* regardless of its underlying type. + * Return: the pointer to struct netmem_desc * regardless of its + * underlying type. */ -static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) +static inline struct netmem_desc *netmem_to_nmdesc(netmem_ref netmem) { - return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); -} + void *p = (void *)((__force unsigned long)netmem & ~NET_IOV); -/* XXX: How to extract netmem_desc from page must be changed, once - * netmem_desc no longer overlays on page and will be allocated through - * slab. - */ -#define __pp_page_to_nmdesc(p) (_Generic((p), \ - const struct page * : (const struct netmem_desc *)(p), \ - struct page * : (struct netmem_desc *)(p))) + if (netmem_is_net_iov(netmem)) + return &((struct net_iov *)p)->desc; -/* CAUTION: Check if the page is a pp page before calling this helper or - * know it's a pp page. 
- */ -#define pp_page_to_nmdesc(p) \ -({ \ - DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ - __pp_page_to_nmdesc(p); \ -}) + return __pp_page_to_nmdesc((struct page *)p); +} /** * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem @@ -320,12 +320,12 @@ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) static inline struct page_pool *netmem_get_pp(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp; + return netmem_to_nmdesc(netmem)->pp; } static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem) { - return &__netmem_clear_lsb(netmem)->pp_ref_count; + return &netmem_to_nmdesc(netmem)->pp_ref_count; } static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid) @@ -390,7 +390,7 @@ static inline bool netmem_is_pfmemalloc(netmem_ref netmem) static inline unsigned long netmem_get_dma_addr(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->dma_addr; + return netmem_to_nmdesc(netmem)->dma_addr; } void get_netmem(netmem_ref netmem); -- cgit v1.2.3 From 300709fbefd19ff7293c7d0ded9b56e69216e634 Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 10 Oct 2025 10:51:47 +0200 Subject: mm/memory_hotplug: Remove MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers were introduced to prepare the transition of memory to and from a physically accessible state. This enhancement was crucial for implementing the "memmap on memory" feature for s390. With introduction of dynamic (de)configuration of hotpluggable memory, memory can be brought to accessible state before add_memory(). Memory can be brought to inaccessible state before remove_memory(). Hence, there is no need of MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE memory notifiers anymore. 
This basically reverts commit c5f1e2d18909 ("mm/memory_hotplug: introduce MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers") Additionally, apply minor adjustments to the function parameters of move_pfn_range_to_zone() and mhp_supports_memmap_on_memory() to ensure compatibility with the latest branch. Acked-by: David Hildenbrand Signed-off-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- include/linux/memory.h | 9 --------- include/linux/memory_hotplug.h | 18 +----------------- include/linux/memremap.h | 1 - 3 files changed, 1 insertion(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..ba1515160894 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -96,17 +96,8 @@ int set_memory_block_size_order(unsigned int order); #define MEM_GOING_ONLINE (1<<3) #define MEM_CANCEL_ONLINE (1<<4) #define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) struct memory_notify { - /* - * The altmap_start_pfn and altmap_nr_pages fields are designated for - * specifying the altmap range and are exclusively intended for use in - * MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers. - */ - unsigned long altmap_start_pfn; - unsigned long altmap_nr_pages; unsigned long start_pfn; unsigned long nr_pages; }; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 23f038a16231..f2f16cdd73ee 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -58,22 +58,6 @@ typedef int __bitwise mhp_t; * implies the node id (nid). */ #define MHP_NID_IS_MGID ((__force mhp_t)BIT(2)) -/* - * The hotplugged memory is completely inaccessible while the memory is - * offline. The memory provider will handle MEM_PREPARE_ONLINE / - * MEM_FINISH_OFFLINE notifications and make the memory accessible. 
- * - * This flag is only relevant when used along with MHP_MEMMAP_ON_MEMORY, - * because the altmap cannot be written (e.g., poisoned) when adding - * memory -- before it is set online. - * - * This allows for adding memory with an altmap that is not currently - * made available by a hypervisor. When onlining that memory, the - * hypervisor can be instructed to make that memory available, and - * the onlining phase will not require any memory allocations, which is - * helpful in low-memory situations. - */ -#define MHP_OFFLINE_INACCESSIBLE ((__force mhp_t)BIT(3)) /* * Extended parameters for memory hotplug: @@ -123,7 +107,7 @@ extern void adjust_present_page_count(struct page *page, long nr_pages); /* VM interface that may be used by firmware interface */ extern int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages, - struct zone *zone, bool mhp_off_inaccessible); + struct zone *zone); extern void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages); extern int online_pages(unsigned long pfn, unsigned long nr_pages, struct zone *zone, struct memory_group *group); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..30c7aecbd245 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -25,7 +25,6 @@ struct vmem_altmap { unsigned long free; unsigned long align; unsigned long alloc; - bool inaccessible; }; /* -- cgit v1.2.3 From 9de877338a151860c76f194934d53b7b816d339a Mon Sep 17 00:00:00 2001 From: Raphael Gallais-Pou Date: Fri, 12 Sep 2025 13:36:09 +0200 Subject: media: include: remove c8sectpfe header Driver is not used anymore. Remove header file. 
Signed-off-by: Raphael Gallais-Pou Reviewed-by: Patrice Chotard Acked-by: Rob Herring (Arm) Signed-off-by: Hans Verkuil --- include/dt-bindings/media/c8sectpfe.h | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 include/dt-bindings/media/c8sectpfe.h (limited to 'include') diff --git a/include/dt-bindings/media/c8sectpfe.h b/include/dt-bindings/media/c8sectpfe.h deleted file mode 100644 index 6b1fb6f5413b..000000000000 --- a/include/dt-bindings/media/c8sectpfe.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __DT_C8SECTPFE_H -#define __DT_C8SECTPFE_H - -#define STV0367_TDA18212_NIMA_1 0 -#define STV0367_TDA18212_NIMA_2 1 -#define STV0367_TDA18212_NIMB_1 2 -#define STV0367_TDA18212_NIMB_2 3 - -#define STV0903_6110_LNB24_NIMA 4 -#define STV0903_6110_LNB24_NIMB 5 - -#endif /* __DT_C8SECTPFE_H */ -- cgit v1.2.3 From 082b86919b7a94de01d849021b4da820a6cb89dc Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 8 Oct 2025 12:55:18 +0300 Subject: media: v4l2-mem2mem: Fix outdated documentation Commit cbd9463da1b1 ("media: v4l2-mem2mem: Avoid calling .device_run in v4l2_m2m_job_finish") deferred calls to .device_run() to a work queue to avoid recursive calls when a job is finished right away from .device_run(). It failed to update the v4l2_m2m_job_finish() documentation that still states the function must not be called from .device_run(). Fix it. 
Fixes: cbd9463da1b1 ("media: v4l2-mem2mem: Avoid calling .device_run in v4l2_m2m_job_finish") Cc: stable@vger.kernel.org Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/media/v4l2-mem2mem.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h index 09c6164577cc..500f81f399df 100644 --- a/include/media/v4l2-mem2mem.h +++ b/include/media/v4l2-mem2mem.h @@ -192,8 +192,7 @@ void v4l2_m2m_try_schedule(struct v4l2_m2m_ctx *m2m_ctx); * other instances to take control of the device. * * This function has to be called only after &v4l2_m2m_ops->device_run - * callback has been called on the driver. To prevent recursion, it should - * not be called directly from the &v4l2_m2m_ops->device_run callback though. + * callback has been called on the driver. */ void v4l2_m2m_job_finish(struct v4l2_m2m_dev *m2m_dev, struct v4l2_m2m_ctx *m2m_ctx); -- cgit v1.2.3 From 347ed2d566dabb06c7970fff01129c4f59995ed6 Mon Sep 17 00:00:00 2001 From: zhidao su Date: Sat, 11 Oct 2025 15:16:51 +0800 Subject: sched/ext: Implement cgroup_set_idle() callback Implement the missing cgroup_set_idle() callback that was marked as a TODO. This allows BPF schedulers to be notified when a cgroup's idle state changes, enabling them to adjust their scheduling behavior accordingly. The implementation follows the same pattern as other cgroup callbacks like cgroup_set_weight() and cgroup_set_bandwidth(). It checks if the BPF scheduler has implemented the callback and invokes it with the appropriate parameters. Fixes a spelling error in the cgroup_set_bandwidth() documentation. tj: s/scx_cgroup_rwsem/scx_cgroup_ops_rwsem/ to fix build breakage. 
Signed-off-by: zhidao su Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index d82b7a9b0658..9848aeab2786 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -228,6 +228,7 @@ struct scx_task_group { u64 bw_period_us; u64 bw_quota_us; u64 bw_burst_us; + bool idle; #endif }; -- cgit v1.2.3 From 1ba9f8979426590367406c70c1c821f5b943f993 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:10 -0700 Subject: vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros TEXT_MAIN, DATA_MAIN and friends are defined differently depending on whether certain config options enable -ffunction-sections and/or -fdata-sections. There's no technical reason for that beyond voodoo coding. Keeping the separate implementations adds unnecessary complexity, fragments the logic, and increases the risk of subtle bugs. Unify the macros by using the same input section patterns across all configs. This is a prerequisite for the upcoming livepatch klp-build tooling which will manually enable -ffunction-sections and -fdata-sections via KCFLAGS. Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/asm-generic/vmlinux.lds.h | 40 ++++++++++++--------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a9a2e732a65..5facbc994634 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -87,39 +87,24 @@ #define ALIGN_FUNCTION() . = ALIGN(CONFIG_FUNCTION_ALIGNMENT) /* - * LD_DEAD_CODE_DATA_ELIMINATION option enables -fdata-sections, which - * generates .data.identifier sections, which need to be pulled in with - * .data. 
We don't want to pull in .data..other sections, which Linux - * has defined. Same for text and bss. + * Support -ffunction-sections by matching .text and .text.*, + * but exclude '.text..*'. * - * With LTO_CLANG, the linker also splits sections by default, so we need - * these macros to combine the sections during the final link. - * - * With AUTOFDO_CLANG and PROPELLER_CLANG, by default, the linker splits - * text sections and regroups functions into subsections. - * - * RODATA_MAIN is not used because existing code already defines .rodata.x - * sections to be brought in with rodata. + * Special .text.* sections that are typically grouped separately, such as + * .text.unlikely or .text.hot, must be matched explicitly before using + * TEXT_MAIN. */ -#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) || \ -defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define TEXT_MAIN .text .text.[0-9a-zA-Z_]* -#else -#define TEXT_MAIN .text -#endif -#if defined(CONFIG_LD_DEAD_CODE_DATA_ELIMINATION) || defined(CONFIG_LTO_CLANG) + +/* + * Support -fdata-sections by matching .data, .data.*, and others, + * but exclude '.data..*'. + */ #define DATA_MAIN .data .data.[0-9a-zA-Z_]* .data.rel.* .data..L* .data..compoundliteral* .data.$__unnamed_* .data.$L* #define SDATA_MAIN .sdata .sdata.[0-9a-zA-Z_]* #define RODATA_MAIN .rodata .rodata.[0-9a-zA-Z_]* .rodata..L* #define BSS_MAIN .bss .bss.[0-9a-zA-Z_]* .bss..L* .bss..compoundliteral* #define SBSS_MAIN .sbss .sbss.[0-9a-zA-Z_]* -#else -#define DATA_MAIN .data .data.rel .data.rel.local -#define SDATA_MAIN .sdata -#define RODATA_MAIN .rodata -#define BSS_MAIN .bss -#define SBSS_MAIN .sbss -#endif /* * GCC 4.5 and later have a 32 bytes section alignment for structures. 
@@ -581,9 +566,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) * during second ld run in second ld pass when generating System.map * * TEXT_MAIN here will match symbols with a fixed pattern (for example, - * .text.hot or .text.unlikely) if dead code elimination or - * function-section is enabled. Match these symbols first before - * TEXT_MAIN to ensure they are grouped together. + * .text.hot or .text.unlikely). Match those before TEXT_MAIN to ensure + * they get grouped together. * * Also placing .text.hot section at the beginning of a page, this * would help the TLB performance. -- cgit v1.2.3 From afb026b6d35c79f6f47752147327932827aeac8c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:13 -0700 Subject: compiler: Tweak __UNIQUE_ID() naming In preparation for the objtool klp diff subcommand, add an underscore between the name and the counter. This will make it possible for objtool to distinguish between the non-unique and unique parts of the symbol name so it can properly correlate the symbols. 
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..6a32250f22f7 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -163,7 +163,11 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, __asm__ ("" : "=r" (var) : "0" (var)) #endif -#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__) +/* Format: __UNIQUE_ID_<name>_<__COUNTER__> */ +#define __UNIQUE_ID(name) \ + __PASTE(__UNIQUE_ID_, \ + __PASTE(name, \ + __PASTE(_, __COUNTER__))) /** * data_race - mark an expression as containing intentional data races -- cgit v1.2.3 From 9f14f1f91883aa2bfd6663161d2002c8ce937c43 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:14 -0700 Subject: compiler.h: Make addressable symbols less of an eyesore Avoid underscore overload by changing: __UNIQUE_ID___addressable_loops_per_jiffy_868 to the following: __UNIQUE_ID_addressable_loops_per_jiffy_868 This matches the format used by other __UNIQUE_ID()-generated symbols and improves readability for those who stare at ELF symbol table dumps.
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6a32250f22f7..ab181d87d71d 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -287,7 +287,7 @@ static inline void *offset_to_ptr(const int *off) */ #define ___ADDRESSABLE(sym, __attrs) \ static void * __used __attrs \ - __UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym; + __UNIQUE_ID(__PASTE(addressable_, sym)) = (void *)(uintptr_t)&sym; #define __ADDRESSABLE(sym) \ ___ADDRESSABLE(sym, __section(".discard.addressable")) -- cgit v1.2.3 From c2d420796a427dda71a2400909864e7f8e037fd4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:15 -0700 Subject: elfnote: Change ELFNOTE() to use __UNIQUE_ID() In preparation for the objtool klp diff subcommand, replace the custom unique symbol name generation in ELFNOTE() with __UNIQUE_ID(). This standardizes the naming format for all "unique" symbols, which will allow objtool to properly correlate them. Note this also removes the "one ELF note per line" limitation. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/elfnote.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/elfnote.h b/include/linux/elfnote.h index 69b136e4dd2b..bb3dcded055f 100644 --- a/include/linux/elfnote.h +++ b/include/linux/elfnote.h @@ -60,23 +60,21 @@ #else /* !__ASSEMBLER__ */ #include <uapi/linux/elf.h> +#include <linux/compiler.h> /* * Use an anonymous structure which matches the shape of * Elf{32,64}_Nhdr, but includes the name and desc data. The size and * type of name and desc depend on the macro arguments. "name" must - * be a literal string, and "desc" must be passed by value.
You may - * only define one note per line, since __LINE__ is used to generate - * unique symbols. + * be a literal string, and "desc" must be passed by value. */ -#define _ELFNOTE_PASTE(a,b) a##b -#define _ELFNOTE(size, name, unique, type, desc) \ +#define ELFNOTE(size, name, type, desc) \ static const struct { \ struct elf##size##_note _nhdr; \ unsigned char _name[sizeof(name)] \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ typeof(desc) _desc \ __attribute__((aligned(sizeof(Elf##size##_Word)))); \ - } _ELFNOTE_PASTE(_note_, unique) \ + } __UNIQUE_ID(note) \ __used \ __attribute__((section(".note." name), \ aligned(sizeof(Elf##size##_Word)), \ @@ -89,11 +87,10 @@ name, \ desc \ } -#define ELFNOTE(size, name, type, desc) \ - _ELFNOTE(size, name, __LINE__, type, desc) #define ELFNOTE32(name, type, desc) ELFNOTE(32, name, type, desc) #define ELFNOTE64(name, type, desc) ELFNOTE(64, name, type, desc) + #endif /* __ASSEMBLER__ */ #endif /* _LINUX_ELFNOTE_H */ -- cgit v1.2.3 From 6717e8f91db71641cb52855ed14c7900972ed0bc Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:16 -0700 Subject: kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME In preparation for the objtool klp diff subcommand, remove the arbitrary 'kmod_' prefix from __KBUILD_MODNAME and instead add it explicitly in the __initcall_id() macro. This change supports the standardization of "unique" symbol naming by ensuring the non-unique portion of the name comes before the unique part. That will enable objtool to properly correlate symbols across builds. 
Cc: Masahiro Yamada Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/init.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/init.h b/include/linux/init.h index 17c1bc712e23..40331923b9f4 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -200,12 +200,13 @@ extern struct module __this_module; /* Format: <modname>__<counter>_<line>_<fn> */ #define __initcall_id(fn) \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(__COUNTER__, \ __PASTE(_, \ __PASTE(__LINE__, \ - __PASTE(_, fn)))))) + __PASTE(_, fn))))))) /* Format: __<prefix>__<iid><id> */ #define __initcall_name(prefix, __iid, id) \ -- cgit v1.2.3 From b37491d72b43c3a322d396c2d8e951a10be70c17 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Sep 2025 09:30:03 -0700 Subject: interval_tree: Fix ITSTATIC usage for *_subtree_search() For consistency with the other function templates, change _subtree_search_*() to use the user-supplied ITSTATIC rather than the hard-coded 'static'.
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/interval_tree.h | 4 ++++ include/linux/interval_tree_generic.h | 2 +- include/linux/mm.h | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interval_tree.h b/include/linux/interval_tree.h index 2b8026a39906..9d5791e9f737 100644 --- a/include/linux/interval_tree.h +++ b/include/linux/interval_tree.h @@ -19,6 +19,10 @@ extern void interval_tree_remove(struct interval_tree_node *node, struct rb_root_cached *root); +extern struct interval_tree_node * +interval_tree_subtree_search(struct interval_tree_node *node, + unsigned long start, unsigned long last); + extern struct interval_tree_node * interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); diff --git a/include/linux/interval_tree_generic.h b/include/linux/interval_tree_generic.h index 1b400f26f63d..c5a2fed49eb0 100644 --- a/include/linux/interval_tree_generic.h +++ b/include/linux/interval_tree_generic.h @@ -77,7 +77,7 @@ ITSTATIC void ITPREFIX ## _remove(ITSTRUCT *node, \ * Cond2: start <= ITLAST(node) \ */ \ \ -static ITSTRUCT * \ +ITSTATIC ITSTRUCT * \ ITPREFIX ## _subtree_search(ITSTRUCT *node, ITTYPE start, ITTYPE last) \ { \ while (true) { \ diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..04fa27718cd1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3369,6 +3369,8 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); +struct vm_area_struct *vma_interval_tree_subtree_search(struct vm_area_struct *node, + unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, -- cgit 
v1.2.3 From d2c60bde1c0fcac8b140e527546f80749ccd9c67 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:53 -0700 Subject: objtool: Move ANNOTATE* macros to annotate.h In preparation for using the objtool annotation macros in higher-level objtool.h macros like UNWIND_HINT, move them to their own file. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/annotate.h | 109 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/objtool.h | 90 +------------------------------------- 2 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 include/linux/annotate.h (limited to 'include') diff --git a/include/linux/annotate.h b/include/linux/annotate.h new file mode 100644 index 000000000000..ccb445496331 --- /dev/null +++ b/include/linux/annotate.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ANNOTATE_H +#define _LINUX_ANNOTATE_H + +#include + +#ifdef CONFIG_OBJTOOL + +#ifndef __ASSEMBLY__ + +#define __ASM_ANNOTATE(label, type) \ + ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ + ".long " __stringify(label) " - .\n\t" \ + ".long " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define ASM_ANNOTATE(type) \ + "911:\n\t" \ + __ASM_ANNOTATE(911b, type) + +#else /* __ASSEMBLY__ */ + +.macro ANNOTATE type:req +.Lhere_\@: + .pushsection .discard.annotate_insn,"M",@progbits,8 + .long .Lhere_\@ - . + .long \type + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + +#else /* !CONFIG_OBJTOOL */ +#ifndef __ASSEMBLY__ +#define __ASM_ANNOTATE(label, type) "" +#define ASM_ANNOTATE(type) +#else /* __ASSEMBLY__ */ +.macro ANNOTATE type:req +.endm +#endif /* __ASSEMBLY__ */ +#endif /* !CONFIG_OBJTOOL */ + +#ifndef __ASSEMBLY__ + +/* + * Annotate away the various 'relocation to !ENDBR` complaints; knowing that + * these relocations will never be used for indirect calls. 
+ */ +#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) +#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) + +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. + */ +#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) +/* + * See linux/instrumentation.h + */ +#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) +/* + * This macro indicates that the following intra-function call is valid. + * Any non-annotated intra-function call will cause objtool to issue a warning. + */ +#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) +/* + * Use objtool to validate the entry requirement that all code paths do + * VALIDATE_UNRET_END before RET. + * + * NOTE: The macro must be used at the beginning of a global symbol, otherwise + * it will be ignored. + */ +#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) +/* + * This should be used to refer to an instruction that is considered + * terminating, like a noreturn CALL or UD2 when we know they are not -- eg + * WARN using UD2. + */ +#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) +/* + * This should not be used; it annotates away CFI violations. There are a few + * valid use cases like kexec handover to the next kernel image, and there is + * no security concern there. + * + * There are also a few real issues annotated away, like EFI because we can't + * control the EFI code. 
+ */ +#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) + +#else /* __ASSEMBLY__ */ +#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR +#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE +/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ +/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ +#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS +#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL +#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN +#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE +#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI +#endif /* __ASSEMBLY__ */ + +#endif /* _LINUX_ANNOTATE_H */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 46ebaa46e6c5..1973e9f14bf9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -3,11 +3,10 @@ #define _LINUX_OBJTOOL_H #include +#include #ifdef CONFIG_OBJTOOL -#include - #ifndef __ASSEMBLY__ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ @@ -53,16 +52,6 @@ #define __ASM_BREF(label) label ## b -#define __ASM_ANNOTATE(label, type) \ - ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ - ".long " __stringify(label) " - .\n\t" \ - ".long " __stringify(type) "\n\t" \ - ".popsection\n\t" - -#define ASM_ANNOTATE(type) \ - "911:\n\t" \ - __ASM_ANNOTATE(911b, type) - #else /* __ASSEMBLY__ */ /* @@ -111,14 +100,6 @@ #endif .endm -.macro ANNOTATE type:req -.Lhere_\@: - .pushsection .discard.annotate_insn,"M",@progbits,8 - .long .Lhere_\@ - . 
- .long \type - .popsection -.endm - #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ @@ -128,84 +109,15 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) "\n\t" #define STACK_FRAME_NON_STANDARD(func) #define STACK_FRAME_NON_STANDARD_FP(func) -#define __ASM_ANNOTATE(label, type) "" -#define ASM_ANNOTATE(type) #else .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .endm .macro STACK_FRAME_NON_STANDARD func:req .endm -.macro ANNOTATE type:req -.endm #endif #endif /* CONFIG_OBJTOOL */ -#ifndef __ASSEMBLY__ -/* - * Annotate away the various 'relocation to !ENDBR` complaints; knowing that - * these relocations will never be used for indirect calls. - */ -#define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) -#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) - -/* - * This should be used immediately before an indirect jump/call. It tells - * objtool the subsequent indirect jump/call is vouched safe for retpoline - * builds. - */ -#define ANNOTATE_RETPOLINE_SAFE ASM_ANNOTATE(ANNOTYPE_RETPOLINE_SAFE) -/* - * See linux/instrumentation.h - */ -#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) -#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) -/* - * objtool annotation to ignore the alternatives and only consider the original - * instruction(s). - */ -#define ANNOTATE_IGNORE_ALTERNATIVE ASM_ANNOTATE(ANNOTYPE_IGNORE_ALTS) -/* - * This macro indicates that the following intra-function call is valid. - * Any non-annotated intra-function call will cause objtool to issue a warning. - */ -#define ANNOTATE_INTRA_FUNCTION_CALL ASM_ANNOTATE(ANNOTYPE_INTRA_FUNCTION_CALL) -/* - * Use objtool to validate the entry requirement that all code paths do - * VALIDATE_UNRET_END before RET. - * - * NOTE: The macro must be used at the beginning of a global symbol, otherwise - * it will be ignored. 
- */ -#define ANNOTATE_UNRET_BEGIN ASM_ANNOTATE(ANNOTYPE_UNRET_BEGIN) -/* - * This should be used to refer to an instruction that is considered - * terminating, like a noreturn CALL or UD2 when we know they are not -- eg - * WARN using UD2. - */ -#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) -/* - * This should not be used; it annotates away CFI violations. There are a few - * valid use cases like kexec handover to the next kernel image, and there is - * no security concern there. - * - * There are also a few real issues annotated away, like EFI because we can't - * control the EFI code. - */ -#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) - -#else -#define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR -#define ANNOTATE_RETPOLINE_SAFE ANNOTATE type=ANNOTYPE_RETPOLINE_SAFE -/* ANNOTATE_INSTR_BEGIN ANNOTATE type=ANNOTYPE_INSTR_BEGIN */ -/* ANNOTATE_INSTR_END ANNOTATE type=ANNOTYPE_INSTR_END */ -#define ANNOTATE_IGNORE_ALTERNATIVE ANNOTATE type=ANNOTYPE_IGNORE_ALTS -#define ANNOTATE_INTRA_FUNCTION_CALL ANNOTATE type=ANNOTYPE_INTRA_FUNCTION_CALL -#define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN -#define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE -#define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI -#endif - #if defined(CONFIG_NOINSTR_VALIDATION) && \ (defined(CONFIG_MITIGATION_UNRET_ENTRY) || defined(CONFIG_MITIGATION_SRSO)) #define VALIDATE_UNRET_BEGIN ANNOTATE_UNRET_BEGIN -- cgit v1.2.3 From 58f36a5756445dcd0a733504cd798955ebe968c1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:54 -0700 Subject: objtool: Add ANNOTATE_DATA_SPECIAL In preparation for the objtool klp diff subcommand, add an ANNOTATE_DATA_SPECIAL macro which annotates special section entries so that objtool can determine their size and location and extract them when needed. 
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/annotate.h | 49 ++++++++++++++++++++++++++++++++----------- include/linux/objtool_types.h | 2 ++ 2 files changed, 39 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/annotate.h b/include/linux/annotate.h index ccb445496331..7c10d34d198c 100644 --- a/include/linux/annotate.h +++ b/include/linux/annotate.h @@ -8,34 +8,52 @@ #ifndef __ASSEMBLY__ -#define __ASM_ANNOTATE(label, type) \ - ".pushsection .discard.annotate_insn,\"M\",@progbits,8\n\t" \ +#define __ASM_ANNOTATE(section, label, type) \ + ".pushsection " section ",\"M\", @progbits, 8\n\t" \ ".long " __stringify(label) " - .\n\t" \ ".long " __stringify(type) "\n\t" \ ".popsection\n\t" +#define ASM_ANNOTATE_LABEL(label, type) \ + __ASM_ANNOTATE(".discard.annotate_insn", label, type) + #define ASM_ANNOTATE(type) \ "911:\n\t" \ - __ASM_ANNOTATE(911b, type) + ASM_ANNOTATE_LABEL(911b, type) + +#define ASM_ANNOTATE_DATA(type) \ + "912:\n\t" \ + __ASM_ANNOTATE(".discard.annotate_data", 912b, type) #else /* __ASSEMBLY__ */ -.macro ANNOTATE type:req +.macro __ANNOTATE section, type .Lhere_\@: - .pushsection .discard.annotate_insn,"M",@progbits,8 + .pushsection \section, "M", @progbits, 8 .long .Lhere_\@ - . .long \type .popsection .endm +.macro ANNOTATE type + __ANNOTATE ".discard.annotate_insn", \type +.endm + +.macro ANNOTATE_DATA type + __ANNOTATE ".discard.annotate_data", \type +.endm + #endif /* __ASSEMBLY__ */ #else /* !CONFIG_OBJTOOL */ #ifndef __ASSEMBLY__ -#define __ASM_ANNOTATE(label, type) "" +#define ASM_ANNOTATE_LABEL(label, type) "" #define ASM_ANNOTATE(type) +#define ASM_ANNOTATE_DATA(type) #else /* __ASSEMBLY__ */ -.macro ANNOTATE type:req +.macro ANNOTATE type +.endm +.macro ANNOTATE_DATA type .endm #endif /* __ASSEMBLY__ */ #endif /* !CONFIG_OBJTOOL */ @@ -47,7 +65,7 @@ * these relocations will never be used for indirect calls. 
*/ #define ANNOTATE_NOENDBR ASM_ANNOTATE(ANNOTYPE_NOENDBR) -#define ANNOTATE_NOENDBR_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOENDBR)) +#define ANNOTATE_NOENDBR_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOENDBR)) /* * This should be used immediately before an indirect jump/call. It tells @@ -58,8 +76,8 @@ /* * See linux/instrumentation.h */ -#define ANNOTATE_INSTR_BEGIN(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_BEGIN) -#define ANNOTATE_INSTR_END(label) __ASM_ANNOTATE(label, ANNOTYPE_INSTR_END) +#define ANNOTATE_INSTR_BEGIN(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_BEGIN) +#define ANNOTATE_INSTR_END(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_INSTR_END) /* * objtool annotation to ignore the alternatives and only consider the original * instruction(s). @@ -83,7 +101,7 @@ * terminating, like a noreturn CALL or UD2 when we know they are not -- eg * WARN using UD2. */ -#define ANNOTATE_REACHABLE(label) __ASM_ANNOTATE(label, ANNOTYPE_REACHABLE) +#define ANNOTATE_REACHABLE(label) ASM_ANNOTATE_LABEL(label, ANNOTYPE_REACHABLE) /* * This should not be used; it annotates away CFI violations. There are a few * valid use cases like kexec handover to the next kernel image, and there is @@ -92,7 +110,13 @@ * There are also a few real issues annotated away, like EFI because we can't * control the EFI code. */ -#define ANNOTATE_NOCFI_SYM(sym) asm(__ASM_ANNOTATE(sym, ANNOTYPE_NOCFI)) +#define ANNOTATE_NOCFI_SYM(sym) asm(ASM_ANNOTATE_LABEL(sym, ANNOTYPE_NOCFI)) + +/* + * Annotate a special section entry. This enables livepatch module generation + * to find and extract individual special section entries as needed. 
+ */ +#define ANNOTATE_DATA_SPECIAL ASM_ANNOTATE_DATA(ANNOTYPE_DATA_SPECIAL) #else /* __ASSEMBLY__ */ #define ANNOTATE_NOENDBR ANNOTATE type=ANNOTYPE_NOENDBR @@ -104,6 +128,7 @@ #define ANNOTATE_UNRET_BEGIN ANNOTATE type=ANNOTYPE_UNRET_BEGIN #define ANNOTATE_REACHABLE ANNOTATE type=ANNOTYPE_REACHABLE #define ANNOTATE_NOCFI_SYM ANNOTATE type=ANNOTYPE_NOCFI +#define ANNOTATE_DATA_SPECIAL ANNOTATE_DATA type=ANNOTYPE_DATA_SPECIAL #endif /* __ASSEMBLY__ */ #endif /* _LINUX_ANNOTATE_H */ diff --git a/include/linux/objtool_types.h b/include/linux/objtool_types.h index aceac94632c8..c6def4049b1a 100644 --- a/include/linux/objtool_types.h +++ b/include/linux/objtool_types.h @@ -67,4 +67,6 @@ struct unwind_hint { #define ANNOTYPE_REACHABLE 8 #define ANNOTYPE_NOCFI 9 +#define ANNOTYPE_DATA_SPECIAL 1 + #endif /* _LINUX_OBJTOOL_TYPES_H */ -- cgit v1.2.3 From aca282ab7e75dd3c1d14230146357a03bef12194 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:55 -0700 Subject: x86/asm: Annotate special section entries In preparation for the objtool klp diff subcommand, add annotations for special section entries. This will enable objtool to determine the size and location of the entries and to extract them when needed. 
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/objtool.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 1973e9f14bf9..4fea6a042b28 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -9,9 +9,10 @@ #ifndef __ASSEMBLY__ -#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ +#define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ + ANNOTATE_DATA_SPECIAL \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ ".short " __stringify(sp_offset) "\n\t" \ @@ -78,6 +79,7 @@ .macro UNWIND_HINT type:req sp_reg=0 sp_offset=0 signal=0 .Lhere_\@: .pushsection .discard.unwind_hints + ANNOTATE_DATA_SPECIAL /* struct unwind_hint */ .long .Lhere_\@ - . .short \sp_offset -- cgit v1.2.3 From f6b740ef5f4724f95363ac0d664e88d221343fa1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:56 -0700 Subject: objtool: Unify STACK_FRAME_NON_STANDARD entry sizes The C implementation of STACK_FRAME_NON_STANDARD emits 8-byte entries, whereas the asm version's entries are only 4 bytes. Make them consistent by converting the asm version to 8-byte entries. This is much easier than converting the C version to 4-bytes, which would require awkwardly putting inline asm in a dummy function in order to pass the 'func' pointer to the asm. Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/objtool.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 4fea6a042b28..b18ab53561c9 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -92,7 +92,7 @@ .macro STACK_FRAME_NON_STANDARD func:req .pushsection .discard.func_stack_frame_non_standard, "aw" - .long \func - . 
+ .quad \func .popsection .endm -- cgit v1.2.3 From dd590d4d57ebeeb826823c288741f2ed20f452af Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:03:59 -0700 Subject: objtool/klp: Introduce klp diff subcommand for diffing object files Add a new klp diff subcommand which performs a binary diff between two object files and extracts changed functions into a new object which can then be linked into a livepatch module. This builds on concepts from the longstanding out-of-tree kpatch [1] project which began in 2012 and has been used for many years to generate livepatch modules for production kernels. However, this is a complete rewrite which incorporates hard-earned lessons from 12+ years of maintaining kpatch. Key improvements compared to kpatch-build: - Integrated with objtool: Leverages objtool's existing control-flow graph analysis to help detect changed functions. - Works on vmlinux.o: Supports late-linked objects, making it compatible with LTO, IBT, and similar. - Simplified code base: ~3k fewer lines of code. - Upstream: No more out-of-tree #ifdef hacks, far less cruft. - Cleaner internals: Vastly simplified logic for symbol/section/reloc inclusion and special section extraction. - Robust __LINE__ macro handling: Avoids false positive binary diffs caused by the __LINE__ macro by introducing a fix-patch-lines script (coming in a later patch) which injects #line directives into the source .patch to preserve the original line numbers at compile time. Note the end result of this subcommand is not yet functionally complete. Livepatch needs some ELF magic which linkers don't like: - Two relocation sections (.rela*, .klp.rela*) for the same text section. - Use of SHN_LIVEPATCH to mark livepatch symbols. Unfortunately linkers tend to mangle such things. To work around that, klp diff generates a linker-compliant intermediate binary which encodes the relevant KLP section/reloc/symbol metadata. 
After module linking, a klp post-link step (coming soon) will clean up the mess and convert the linked .ko into a fully compliant livepatch module. Note this subcommand requires the diffed binaries to have been compiled with -ffunction-sections and -fdata-sections, and processed with 'objtool --checksum'. Those constraints will be handled by a klp-build script introduced in a later patch. Without '-ffunction-sections -fdata-sections', reliable object diffing would be infeasible due to toolchain limitations: - For intra-file+intra-section references, the compiler might occasionally generate hard-coded instruction offsets instead of relocations. - Section-symbol-based references can be ambiguous: - Overlapping or zero-length symbols create ambiguity as to which symbol is being referenced. - A reference to the end of a symbol (e.g., checking array bounds) can be misinterpreted as a reference to the next symbol, or vice versa. A potential future alternative to '-ffunction-sections -fdata-sections' would be to introduce a toolchain option that forces symbol-based (non-section) relocations. 
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/livepatch.h | 25 +------------ include/linux/livepatch_external.h | 76 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 24 deletions(-) create mode 100644 include/linux/livepatch_external.h (limited to 'include') diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 51a258c24ff5..772919e8096a 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #if IS_ENABLED(CONFIG_LIVEPATCH) @@ -77,30 +78,6 @@ struct klp_func { bool transition; }; -struct klp_object; - -/** - * struct klp_callbacks - pre/post live-(un)patch callback structure - * @pre_patch: executed before code patching - * @post_patch: executed after code patching - * @pre_unpatch: executed before code unpatching - * @post_unpatch: executed after code unpatching - * @post_unpatch_enabled: flag indicating if post-unpatch callback - * should run - * - * All callbacks are optional. Only the pre-patch callback, if provided, - * will be unconditionally executed. If the parent klp_object fails to - * patch for any reason, including a non-zero error status returned from - * the pre-patch callback, no further callbacks will be executed. 
- */ -struct klp_callbacks { - int (*pre_patch)(struct klp_object *obj); - void (*post_patch)(struct klp_object *obj); - void (*pre_unpatch)(struct klp_object *obj); - void (*post_unpatch)(struct klp_object *obj); - bool post_unpatch_enabled; -}; - /** * struct klp_object - kernel object structure for live patching * @name: module name (or NULL for vmlinux) diff --git a/include/linux/livepatch_external.h b/include/linux/livepatch_external.h new file mode 100644 index 000000000000..138af19b0f5c --- /dev/null +++ b/include/linux/livepatch_external.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * External livepatch interfaces for patch creation tooling + */ + +#ifndef _LINUX_LIVEPATCH_EXTERNAL_H_ +#define _LINUX_LIVEPATCH_EXTERNAL_H_ + +#include + +#define KLP_RELOC_SEC_PREFIX ".klp.rela." +#define KLP_SYM_PREFIX ".klp.sym." + +#define __KLP_PRE_PATCH_PREFIX __klp_pre_patch_callback_ +#define __KLP_POST_PATCH_PREFIX __klp_post_patch_callback_ +#define __KLP_PRE_UNPATCH_PREFIX __klp_pre_unpatch_callback_ +#define __KLP_POST_UNPATCH_PREFIX __klp_post_unpatch_callback_ + +#define KLP_PRE_PATCH_PREFIX __stringify(__KLP_PRE_PATCH_PREFIX) +#define KLP_POST_PATCH_PREFIX __stringify(__KLP_POST_PATCH_PREFIX) +#define KLP_PRE_UNPATCH_PREFIX __stringify(__KLP_PRE_UNPATCH_PREFIX) +#define KLP_POST_UNPATCH_PREFIX __stringify(__KLP_POST_UNPATCH_PREFIX) + +struct klp_object; + +typedef int (*klp_pre_patch_t)(struct klp_object *obj); +typedef void (*klp_post_patch_t)(struct klp_object *obj); +typedef void (*klp_pre_unpatch_t)(struct klp_object *obj); +typedef void (*klp_post_unpatch_t)(struct klp_object *obj); + +/** + * struct klp_callbacks - pre/post live-(un)patch callback structure + * @pre_patch: executed before code patching + * @post_patch: executed after code patching + * @pre_unpatch: executed before code unpatching + * @post_unpatch: executed after code unpatching + * @post_unpatch_enabled: flag indicating if post-unpatch callback + * should run + * + * 
All callbacks are optional. Only the pre-patch callback, if provided, + * will be unconditionally executed. If the parent klp_object fails to + * patch for any reason, including a non-zero error status returned from + * the pre-patch callback, no further callbacks will be executed. + */ +struct klp_callbacks { + klp_pre_patch_t pre_patch; + klp_post_patch_t post_patch; + klp_pre_unpatch_t pre_unpatch; + klp_post_unpatch_t post_unpatch; + bool post_unpatch_enabled; +}; + +/* + * 'struct klp_{func,object}_ext' are compact "external" representations of + * 'struct klp_{func,object}'. They are used by objtool for livepatch + * generation. The structs are then read by the livepatch module and converted + * to the real structs before calling klp_enable_patch(). + * + * TODO make these the official API for klp_enable_patch(). That should + * simplify livepatch's interface as well as its data structure lifetime + * management. + */ +struct klp_func_ext { + const char *old_name; + void *new_func; + unsigned long sympos; +}; + +struct klp_object_ext { + const char *name; + struct klp_func_ext *funcs; + struct klp_callbacks callbacks; + unsigned int nr_funcs; +}; + +#endif /* _LINUX_LIVEPATCH_EXTERNAL_H_ */ -- cgit v1.2.3 From b9976fa4649627c04dde26183333c3dcc90a0b76 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 17 Sep 2025 09:04:11 -0700 Subject: livepatch: Introduce source code helpers for livepatch modules Add some helper macros which can be used by livepatch source .patch files to register callbacks, convert static calls to regular calls where needed, and patch syscalls. 
Acked-by: Petr Mladek Tested-by: Joe Lawrence Signed-off-by: Josh Poimboeuf --- include/linux/livepatch_helpers.h | 77 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/linux/livepatch_helpers.h (limited to 'include') diff --git a/include/linux/livepatch_helpers.h b/include/linux/livepatch_helpers.h new file mode 100644 index 000000000000..99d68d0773fa --- /dev/null +++ b/include/linux/livepatch_helpers.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIVEPATCH_HELPERS_H +#define _LINUX_LIVEPATCH_HELPERS_H + +/* + * Interfaces for use by livepatch patches + */ + +#include +#include + +#ifdef MODULE +#define KLP_OBJNAME __KBUILD_MODNAME +#else +#define KLP_OBJNAME vmlinux +#endif + +/* Livepatch callback registration */ + +#define KLP_CALLBACK_PTRS ".discard.klp_callback_ptrs" + +#define KLP_PRE_PATCH_CALLBACK(func) \ + klp_pre_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_PATCH_CALLBACK(func) \ + klp_post_patch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_PATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_PRE_UNPATCH_CALLBACK(func) \ + klp_pre_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_PRE_UNPATCH_PREFIX, KLP_OBJNAME) = func + +#define KLP_POST_UNPATCH_CALLBACK(func) \ + klp_post_unpatch_t __used __section(KLP_CALLBACK_PTRS) \ + __PASTE(__KLP_POST_UNPATCH_PREFIX, KLP_OBJNAME) = func + +/* + * Replace static_call() usage with this macro when create-diff-object + * recommends it due to the original static call key living in a module. + * + * This converts the static call to a regular indirect call. + */ +#define KLP_STATIC_CALL(name) \ + ((typeof(STATIC_CALL_TRAMP(name))*)(STATIC_CALL_KEY(name).func)) + +/* Syscall patching */ + +#define KLP_SYSCALL_DEFINE1(name, ...) KLP_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE2(name, ...) 
KLP_SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE3(name, ...) KLP_SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE4(name, ...) KLP_SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE5(name, ...) KLP_SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) +#define KLP_SYSCALL_DEFINE6(name, ...) KLP_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) + +#define KLP_SYSCALL_DEFINEx(x, sname, ...) \ + __KLP_SYSCALL_DEFINEx(x, sname, __VA_ARGS__) + +#ifdef CONFIG_X86_64 +// TODO move this to arch/x86/include/asm/syscall_wrapper.h and share code +#define __KLP_SYSCALL_DEFINEx(x, name, ...) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + __X64_SYS_STUBx(x, name, __VA_ARGS__) \ + __IA32_SYS_STUBx(x, name, __VA_ARGS__) \ + static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ + { \ + long ret = __klp_do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ + } \ + static inline long __klp_do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) + +#endif + +#endif /* _LINUX_LIVEPATCH_HELPERS_H */ -- cgit v1.2.3 From 10c4b4f60f5d0dbd29fa819be76e888501c7b729 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 13 Oct 2025 22:50:27 +0200 Subject: net: mdio: use macro module_driver to avoid boilerplate code Use macro module_driver to avoid boilerplate code. Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/e5c37417-4984-4b57-8154-264deef61e0d@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index c640ba44dd6e..42d6d47e445b 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -689,16 +689,7 @@ struct phy_device *mdiobus_get_phy(struct mii_bus *bus, int addr); * init/exit. 
Each module may only use this macro once, and calling it * replaces module_init() and module_exit(). */ -#define mdio_module_driver(_mdio_driver) \ -static int __init mdio_module_init(void) \ -{ \ - return mdio_driver_register(&_mdio_driver); \ -} \ -module_init(mdio_module_init); \ -static void __exit mdio_module_exit(void) \ -{ \ - mdio_driver_unregister(&_mdio_driver); \ -} \ -module_exit(mdio_module_exit) +#define mdio_module_driver(_mdio_driver) \ + module_driver(_mdio_driver, mdio_driver_register, mdio_driver_unregister) #endif /* __LINUX_MDIO_H__ */ -- cgit v1.2.3 From 433e294c3c5b5d2020085a0e36c1cb47b694690a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 1 Oct 2025 12:56:49 +0200 Subject: regulator: core: forward undervoltage events downstream by default Forward critical supply events downstream so consumers can react in time. An under-voltage event on an upstream rail may otherwise never reach end devices (e.g. eMMC). Register a notifier on a regulator's supply when the supply is resolved, and forward only REGULATOR_EVENT_UNDER_VOLTAGE to the consumer's notifier chain. Event handling is deferred to process context via a workqueue; the consumer rdev is lifetime-pinned and the rdev lock is held while calling the notifier chain. The notifier is unregistered on regulator teardown. No DT/UAPI changes. Behavior applies to all regulators with a supply. 
Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20251001105650.2391477-1-o.rempel@pengutronix.de Signed-off-by: Mark Brown --- include/linux/regulator/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 4a216fdba354..978cf593b662 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -658,6 +658,9 @@ struct regulator_dev { spinlock_t err_lock; int pw_requested_mW; + + /* regulator notification forwarding */ + struct notifier_block supply_fwd_nb; }; /* -- cgit v1.2.3 From 48a97ffc6c826640907d13b199e29008f4fe2c15 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 14 Oct 2025 13:14:03 -0700 Subject: bpf: Consistently use bpf_rcu_lock_held() everywhere We have many places which open-code what is now the bpf_rcu_lock_held() macro, so replace all those places with a clean and short macro invocation. For that, move bpf_rcu_lock_held() macro into include/linux/bpf.h. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jiri Olsa Link: https://lore.kernel.org/bpf/20251014201403.4104511-1-andrii@kernel.org --- include/linux/bpf.h | 3 +++ include/linux/bpf_local_storage.h | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f87fb203aaae..86afd9ac6848 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2381,6 +2381,9 @@ bpf_prog_run_array_uprobe(const struct bpf_prog_array *array, bool bpf_jit_bypass_spec_v1(void); bool bpf_jit_bypass_spec_v4(void); +#define bpf_rcu_lock_held() \ + (rcu_read_lock_held() || rcu_read_lock_trace_held() || rcu_read_lock_bh_held()) + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); extern struct mutex bpf_stats_enabled_mutex; diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index ab7244d8108f..782f58feea35 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -18,9 +18,6 @@ #define BPF_LOCAL_STORAGE_CACHE_SIZE 16 -#define bpf_rcu_lock_held() \ - (rcu_read_lock_held() || rcu_read_lock_trace_held() || \ - rcu_read_lock_bh_held()) struct bpf_local_storage_map_bucket { struct hlist_head list; raw_spinlock_t lock; -- cgit v1.2.3 From 1c51450f1afff1e7419797720df3fbd9ccbf610c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Oct 2025 14:59:26 +0000 Subject: tcp: better handle TCP_TX_DELAY on established flows Some applications use the TCP_TX_DELAY socket option after a TCP flow is established. Some metrics need to be updated, otherwise TCP might take time to adapt to the new (emulated) RTT. This patch adjusts tp->srtt_us, tp->rtt_min, icsk_rto and sk->sk_pacing_rate. This is best effort, and for instance icsk_rto is reset without taking backoff into account. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251013145926.833198-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 5ca230ed526a..1e547138f4fb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -461,6 +461,8 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag); void tcp_clear_retrans(struct tcp_sock *tp); +void tcp_update_pacing_rate(struct sock *sk); +void tcp_set_rto(struct sock *sk); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); -- cgit v1.2.3 From 6ddb811a579f87b8506344020002d396f814f7c8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Oct 2025 15:22:31 +0000 Subject: net: add SK_WMEM_ALLOC_BIAS constant sk->sk_wmem_alloc is initialized to 1, and sk_wmem_alloc_get() takes care of this initial value. Add SK_WMEM_ALLOC_BIAS define to not spread this magic value. 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251013152234.842065-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 60bcb13f045c..2794bc5c5654 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2303,6 +2303,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro return 0; } +#define SK_WMEM_ALLOC_BIAS 1 /** * sk_wmem_alloc_get - returns write allocations * @sk: socket @@ -2311,7 +2312,7 @@ static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *fro */ static inline int sk_wmem_alloc_get(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) - 1; + return refcount_read(&sk->sk_wmem_alloc) - SK_WMEM_ALLOC_BIAS; } /** -- cgit v1.2.3 From 2ddef3462b3a5d62e5485e22ce128a5c02276438 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Oct 2025 15:22:33 +0000 Subject: net: add /proc/sys/net/core/txq_reselection_ms control Add a new sysctl to control how often a queue reselection can happen even if a flow has a persistent queue of skbs in a Qdisc or NIC queue. A value of zero means the feature is disabled. Default is 1000 (1 second). This sysctl is used in the following patch. 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251013152234.842065-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index 9b36f0ff0c20..cb9c3e4cd738 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -13,6 +13,7 @@ struct netns_core { struct ctl_table_header *sysctl_hdr; int sysctl_somaxconn; + int sysctl_txq_reselection; int sysctl_optmem_max; u8 sysctl_txrehash; u8 sysctl_tstamp_allow_data; -- cgit v1.2.3 From 4a7708443dec13b074bc43855f494358fedbd3c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 13 Oct 2025 15:22:34 +0000 Subject: net: allow busy connected flows to switch tx queues This is a followup of commit 726e9e8b94b9 ("tcp: refine skb->ooo_okay setting") and of prior commit in this series ("net: control skb->ooo_okay from skb_set_owner_w()") skb->ooo_okay might never be set for bulk flows that always have at least one skb in a qdisc queue or NIC queue, especially if TX completion is delayed because of a stressed cpu. The so-called "strange attractors" have caused many performance issues (see for instance 9b462d02d6dd ("tcp: TCP Small Queues and strange attractors")), we need to do better. We have tried very hard to avoid reorders because TCP was not dealing with them nicely a decade ago. Use the new net.core.txq_reselection_ms sysctl to let flows follow XPS and select a more efficient queue. After this patch, we no longer have to make sure threads are pinned to cpus, they now can be migrated without adding too much spinlock/qdisc/TX completion pressure anymore. TX completion part was problematic, because it added false sharing on various socket fields, but also added false sharing and spinlock contention in mm layers. Calling skb_orphan() from ndo_start_xmit() is not an option unfortunately.
Note for later: 1) move sk->sk_tx_queue_mapping closer to sk_tx_queue_mapping_jiffies for better cache locality. 2) Study if 9b462d02d6dd ("tcp: TCP Small Queues and strange attractors") could be revised. Tested: Used a host with 32 TX queues, shared by groups of 8 cores. XPS setup : echo ff >/sys/class/net/eth1/queue/tx-0/xps_cpus echo ff00 >/sys/class/net/eth1/queue/tx-1/xps_cpus echo ff0000 >/sys/class/net/eth1/queue/tx-2/xps_cpus echo ff000000 >/sys/class/net/eth1/queue/tx-3/xps_cpus echo ff,00000000 >/sys/class/net/eth1/queue/tx-4/xps_cpus echo ff00,00000000 >/sys/class/net/eth1/queue/tx-5/xps_cpus echo ff0000,00000000 >/sys/class/net/eth1/queue/tx-6/xps_cpus echo ff000000,00000000 >/sys/class/net/eth1/queue/tx-7/xps_cpus ... Launched a tcp_stream with 15 threads and 1000 flows, initially affined to core 0-15 taskset -c 0-15 tcp_stream -T15 -F1000 -l1000 -c -H target_host Checked that only queues 0 and 1 are used as instructed by XPS : tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p" backlog 123489410b 1890p backlog 69809026b 1064p backlog 52401054b 805p Then force each thread to run on cpu 1,9,17,25,33,41,49,57,65,73,81,89,97,105,113,121 C=1;PID=`pidof tcp_stream`;for P in `ls /proc/$PID/task`; do taskset -pc $C $P; C=$(($C + 8));done Set txq_reselection_ms to 1000 echo 1000 > /proc/sys/net/core/txq_reselection_ms Check that the flows have migrated nicely: tc -s qdisc show dev eth1|grep backlog|grep -v "backlog 0b 0p" backlog 130508314b 1916p backlog 8584380b 126p backlog 8584380b 126p backlog 8379990b 123p backlog 8584380b 126p backlog 8487484b 125p backlog 8584380b 126p backlog 8448120b 124p backlog 8584380b 126p backlog 8720640b 128p backlog 8856900b 130p backlog 8584380b 126p backlog 8652510b 127p backlog 8448120b 124p backlog 8516250b 125p backlog 7834950b 115p Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251013152234.842065-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- 
include/net/sock.h | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2794bc5c5654..f0d00928db9e 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -313,6 +313,7 @@ struct sk_filter; * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_tx_queue_mapping_jiffies: time in jiffies of last @sk_tx_queue_mapping refresh. * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. @@ -485,6 +486,7 @@ struct sock { unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; + unsigned long sk_tx_queue_mapping_jiffies; __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); @@ -1992,7 +1994,15 @@ static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) /* Paired with READ_ONCE() in sk_tx_queue_get() and * other WRITE_ONCE() because socket lock might be not held. */ - WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + if (READ_ONCE(sk->sk_tx_queue_mapping) != tx_queue) { + WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue); + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); + return; + } + + /* Refresh sk_tx_queue_mapping_jiffies if too old. */ + if (time_is_before_jiffies(READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + HZ)) + WRITE_ONCE(sk->sk_tx_queue_mapping_jiffies, jiffies); } #define NO_QUEUE_MAPPING USHRT_MAX @@ -2005,19 +2015,7 @@ static inline void sk_tx_queue_clear(struct sock *sk) WRITE_ONCE(sk->sk_tx_queue_mapping, NO_QUEUE_MAPPING); } -static inline int sk_tx_queue_get(const struct sock *sk) -{ - if (sk) { - /* Paired with WRITE_ONCE() in sk_tx_queue_clear() - * and sk_tx_queue_set(). 
- */ - int val = READ_ONCE(sk->sk_tx_queue_mapping); - - if (val != NO_QUEUE_MAPPING) - return val; - } - return -1; -} +int sk_tx_queue_get(const struct sock *sk); static inline void __sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb, -- cgit v1.2.3 From 378e6523ebb1e80b3955b7675cfe40b07028d085 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 14 Oct 2025 08:02:47 +0200 Subject: net: bcmgenet: remove unused platform code This effectively reverts b0ba512e25d7 ("net: bcmgenet: enable driver to work without a device tree"). There has never been an in-tree user of struct bcmgenet_platform_data, all devices use OF or ACPI. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/108b4e64-55d4-4b4e-9a11-3c810c319d66@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/platform_data/bcmgenet.h | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 include/linux/platform_data/bcmgenet.h (limited to 'include') diff --git a/include/linux/platform_data/bcmgenet.h b/include/linux/platform_data/bcmgenet.h deleted file mode 100644 index d8f8738629d2..000000000000 --- a/include/linux/platform_data/bcmgenet.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_PLATFORM_DATA_BCMGENET_H__ -#define __LINUX_PLATFORM_DATA_BCMGENET_H__ - -#include -#include -#include - -struct bcmgenet_platform_data { - bool mdio_enabled; - phy_interface_t phy_interface; - int phy_address; - int phy_speed; - int phy_duplex; - u8 mac_address[ETH_ALEN]; - int genet_version; -}; - -#endif -- cgit v1.2.3 From 44f5c8ec5b9ad8ed4ade08d727f803b2bb07f1c3 Mon Sep 17 00:00:00 2001 From: Ryan Newton Date: Wed, 15 Oct 2025 11:50:35 -0400 Subject: sched_ext: Add lockless peek operation for DSQs The builtin DSQ queue data structures are meant to be used by a wide range of different sched_ext schedulers with different demands on these data structures. 
They might be per-cpu with low-contention, or high-contention shared queues. Unfortunately, DSQs have a coarse-grained lock around the whole data structure. Without going all the way to a lock-free, more scalable implementation, a small step we can take to reduce lock contention is to allow a lockless, small-fixed-cost peek at the head of the queue. This change allows certain custom SCX schedulers to cheaply peek at queues, e.g. during load balancing, before locking them. But it represents a few extra memory operations to update the pointer each time the DSQ is modified, including a memory barrier on ARM so the write appears correctly ordered. This commit adds a first_task pointer field which is updated atomically when the DSQ is modified, and allows any thread to peek at the head of the queue without holding the lock. Signed-off-by: Ryan Newton Reviewed-by: Andrea Righi Reviewed-by: Christian Loehle Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 9848aeab2786..4713f374acc0 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -58,6 +58,7 @@ enum scx_dsq_id_flags { */ struct scx_dispatch_q { raw_spinlock_t lock; + struct task_struct __rcu *first_task; /* lockless peek at head */ struct list_head list; /* tasks in dispatch order */ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ u32 nr; -- cgit v1.2.3 From e5b670e5439bda09ea7e3dd3dd32edb2f367c0d3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Oct 2025 14:06:05 +0000 Subject: net: remove obsolete WARN_ON(refcount_read(&sk->sk_refcnt) == 1) sk->sk_refcnt has been converted to refcount_t in 2017. __sock_put(sk) being refcount_dec(&sk->sk_refcnt), it will complain loudly if the current refcnt is 1 (or less) in a non racy way. We can remove four WARN_ON() in favor of the generic refcount_dec() check. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Xuanqiang Luo Link: https://patch.msgid.link/20251014140605.2982703-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index f0d00928db9e..30ac2eb4ef9b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -830,11 +830,9 @@ static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) @@ -852,11 +850,9 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); - if (rc) { - /* paranoid for a while -acme */ - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (rc) __sock_put(sk); - } + return rc; } -- cgit v1.2.3 From e9139f765ac7048cadc9981e962acdf8b08eabf3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 30 Oct 2024 13:43:43 +0100 Subject: sched: Employ sched_change guards As proposed a long while ago -- and half done by scx -- wrap the scheduler's 'change' pattern in a guard helper. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Tejun Heo Acked-by: Vincent Guittot --- include/linux/cleanup.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..ae381675455d 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -340,6 +340,11 @@ _label: \ #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond +#define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ + { return (void *)1; } + #define __GUARD_IS_ERR(_ptr) \ ({ \ unsigned long _rc = (__force unsigned long)(_ptr); \ -- cgit v1.2.3 From b079d93796528053cde322f2ca838c2d21c297e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Sep 2025 10:08:05 +0200 Subject: sched: Rename do_set_cpus_allowed() Hopefully saner naming. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Tejun Heo Acked-by: Vincent Guittot --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..77426c347cff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1861,8 +1861,8 @@ extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ -extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); +/* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ +extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task -- cgit v1.2.3 From 50653216e4ff7a74c95b2ee9ec439916875556ec Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sat, 9 Aug 2025 14:47:50 -0400 Subject: sched: Add support to pick functions to take rf Some pick functions like the internal pick_next_task_fair() already take rf but some others don't. We need this for scx's server pick function. Prepare for this by having pick functions accept it.
[peterz: - added RETRY_TASK handling - removed pick_next_task_fair indirection] Signed-off-by: Joel Fernandes Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo --- include/linux/sched.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 77426c347cff..07576479c0ed 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -637,8 +637,8 @@ struct sched_rt_entity { #endif } __randomize_layout; -typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); -typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); +struct rq_flags; +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); struct sched_dl_entity { struct rb_node rb_node; @@ -730,9 +730,6 @@ struct sched_dl_entity { * dl_server_update(). * * @rq the runqueue this server is for - * - * @server_has_tasks() returns true if @server_pick return a - * runnable task. */ struct rq *rq; dl_server_pick_f server_pick_task; -- cgit v1.2.3 From 25937d399be2ee9852103a41aaca42d91b140d79 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Thu, 9 Oct 2025 18:10:39 +0200 Subject: dt-bindings: power: Add power domain IDs for Tegra264 Add the set of power domain IDs available on the Tegra264 SoC so that they can be used in device tree files. 
Acked-by: Rob Herring (Arm) Signed-off-by: Thierry Reding --- include/dt-bindings/power/nvidia,tegra264-bpmp.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 include/dt-bindings/power/nvidia,tegra264-bpmp.h (limited to 'include') diff --git a/include/dt-bindings/power/nvidia,tegra264-bpmp.h b/include/dt-bindings/power/nvidia,tegra264-bpmp.h new file mode 100644 index 000000000000..2eef4a2a02b0 --- /dev/null +++ b/include/dt-bindings/power/nvidia,tegra264-bpmp.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved. */ + +#ifndef DT_BINDINGS_POWER_NVIDIA_TEGRA264_BPMP_H +#define DT_BINDINGS_POWER_NVIDIA_TEGRA264_BPMP_H + +#define TEGRA264_POWER_DOMAIN_DISP 1 +#define TEGRA264_POWER_DOMAIN_AUD 2 +/* reserved 3:9 */ +#define TEGRA264_POWER_DOMAIN_XUSB_SS 10 +#define TEGRA264_POWER_DOMAIN_XUSB_DEV 11 +#define TEGRA264_POWER_DOMAIN_XUSB_HOST 12 +#define TEGRA264_POWER_DOMAIN_MGBE0 13 +#define TEGRA264_POWER_DOMAIN_MGBE1 14 +#define TEGRA264_POWER_DOMAIN_MGBE2 15 +#define TEGRA264_POWER_DOMAIN_MGBE3 16 +#define TEGRA264_POWER_DOMAIN_VI 17 +#define TEGRA264_POWER_DOMAIN_VIC 18 +#define TEGRA264_POWER_DOMAIN_ISP0 19 +#define TEGRA264_POWER_DOMAIN_ISP1 20 +#define TEGRA264_POWER_DOMAIN_PVA0 21 +#define TEGRA264_POWER_DOMAIN_GPU 22 + +#endif /* DT_BINDINGS_POWER_NVIDIA_TEGRA264_BPMP_H */ -- cgit v1.2.3 From ae495810cffe29c3c30a757bd48b0bb035fc3098 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Tue, 14 Oct 2025 18:53:53 +0300 Subject: gpio: regmap: add the .fixed_direction_output configuration parameter There are GPIO controllers such as the one present in the LX2160ARDB QIXIS FPGA which have fixed-direction input and output GPIO lines mixed together in a single register. This cannot be modeled using the gpio-regmap as-is since there is no way to present the true direction of a GPIO line. 
In order to make this use case possible, add a new configuration parameter - fixed_direction_output - into the gpio_regmap_config structure. This will enable user drivers to provide a bitmap that represents the fixed direction of the GPIO lines. Signed-off-by: Ioana Ciornei Acked-by: Bartosz Golaszewski Reviewed-by: Michael Walle Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/regmap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index 622a2939ebe0..87983a5f3681 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -38,6 +38,10 @@ struct regmap; * offset to a register/bitmask pair. If not * given the default gpio_regmap_simple_xlate() * is used. + * @fixed_direction_output: + * (Optional) Bitmap representing the fixed direction of + * the GPIO lines. Useful when there are GPIO lines with a + * fixed direction mixed together in the same register. * @drvdata: (Optional) Pointer to driver specific data which is * not used by gpio-remap but is provided "as is" to the * driver callback(s). @@ -85,6 +89,7 @@ struct gpio_regmap_config { int reg_stride; int ngpio_per_reg; struct irq_domain *irq_domain; + unsigned long *fixed_direction_output; #ifdef CONFIG_REGMAP_IRQ struct regmap_irq_chip *regmap_irq_chip; -- cgit v1.2.3 From eba11116f39533d2e38cc5898014f2c95f32d23a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 13 Oct 2025 15:07:15 +0200 Subject: gpiolib: of: Get rid of <linux/gpio/legacy-of-mm-gpiochip.h> Last user of linux/gpio/legacy-of-mm-gpiochip.h is gone.
Remove linux/gpio/legacy-of-mm-gpiochip.h and CONFIG_OF_GPIO_MM_GPIOCHIP Signed-off-by: Christophe Leroy Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/legacy-of-mm-gpiochip.h | 36 ------------------------------ 1 file changed, 36 deletions(-) delete mode 100644 include/linux/gpio/legacy-of-mm-gpiochip.h (limited to 'include') diff --git a/include/linux/gpio/legacy-of-mm-gpiochip.h b/include/linux/gpio/legacy-of-mm-gpiochip.h deleted file mode 100644 index 2e2bd3b19cc3..000000000000 --- a/include/linux/gpio/legacy-of-mm-gpiochip.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * OF helpers for the old of_mm_gpio_chip, used on ppc32 and nios2, - * do not use in new code. - * - * Copyright (c) 2007-2008 MontaVista Software, Inc. - * - * Author: Anton Vorontsov - */ - -#ifndef __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H -#define __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H - -#include -#include - -/* - * OF GPIO chip for memory mapped banks - */ -struct of_mm_gpio_chip { - struct gpio_chip gc; - void (*save_regs)(struct of_mm_gpio_chip *mm_gc); - void __iomem *regs; -}; - -static inline struct of_mm_gpio_chip *to_of_mm_gpio_chip(struct gpio_chip *gc) -{ - return container_of(gc, struct of_mm_gpio_chip, gc); -} - -extern int of_mm_gpiochip_add_data(struct device_node *np, - struct of_mm_gpio_chip *mm_gc, - void *data); -extern void of_mm_gpiochip_remove(struct of_mm_gpio_chip *mm_gc); - -#endif /* __LINUX_GPIO_LEGACY_OF_MM_GPIO_CHIP_H */ -- cgit v1.2.3 From 1e3e330c07076a0582385bbea029c9cc918fa30d Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 13 Oct 2025 11:46:11 +0200 Subject: irqchip: Pass platform device to platform drivers The IRQCHIP_PLATFORM_DRIVER macros can be used to convert OF irqchip drivers to platform drivers but currently reuse the OF init callback prototype that only takes OF nodes as arguments. 
This forces drivers to do reverse lookups of their struct devices during probe if they need them for things like dev_printk() and device managed resources. Half of the drivers doing reverse lookups also currently fail to release the additional reference taken during the lookup, while other drivers have had the reference leak plugged in various ways (e.g. using non-intuitive cleanup constructs which still confuse static checkers). Switch to using a probe callback that takes a platform device as its first argument to simplify drivers and plug the remaining (mostly benign) reference leaks. Fixes: 32c6c054661a ("irqchip: Add Broadcom BCM2712 MSI-X interrupt controller") Fixes: 70afdab904d2 ("irqchip: Add IMX MU MSI controller driver") Fixes: a6199bb514d8 ("irqchip: Add Qualcomm MPM controller driver") Signed-off-by: Johan Hovold Signed-off-by: Thomas Gleixner Reviewed-by: Florian Fainelli Reviewed-by: Changhuang Liang --- include/linux/irqchip.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irqchip.h b/include/linux/irqchip.h index d5e6024cb2a8..bc4ddacd6ddc 100644 --- a/include/linux/irqchip.h +++ b/include/linux/irqchip.h @@ -17,12 +17,18 @@ #include #include +typedef int (*platform_irq_probe_t)(struct platform_device *, struct device_node *); + /* Undefined on purpose */ extern of_irq_init_cb_t typecheck_irq_init_cb; +extern platform_irq_probe_t typecheck_irq_probe; #define typecheck_irq_init_cb(fn) \ (__typecheck(typecheck_irq_init_cb, &fn) ? fn : fn) +#define typecheck_irq_probe(fn) \ + (__typecheck(typecheck_irq_probe, &fn) ? 
fn : fn) + /* * This macro must be used by the different irqchip drivers to declare * the association between their DT compatible string and their @@ -42,7 +48,7 @@ extern int platform_irqchip_probe(struct platform_device *pdev); static const struct of_device_id drv_name##_irqchip_match_table[] = { #define IRQCHIP_MATCH(compat, fn) { .compatible = compat, \ - .data = typecheck_irq_init_cb(fn), }, + .data = typecheck_irq_probe(fn), }, #define IRQCHIP_PLATFORM_DRIVER_END(drv_name, ...) \ -- cgit v1.2.3 From 7c268eaeec6388b7bee36aef3fb5e62c9222ad3b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:55 +0000 Subject: net: Allow opt-out from global protocol memory accounting. Some protocols (e.g., TCP, UDP) implement memory accounting for socket buffers and charge memory to per-protocol global counters pointed to by sk->sk_proto->memory_allocated. Sometimes, system processes do not want that limitation. For a similar purpose, there is SO_RESERVE_MEM for sockets under memcg. Also, by opting out of the per-protocol accounting, sockets under memcg can avoid paying costs for two orthogonal memory accounting mechanisms. A microbenchmark result is in the subsequent bpf patch. Let's allow opt-out from the per-protocol memory accounting if sk->sk_bypass_prot_mem is true. sk->sk_bypass_prot_mem and sk->sk_prot are placed in the same cache line, and sk_has_account() always fetches sk->sk_prot before accessing sk->sk_bypass_prot_mem, so there is no extra cache miss for this patch. The following patches will set sk->sk_bypass_prot_mem to true, and then, the per-protocol memory accounting will be skipped. Note that this does NOT disable memcg, but rather the per-protocol one. Another option not to use the hole in struct sock_common is create sk_prot variants like tcp_prot_bypass, but this would complicate SOCKMAP logic, tcp_bpf_prots etc. 
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Shakeel Butt Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-3-kuniyu@google.com --- include/net/proto_memory.h | 3 +++ include/net/sock.h | 3 +++ include/net/tcp.h | 3 +++ 3 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/proto_memory.h b/include/net/proto_memory.h index 8e91a8fa31b5..ad6d703ce6fe 100644 --- a/include/net/proto_memory.h +++ b/include/net/proto_memory.h @@ -35,6 +35,9 @@ static inline bool sk_under_memory_pressure(const struct sock *sk) mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return !!READ_ONCE(*sk->sk_prot->memory_pressure); } diff --git a/include/net/sock.h b/include/net/sock.h index 30ac2eb4ef9b..415e7381aa50 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -118,6 +118,7 @@ typedef __u64 __bitwise __addrpair; * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting + * @skc_bypass_prot_mem: bypass the per-protocol memory accounting for skb * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol @@ -174,6 +175,7 @@ struct sock_common { unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; + unsigned char skc_bypass_prot_mem:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -381,6 +383,7 @@ struct sock { #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt +#define sk_bypass_prot_mem __sk_common.skc_bypass_prot_mem #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot diff --git 
a/include/net/tcp.h b/include/net/tcp.h index 1e547138f4fb..439e327fdbfa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -303,6 +303,9 @@ static inline bool tcp_under_memory_pressure(const struct sock *sk) mem_cgroup_sk_under_memory_pressure(sk)) return true; + if (sk->sk_bypass_prot_mem) + return false; + return READ_ONCE(tcp_memory_pressure); } /* -- cgit v1.2.3 From b46ab63181ff973ddce44ebc9ac24b269d42f481 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:56 +0000 Subject: net: Introduce net.core.bypass_prot_mem sysctl. If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out of the global protocol memory accounting. Let's control the flag by a new sysctl knob. The flag is written once during socket(2) and is inherited to child sockets. Tested with a script that creates local socket pairs and send()s a bunch of data without recv()ing. Setup: # mkdir /sys/fs/cgroup/test # echo $$ >> /sys/fs/cgroup/test/cgroup.procs # sysctl -q net.ipv4.tcp_mem="1000 1000 1000" # ulimit -n 524288 Without net.core.bypass_prot_mem, charged to tcp_mem & memcg # python3 pressure.py & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 22642688 <-------------------------------------- charged to memcg # cat /proc/net/sockstat| grep TCP TCP: inuse 2006 orphan 0 tw 0 alloc 2008 mem 5376 <-- charged to tcp_mem # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53188 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:49972 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53868 ESTAB 2000 0 127.0.0.1:34479 127.0.0.1:53554 # nstat | grep Pressure || echo no pressure TcpExtTCPMemoryPressures 1 0.0 With net.core.bypass_prot_mem=1, charged to memcg only: # sysctl -q net.core.bypass_prot_mem=1 # python3 pressure.py & # cat /sys/fs/cgroup/test/memory.stat | grep sock sock 2757468160 <------------------------------------ charged to memcg # cat /proc/net/sockstat | grep TCP TCP: inuse 2006 orphan 0 tw 0 
alloc 2008 mem 0 <- NOT charged to tcp_mem # ss -tn | head -n 5 State Recv-Q Send-Q Local Address:Port Peer Address:Port ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:49026 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:45630 ESTAB 110000 0 127.0.0.1:36019 127.0.0.1:44870 ESTAB 111000 0 127.0.0.1:36019 127.0.0.1:45274 # nstat | grep Pressure || echo no pressure no pressure Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Reviewed-by: Shakeel Butt Reviewed-by: Eric Dumazet Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-4-kuniyu@google.com --- include/net/netns/core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/core.h b/include/net/netns/core.h index cb9c3e4cd738..9ef3d70e5e9c 100644 --- a/include/net/netns/core.h +++ b/include/net/netns/core.h @@ -17,6 +17,7 @@ struct netns_core { int sysctl_optmem_max; u8 sysctl_txrehash; u8 sysctl_tstamp_allow_data; + u8 sysctl_bypass_prot_mem; #ifdef CONFIG_PROC_FS struct prot_inuse __percpu *prot_inuse; -- cgit v1.2.3 From 38163af068810b388f6723a681dfd8c7b3680d38 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 23:54:58 +0000 Subject: bpf: Introduce SK_BPF_BYPASS_PROT_MEM. If a socket has sk->sk_bypass_prot_mem flagged, the socket opts out of the global protocol memory accounting. This is easily controlled by net.core.bypass_prot_mem sysctl, but it lacks flexibility. Let's support flagging (and clearing) sk->sk_bypass_prot_mem via bpf_setsockopt() at the BPF_CGROUP_INET_SOCK_CREATE hook. int val = 1; bpf_setsockopt(ctx, SOL_SOCKET, SK_BPF_BYPASS_PROT_MEM, &val, sizeof(val)); As with net.core.bypass_prot_mem, this is inherited to child sockets, and BPF always takes precedence over sysctl at socket(2) and accept(2). SK_BPF_BYPASS_PROT_MEM is only supported at BPF_CGROUP_INET_SOCK_CREATE and not supported on other hooks for some reasons: 1. UDP charges memory under sk->sk_receive_queue.lock instead of lock_sock() 2. 
Modifying the flag after skb is charged to sk requires such adjustment during bpf_setsockopt() and complicates the logic unnecessarily We can support other hooks later if a real use case justifies that. Most changes are inline and hard to trace, but a microbenchmark on __sk_mem_raise_allocated() during neper/tcp_stream showed that more samples completed faster with sk->sk_bypass_prot_mem == 1. This will be more visible under tcp_mem pressure (but it's not a fair comparison). # bpftrace -e 'kprobe:__sk_mem_raise_allocated { @start[tid] = nsecs; } kretprobe:__sk_mem_raise_allocated /@start[tid]/ { @end[tid] = nsecs - @start[tid]; @times = hist(@end[tid]); delete(@start[tid]); }' # tcp_stream -6 -F 1000 -N -T 256 Without bpf prog: [128, 256) 3846 | | [256, 512) 1505326 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1371006 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 198207 |@@@@@@ | [2K, 4K) 31199 |@ | With bpf prog in the next patch: (must be attached before tcp_stream) # bpftool prog load sk_bypass_prot_mem.bpf.o /sys/fs/bpf/test type cgroup/sock_create # bpftool cgroup attach /sys/fs/cgroup/test cgroup_inet_sock_create pinned /sys/fs/bpf/test [128, 256) 6413 | | [256, 512) 1868425 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 1101697 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [1K, 2K) 117031 |@@@@ | [2K, 4K) 11773 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Martin KaFai Lau Acked-by: Roman Gushchin Link: https://patch.msgid.link/20251014235604.3057003-6-kuniyu@google.com --- include/uapi/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6829936d33f5..6eb75ad900b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -7200,6 +7200,8 @@ enum { TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ TCP_BPF_SOCK_OPS_CB_FLAGS = 1008, /* Get or Set TCP sock ops flags */ SK_BPF_CB_FLAGS = 1009, /* 
Get or set sock ops flags in socket */ + SK_BPF_BYPASS_PROT_MEM = 1010, /* Get or Set sk->sk_bypass_prot_mem */ + }; enum { -- cgit v1.2.3 From dce745009349fc391271c9415d5e242781ddadd7 Mon Sep 17 00:00:00 2001 From: Nam Cao Date: Mon, 21 Jul 2025 08:36:26 +0200 Subject: PCI/MSI: Delete pci_msi_create_irq_domain() pci_msi_create_irq_domain() is now unused. Delete it. Signed-off-by: Nam Cao Signed-off-by: Thomas Gleixner Acked-by: Bjorn Helgaas --- include/linux/msi.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/msi.h b/include/linux/msi.h index d415dd15a0a9..8003e3218c46 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -701,9 +701,6 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data); -struct irq_domain *pci_msi_create_irq_domain(struct fwnode_handle *fwnode, - struct msi_domain_info *info, - struct irq_domain *parent); u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev); u32 pci_msi_map_rid_ctlr_node(struct pci_dev *pdev, struct device_node **node); struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev); -- cgit v1.2.3 From fe946a751d9b52b7c45ca34899723b314b79b249 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Oct 2025 17:19:04 +0000 Subject: net/sched: act_mirred: add loop detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 0f022d32c3ec ("net/sched: Fix mirred deadlock on device recursion") added code in the fast path, even when act_mirred is not used. Prepare its revert by implementing loop detection in act_mirred. Adds an array of device pointers in struct netdev_xmit. tcf_mirred_is_act_redirect() can detect if the array already contains the target device. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Toke Høiland-Jørgensen Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251014171907.3554413-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice_xmit.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice_xmit.h b/include/linux/netdevice_xmit.h index 813a19122ebb..cc232508e695 100644 --- a/include/linux/netdevice_xmit.h +++ b/include/linux/netdevice_xmit.h @@ -2,6 +2,12 @@ #ifndef _LINUX_NETDEVICE_XMIT_H #define _LINUX_NETDEVICE_XMIT_H +#if IS_ENABLED(CONFIG_NET_ACT_MIRRED) +#define MIRRED_NEST_LIMIT 4 +#endif + +struct net_device; + struct netdev_xmit { u16 recursion; u8 more; @@ -9,7 +15,8 @@ struct netdev_xmit { u8 skip_txqueue; #endif #if IS_ENABLED(CONFIG_NET_ACT_MIRRED) - u8 sched_mirred_nest; + u8 sched_mirred_nest; + struct net_device *sched_mirred_dev[MIRRED_NEST_LIMIT]; #endif #if IS_ENABLED(CONFIG_NF_DUP_NETDEV) u8 nf_dup_skb_recursion; -- cgit v1.2.3 From 178ca30889a13b555dddab7689fd2cc58c8e5dac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Oct 2025 17:19:05 +0000 Subject: Revert "net/sched: Fix mirred deadlock on device recursion" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commits 0f022d32c3eca477fbf79a205243a6123ed0fe11 and 44180feaccf266d9b0b28cc4ceaac019817deb5c. The prior patch in this series implemented loop detection in act_mirred, so we can remove q->owner to save some cycles in the fast path.
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Victor Nogueira Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251014171907.3554413-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 738cd5b13c62..32e9961570b4 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -117,7 +117,6 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; - int owner; unsigned long state; unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; -- cgit v1.2.3 From 526f5fb112f7c89c5a9b8b2f9870c8cb76ca4e42 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Oct 2025 17:19:06 +0000 Subject: net: sched: claim one cache line in Qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace state2 field with a boolean. Move it to a hole between qstats and state so that we shrink Qdisc by a full cache line. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Toke Høiland-Jørgensen Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251014171907.3554413-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 32e9961570b4..31561291bc92 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -41,13 +41,6 @@ enum qdisc_state_t { __QDISC_STATE_DRAINING, }; -enum qdisc_state2_t { - /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. - * Use qdisc_run_begin/end() or qdisc_is_running() instead. 
- */ - __QDISC_STATE2_RUNNING, -}; - #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) @@ -117,8 +110,8 @@ struct Qdisc { struct qdisc_skb_head q; struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; + bool running; /* must be written under qdisc spinlock */ unsigned long state; - unsigned long state2; /* must be written under qdisc spinlock */ struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; @@ -167,7 +160,7 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); - return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + return READ_ONCE(qdisc->running); } static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) @@ -210,7 +203,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) */ return spin_trylock(&qdisc->seqlock); } - return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + if (READ_ONCE(qdisc->running)) + return false; + WRITE_ONCE(qdisc->running, true); + return true; } static inline void qdisc_run_end(struct Qdisc *qdisc) @@ -228,7 +224,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) &qdisc->state))) __netif_schedule(qdisc); } else { - __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + WRITE_ONCE(qdisc->running, false); } } -- cgit v1.2.3 From 100dfa74cad9d4665cdcf0cc8e673b123a3ea910 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 14 Oct 2025 17:19:07 +0000 Subject: net: dev_queue_xmit() llist adoption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove busylock spinlock and use a lockless list (llist) to reduce spinlock contention to the minimum. Idea is that only one cpu might spin on the qdisc spinlock, while others simply add their skb in the llist. After this patch, we get a 300 % improvement on heavy TX workloads. - Sending twice the number of packets per second. 
- While consuming 50 % less cycles. Note that this also allows in the future to submit batches to various qdisc->enqueue() methods. Tested: - Dual Intel(R) Xeon(R) 6985P-C (480 hyper threads). - 100Gbit NIC, 30 TX queues with FQ packet scheduler. - echo 64 >/sys/kernel/slab/skbuff_small_head/cpu_partial (avoid contention in mm) - 240 concurrent "netperf -t UDP_STREAM -- -m 120 -n" Before: 16 Mpps (41 Mpps if each thread is pinned to a different cpu) vmstat 2 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 243 0 0 2368988672 51036 1100852 0 0 146 1 242 60 0 9 91 0 0 244 0 0 2368988672 51036 1100852 0 0 536 10 487745 14718 0 52 48 0 0 244 0 0 2368988672 51036 1100852 0 0 512 0 503067 46033 0 52 48 0 0 244 0 0 2368988672 51036 1100852 0 0 512 0 494807 12107 0 52 48 0 0 244 0 0 2368988672 51036 1100852 0 0 702 26 492845 10110 0 52 48 0 0 Lock contention (1 second sample taken on 8 cores) perf lock record -C0-7 sleep 1; perf lock contention contended total wait max wait avg wait type caller 442111 6.79 s 162.47 ms 15.35 us spinlock dev_hard_start_xmit+0xcd 5961 9.57 ms 8.12 us 1.60 us spinlock __dev_queue_xmit+0x3a0 244 560.63 us 7.63 us 2.30 us spinlock do_softirq+0x5b 13 25.09 us 3.21 us 1.93 us spinlock net_tx_action+0xf8 If netperf threads are pinned, spinlock stress is very high. 
perf lock record -C0-7 sleep 1; perf lock contention contended total wait max wait avg wait type caller 964508 7.10 s 147.25 ms 7.36 us spinlock dev_hard_start_xmit+0xcd 201 268.05 us 4.65 us 1.33 us spinlock __dev_queue_xmit+0x3a0 12 26.05 us 3.84 us 2.17 us spinlock do_softirq+0x5b @__dev_queue_xmit_ns: [256, 512) 21 | | [512, 1K) 631 | | [1K, 2K) 27328 |@ | [2K, 4K) 265392 |@@@@@@@@@@@@@@@@ | [4K, 8K) 417543 |@@@@@@@@@@@@@@@@@@@@@@@@@@ | [8K, 16K) 826292 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [16K, 32K) 733822 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [32K, 64K) 19055 |@ | [64K, 128K) 17240 |@ | [128K, 256K) 25633 |@ | [256K, 512K) 4 | | After: 29 Mpps (57 Mpps if each thread is pinned to a different cpu) vmstat 2 5 procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- r b swpd free buff cache si so bi bo in cs us sy id wa st 78 0 0 2369573632 32896 1350988 0 0 22 0 331 254 0 8 92 0 0 75 0 0 2369573632 32896 1350988 0 0 22 50 425713 280199 0 23 76 0 0 104 0 0 2369573632 32896 1350988 0 0 290 0 430238 298247 0 23 76 0 0 86 0 0 2369573632 32896 1350988 0 0 132 0 428019 291865 0 24 76 0 0 90 0 0 2369573632 32896 1350988 0 0 502 0 422498 278672 0 23 76 0 0 perf lock record -C0-7 sleep 1; perf lock contention contended total wait max wait avg wait type caller 2524 116.15 ms 486.61 us 46.02 us spinlock __dev_queue_xmit+0x55b 5821 107.18 ms 371.67 us 18.41 us spinlock dev_hard_start_xmit+0xcd 2377 9.73 ms 35.86 us 4.09 us spinlock ___slab_alloc+0x4e0 923 5.74 ms 20.91 us 6.22 us spinlock ___slab_alloc+0x5c9 121 3.42 ms 193.05 us 28.24 us spinlock net_tx_action+0xf8 6 564.33 us 167.60 us 94.05 us spinlock do_softirq+0x5b If netperf threads are pinned (~54 Mpps) perf lock record -C0-7 sleep 1; perf lock contention 32907 316.98 ms 195.98 us 9.63 us spinlock dev_hard_start_xmit+0xcd 4507 61.83 ms 212.73 us 13.72 us spinlock __dev_queue_xmit+0x554 2781 23.53 ms 40.03 us 8.46 us spinlock ___slab_alloc+0x5c9 3554 18.94 ms 
34.69 us 5.33 us spinlock ___slab_alloc+0x4e0 233 9.09 ms 215.70 us 38.99 us spinlock do_softirq+0x5b 153 930.66 us 48.67 us 6.08 us spinlock net_tx_action+0xfd 84 331.10 us 14.22 us 3.94 us spinlock ___slab_alloc+0x5c9 140 323.71 us 9.94 us 2.31 us spinlock ___slab_alloc+0x4e0 @__dev_queue_xmit_ns: [128, 256) 1539830 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | [256, 512) 2299558 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@| [512, 1K) 483936 |@@@@@@@@@@ | [1K, 2K) 265345 |@@@@@@ | [2K, 4K) 145463 |@@@ | [4K, 8K) 54571 |@ | [8K, 16K) 10270 | | [16K, 32K) 9385 | | [32K, 64K) 7749 | | [64K, 128K) 26799 | | [128K, 256K) 2665 | | [256K, 512K) 665 | | Signed-off-by: Eric Dumazet Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Kuniyuki Iwashima Tested-by: Jamal Hadi Salim Acked-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251014171907.3554413-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 31561291bc92..94966692ccdf 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -115,7 +115,9 @@ struct Qdisc { struct Qdisc *next_sched; struct sk_buff_head skb_bad_txq; - spinlock_t busylock ____cacheline_aligned_in_smp; + atomic_long_t defer_count ____cacheline_aligned_in_smp; + struct llist_head defer_list; + spinlock_t seqlock; struct rcu_head rcu; -- cgit v1.2.3 From f968a24cad3da72fdff12a0ae5ac0b679439cca1 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Fri, 3 Oct 2025 12:16:38 +0900 Subject: can: treewide: remove can_change_mtu() can_change_mtu() became obsolete by commit 23049938605b ("can: populate the minimum and maximum MTU values"). Now that net_device->min_mtu and net_device->max_mtu are populated, all the checks are already done by dev_validate_mtu() in net/core/dev.c. 
Remove the net_device_ops->ndo_change_mtu() callback of all the physical interfaces, then remove can_change_mtu(). Only keep the vcan_change_mtu() and vxcan_change_mtu() because the virtual interfaces use their own different MTU logic. The only functional change this patch introduces is that now the user will be able to change the MTU even if the interface is up. This does not matter for Classical CAN and CAN FD because their MTU range is composed of only one value, respectively CAN_MTU and CANFD_MTU. For the upcoming CAN XL, the MTU will be configurable within the CANXL_MIN_MTU to CANXL_MAX_MTU range at any time, even if the interface is up. This is consistent with the other net protocols and does not contradict ISO 11898-1:2024 as having a modifiable MTU is a kernel extension. Signed-off-by: Vincent Mailhol Link: https://patch.msgid.link/20251003-remove-can_change_mtu-v1-1-337f8bc21181@kernel.org Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index a2229a61ccde..0fe8f80f223e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -127,7 +127,6 @@ struct can_priv *safe_candev_priv(struct net_device *dev); int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); -int can_change_mtu(struct net_device *dev, int new_mtu); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); -- cgit v1.2.3 From 6c4fed5fee42f5785e881ef2c28359724b18b80e Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Mon, 15 Sep 2025 19:00:25 +0530 Subject: crypto: drbg - Export CTR DRBG DF functions Export the drbg_ctr_df() derivation function to the new df_sp80090a module.
Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- include/crypto/df_sp80090a.h | 27 +++++++++++++++++++++ include/crypto/drbg.h | 25 +------------------ include/crypto/internal/drbg.h | 54 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 24 deletions(-) create mode 100644 include/crypto/df_sp80090a.h create mode 100644 include/crypto/internal/drbg.h (limited to 'include') diff --git a/include/crypto/df_sp80090a.h b/include/crypto/df_sp80090a.h new file mode 100644 index 000000000000..182865538662 --- /dev/null +++ b/include/crypto/df_sp80090a.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright Stephan Mueller , 2014 + */ + +#ifndef _CRYPTO_DF80090A_H +#define _CRYPTO_DF80090A_H + +#include + +static inline int crypto_drbg_ctr_df_datalen(u8 statelen, u8 blocklen) +{ + return statelen + /* df_data */ + blocklen + /* pad */ + blocklen + /* iv */ + statelen + blocklen; /* temp */ +} + +int crypto_drbg_ctr_df(struct crypto_cipher *tfm, + unsigned char *df_data, + size_t bytes_to_return, + struct list_head *seedlist, + u8 blocklen_bytes, + u8 statelen); + +#endif /* _CRYPTO_DF80090A_H */ diff --git a/include/crypto/drbg.h b/include/crypto/drbg.h index af5ad51d3eef..2d42518cbdce 100644 --- a/include/crypto/drbg.h +++ b/include/crypto/drbg.h @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -54,30 +55,6 @@ #include #include -/* - * Concatenation Helper and string operation helper - * - * SP800-90A requires the concatenation of different data. To avoid copying - * buffers around or allocate additional memory, the following data structure - * is used to point to the original memory with its size. In addition, it - * is used to build a linked list. The linked list defines the concatenation - * of individual buffers. The order of memory block referenced in that - * linked list determines the order of concatenation. 
- */ -struct drbg_string { - const unsigned char *buf; - size_t len; - struct list_head list; -}; - -static inline void drbg_string_fill(struct drbg_string *string, - const unsigned char *buf, size_t len) -{ - string->buf = buf; - string->len = len; - INIT_LIST_HEAD(&string->list); -} - struct drbg_state; typedef uint32_t drbg_flag_t; diff --git a/include/crypto/internal/drbg.h b/include/crypto/internal/drbg.h new file mode 100644 index 000000000000..371e52dcee6c --- /dev/null +++ b/include/crypto/internal/drbg.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * NIST SP800-90A DRBG derivation function + * + * Copyright (C) 2014, Stephan Mueller + */ + +#ifndef _INTERNAL_DRBG_H +#define _INTERNAL_DRBG_H + +/* + * Convert an integer into a byte representation of this integer. + * The byte representation is big-endian + * + * @val value to be converted + * @buf buffer holding the converted integer -- caller must ensure that + * buffer size is at least 32 bit + */ +static inline void drbg_cpu_to_be32(__u32 val, unsigned char *buf) +{ + struct s { + __be32 conv; + }; + struct s *conversion = (struct s *)buf; + + conversion->conv = cpu_to_be32(val); +} + +/* + * Concatenation Helper and string operation helper + * + * SP800-90A requires the concatenation of different data. To avoid copying + * buffers around or allocate additional memory, the following data structure + * is used to point to the original memory with its size. In addition, it + * is used to build a linked list. The linked list defines the concatenation + * of individual buffers. The order of memory block referenced in that + * linked list determines the order of concatenation. 
+ */ +struct drbg_string { + const unsigned char *buf; + size_t len; + struct list_head list; +}; + +static inline void drbg_string_fill(struct drbg_string *string, + const unsigned char *buf, size_t len) +{ + string->buf = buf; + string->len = len; + INIT_LIST_HEAD(&string->list); +} + +#endif //_INTERNAL_DRBG_H -- cgit v1.2.3 From ba0570bdf1d9956a63db2ddc50fa6a78d8c93f30 Mon Sep 17 00:00:00 2001 From: Harsh Jain Date: Mon, 15 Sep 2025 19:00:26 +0530 Subject: crypto: drbg - Replace AES cipher calls with library calls Replace aes used in drbg with library calls. Signed-off-by: Harsh Jain Signed-off-by: Herbert Xu --- include/crypto/df_sp80090a.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/crypto/df_sp80090a.h b/include/crypto/df_sp80090a.h index 182865538662..6b25305fe611 100644 --- a/include/crypto/df_sp80090a.h +++ b/include/crypto/df_sp80090a.h @@ -8,6 +8,7 @@ #define _CRYPTO_DF80090A_H #include +#include static inline int crypto_drbg_ctr_df_datalen(u8 statelen, u8 blocklen) { @@ -17,7 +18,7 @@ static inline int crypto_drbg_ctr_df_datalen(u8 statelen, u8 blocklen) statelen + blocklen; /* temp */ } -int crypto_drbg_ctr_df(struct crypto_cipher *tfm, +int crypto_drbg_ctr_df(struct crypto_aes_ctx *aes, unsigned char *df_data, size_t bytes_to_return, struct list_head *seedlist, -- cgit v1.2.3 From 3662b54c16924b03197ec80f9764aabdf2c90231 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 15 Oct 2025 10:53:24 +0300 Subject: media: v4l2-mem2mem: Document that v4l2_m2m_get_vq() never returns NULL The v4l2_m2m_get_vq() never returns a NULL pointer, as the internal get_queue_ctx() helper always returns a non-NULL pointer. Many drivers check the return value against NULL, due to a combination of old code and cargo-cult programming. Even v4l2-mem2mem.c contains unneeded NULL checks. 
Clarify the API by documenting explicitly that a NULL check is not needed, and simplify the code by removing the unneeded NULL checks from v4l2-mem2mem.c. Signed-off-by: Laurent Pinchart Reviewed-by: Stefan Klug Signed-off-by: Hans Verkuil --- include/media/v4l2-mem2mem.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h index 500f81f399df..c82445929c68 100644 --- a/include/media/v4l2-mem2mem.h +++ b/include/media/v4l2-mem2mem.h @@ -153,6 +153,9 @@ void *v4l2_m2m_get_curr_priv(struct v4l2_m2m_dev *m2m_dev); * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type + * + * This function returns the capture queue when @type is a capture type, and the + * output queue otherwise. It never returns a NULL pointer. */ struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); -- cgit v1.2.3 From 1fdb55ed40fa5ebe6934bd6b93036c714ebb5ef8 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Wed, 15 Oct 2025 13:01:16 +0300 Subject: media: v4l2-mem2mem: Don't copy frame flags in v4l2_m2m_buf_copy_metadata() The v4l2_m2m_buf_copy_metadata() function takes a boolean copy_frame_flags argument. When true, it causes the function to copy the V4L2_BUF_FLAG_KEYFRAME, V4L2_BUF_FLAG_BFRAME and V4L2_BUF_FLAG_PFRAME flags from the output buffer to the capture buffer. There are no use cases in any upstream driver for copying the flags. KEY/P/B frames are properties of the bitstream buffer in some formats. Once decoded, this is no longer a property of the video frame and should be discarded. It was considered useful to know if an uncompressed frame was decoded from a KEY/P/B compressed frame, and to preserve that information if that same uncompressed frame was passed through another M2M device (e.g. a scaler).
However, the V4L2 documentation makes it clear that the flags are meant for compressed frames only. Drop the copy_frame_flags argument from v4l2_m2m_buf_copy_metadata(). The change to drivers was performed with the following Coccinelle semantic patch: @@ expression src; expression dst; expression flag; @@ - v4l2_m2m_buf_copy_metadata(src, dst, flag); + v4l2_m2m_buf_copy_metadata(src, dst); include/media/v4l2-mem2mem.h and drivers/media/v4l2-core/v4l2-mem2mem.c have been updated manually. Signed-off-by: Laurent Pinchart Reviewed-by: Philipp Zabel Reviewed-by: Benjamin Gaignard Signed-off-by: Hans Verkuil --- include/media/v4l2-mem2mem.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-mem2mem.h b/include/media/v4l2-mem2mem.h index c82445929c68..bf6a09a04dcf 100644 --- a/include/media/v4l2-mem2mem.h +++ b/include/media/v4l2-mem2mem.h @@ -845,19 +845,13 @@ v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx) * * @out_vb: the output buffer that is the source of the metadata. * @cap_vb: the capture buffer that will receive the metadata. - * @copy_frame_flags: copy the KEY/B/PFRAME flags as well. * * This helper function copies the timestamp, timecode (if the TIMECODE - * buffer flag was set), field and the TIMECODE, KEYFRAME, BFRAME, PFRAME - * and TSTAMP_SRC_MASK flags from @out_vb to @cap_vb. - * - * If @copy_frame_flags is false, then the KEYFRAME, BFRAME and PFRAME - * flags are not copied. This is typically needed for encoders that - * set this bits explicitly. + * buffer flag was set), field, and the TIMECODE and TSTAMP_SRC_MASK flags from + * @out_vb to @cap_vb. 
*/ void v4l2_m2m_buf_copy_metadata(const struct vb2_v4l2_buffer *out_vb, - struct vb2_v4l2_buffer *cap_vb, - bool copy_frame_flags); + struct vb2_v4l2_buffer *cap_vb); /* v4l2 request helper */ -- cgit v1.2.3 From 0d30dae38fe01cd1de358c6039a0b1184689fe51 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Fri, 10 Oct 2025 13:52:54 +0800 Subject: HID: intel-ish-hid: Use dedicated unbound workqueues to prevent resume blocking During suspend/resume tests with S2IDLE, some ISH functional failures were observed because of a delay in executing the ISH resume handler. Here schedule_work() is used from the resume handler to do the actual work. schedule_work() uses system_wq, which is a per-CPU work queue. Although the queuing is not bound to a CPU, it prefers the local CPU of the caller, unless prohibited. Users of this work queue are not supposed to queue long running work. But in practice, there are scenarios where long running work items are queued on other unbound workqueues, occupying the CPU. As a result, the ISH resume handler may not get a chance to execute in a timely manner. In one scenario, one of the ish_resume_handler() executions was delayed nearly 1 second because another work item on an unbound workqueue occupied the same CPU. This delay causes ISH functionality failures. A similar issue was previously observed where the ISH HID driver timed out while getting the HID descriptor during S4 resume in the recovery kernel, likely caused by the same workqueue contention problem. Create dedicated unbound workqueues for all ISH operations to allow work items to execute on any available CPU, eliminating CPU-specific bottlenecks and improving resume reliability under varying system loads. Also ISH has three different components, a bus driver which implements ISH protocols, a PCI interface layer and HID interface. Use one dedicated work queue for all of them.
Signed-off-by: Zhang Lixu Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index dfbf7d9d7bb5..b235fd84f478 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -87,6 +87,8 @@ bool ishtp_wait_resume(struct ishtp_device *dev); ishtp_print_log ishtp_trace_callback(struct ishtp_cl_device *cl_device); /* Get device pointer of PCI device for DMA acces */ struct device *ishtp_get_pci_device(struct ishtp_cl_device *cl_device); +/* Get the ISHTP workqueue */ +struct workqueue_struct *ishtp_get_workqueue(struct ishtp_cl_device *cl_device); struct ishtp_cl *ishtp_cl_allocate(struct ishtp_cl_device *cl_device); void ishtp_cl_free(struct ishtp_cl *cl); -- cgit v1.2.3 From 011aa2aa2c4c2b3356c32f195f306df6e177ac38 Mon Sep 17 00:00:00 2001 From: Zhang Lixu Date: Fri, 17 Oct 2025 10:22:13 +0800 Subject: HID: intel-ish-hid: Add ishtp_get_connection_state() interface Add the ishtp_get_connection_state() function for struct ishtp_cl, allowing ishtp client drivers to retrieve the current connection state. 
Signed-off-by: Zhang Lixu Acked-by: Srinivas Pandruvada Signed-off-by: Jiri Kosina --- include/linux/intel-ish-client-if.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/intel-ish-client-if.h b/include/linux/intel-ish-client-if.h index b235fd84f478..2cd4f65aaa37 100644 --- a/include/linux/intel-ish-client-if.h +++ b/include/linux/intel-ish-client-if.h @@ -109,6 +109,7 @@ struct ishtp_device *ishtp_get_ishtp_device(struct ishtp_cl *cl); void ishtp_set_tx_ring_size(struct ishtp_cl *cl, int size); void ishtp_set_rx_ring_size(struct ishtp_cl *cl, int size); void ishtp_set_connection_state(struct ishtp_cl *cl, int state); +int ishtp_get_connection_state(struct ishtp_cl *cl); void ishtp_cl_set_fw_client_id(struct ishtp_cl *cl, int fw_client_id); void ishtp_put_device(struct ishtp_cl_device *cl_dev); -- cgit v1.2.3 From e4c4f5a1ae18a7828c2bfaf9dfe2473632b92d1b Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Fri, 3 Oct 2025 20:14:38 +0200 Subject: dt-bindings: clock: qcom,x1e80100-gcc: Add missing USB4 clocks/resets Some of the USB4 muxes, RCGs and resets were not initially described. Add indices for them to allow extending the driver. 
Acked-by: Rob Herring (Arm) Reviewed-by: Bryan O'Donoghue Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20251003-topic-hamoa_gcc_usb4-v2-1-61d27a14ee65@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,x1e80100-gcc.h | 61 +++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,x1e80100-gcc.h b/include/dt-bindings/clock/qcom,x1e80100-gcc.h index 710c340f24a5..62aa12425592 100644 --- a/include/dt-bindings/clock/qcom,x1e80100-gcc.h +++ b/include/dt-bindings/clock/qcom,x1e80100-gcc.h @@ -363,6 +363,30 @@ #define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC 353 #define GCC_USB3_SEC_PHY_PIPE_CLK_SRC 354 #define GCC_USB3_TERT_PHY_PIPE_CLK_SRC 355 +#define GCC_USB34_PRIM_PHY_PIPE_CLK_SRC 356 +#define GCC_USB34_SEC_PHY_PIPE_CLK_SRC 357 +#define GCC_USB34_TERT_PHY_PIPE_CLK_SRC 358 +#define GCC_USB4_0_PHY_DP0_CLK_SRC 359 +#define GCC_USB4_0_PHY_DP1_CLK_SRC 360 +#define GCC_USB4_0_PHY_P2RR2P_PIPE_CLK_SRC 361 +#define GCC_USB4_0_PHY_PCIE_PIPE_MUX_CLK_SRC 362 +#define GCC_USB4_0_PHY_RX0_CLK_SRC 363 +#define GCC_USB4_0_PHY_RX1_CLK_SRC 364 +#define GCC_USB4_0_PHY_SYS_CLK_SRC 365 +#define GCC_USB4_1_PHY_DP0_CLK_SRC 366 +#define GCC_USB4_1_PHY_DP1_CLK_SRC 367 +#define GCC_USB4_1_PHY_P2RR2P_PIPE_CLK_SRC 368 +#define GCC_USB4_1_PHY_PCIE_PIPE_MUX_CLK_SRC 369 +#define GCC_USB4_1_PHY_RX0_CLK_SRC 370 +#define GCC_USB4_1_PHY_RX1_CLK_SRC 371 +#define GCC_USB4_1_PHY_SYS_CLK_SRC 372 +#define GCC_USB4_2_PHY_DP0_CLK_SRC 373 +#define GCC_USB4_2_PHY_DP1_CLK_SRC 374 +#define GCC_USB4_2_PHY_P2RR2P_PIPE_CLK_SRC 375 +#define GCC_USB4_2_PHY_PCIE_PIPE_MUX_CLK_SRC 376 +#define GCC_USB4_2_PHY_RX0_CLK_SRC 377 +#define GCC_USB4_2_PHY_RX1_CLK_SRC 378 +#define GCC_USB4_2_PHY_SYS_CLK_SRC 379 /* GCC power domains */ #define GCC_PCIE_0_TUNNEL_GDSC 0 @@ -484,4 +508,41 @@ #define GCC_VIDEO_BCR 87 #define GCC_VIDEO_AXI0_CLK_ARES 88 #define GCC_VIDEO_AXI1_CLK_ARES 89 +#define GCC_USB4_0_MISC_USB4_SYS_BCR 90 
+#define GCC_USB4_0_MISC_RX_CLK_0_BCR 91 +#define GCC_USB4_0_MISC_RX_CLK_1_BCR 92 +#define GCC_USB4_0_MISC_USB_PIPE_BCR 93 +#define GCC_USB4_0_MISC_PCIE_PIPE_BCR 94 +#define GCC_USB4_0_MISC_TMU_BCR 95 +#define GCC_USB4_0_MISC_SB_IF_BCR 96 +#define GCC_USB4_0_MISC_HIA_MSTR_BCR 97 +#define GCC_USB4_0_MISC_AHB_BCR 98 +#define GCC_USB4_0_MISC_DP0_MAX_PCLK_BCR 99 +#define GCC_USB4_0_MISC_DP1_MAX_PCLK_BCR 100 +#define GCC_USB4_1_MISC_USB4_SYS_BCR 101 +#define GCC_USB4_1_MISC_RX_CLK_0_BCR 102 +#define GCC_USB4_1_MISC_RX_CLK_1_BCR 103 +#define GCC_USB4_1_MISC_USB_PIPE_BCR 104 +#define GCC_USB4_1_MISC_PCIE_PIPE_BCR 105 +#define GCC_USB4_1_MISC_TMU_BCR 106 +#define GCC_USB4_1_MISC_SB_IF_BCR 107 +#define GCC_USB4_1_MISC_HIA_MSTR_BCR 108 +#define GCC_USB4_1_MISC_AHB_BCR 109 +#define GCC_USB4_1_MISC_DP0_MAX_PCLK_BCR 110 +#define GCC_USB4_1_MISC_DP1_MAX_PCLK_BCR 111 +#define GCC_USB4_2_MISC_USB4_SYS_BCR 112 +#define GCC_USB4_2_MISC_RX_CLK_0_BCR 113 +#define GCC_USB4_2_MISC_RX_CLK_1_BCR 114 +#define GCC_USB4_2_MISC_USB_PIPE_BCR 115 +#define GCC_USB4_2_MISC_PCIE_PIPE_BCR 116 +#define GCC_USB4_2_MISC_TMU_BCR 117 +#define GCC_USB4_2_MISC_SB_IF_BCR 118 +#define GCC_USB4_2_MISC_HIA_MSTR_BCR 119 +#define GCC_USB4_2_MISC_AHB_BCR 120 +#define GCC_USB4_2_MISC_DP0_MAX_PCLK_BCR 121 +#define GCC_USB4_2_MISC_DP1_MAX_PCLK_BCR 122 +#define GCC_USB4PHY_PHY_PRIM_BCR 123 +#define GCC_USB4PHY_PHY_SEC_BCR 124 +#define GCC_USB4PHY_PHY_TERT_BCR 125 + #endif -- cgit v1.2.3 From 1c17f4373d4db1e1f0ebd3ddcd8e7a642927a826 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 14 Oct 2025 22:42:07 +0000 Subject: ipv6: Move ipv6_fl_list from ipv6_pinfo to inet_sock. In {tcp6,udp6,raw6}_sock, struct ipv6_pinfo is always placed at the beginning of a new cache line because 1. __alignof__(struct tcp_sock) is 64 due to ____cacheline_aligned of __cacheline_group_begin(tcp_sock_write_tx) 2. __alignof__(struct udp_sock) is 64 due to ____cacheline_aligned of struct numa_drop_counters 3. 
in raw6_sock, struct numa_drop_counters is placed before struct ipv6_pinfo . struct ipv6_pinfo is 136 bytes, but the last cache line is only used by ipv6_fl_list: $ pahole -C ipv6_pinfo vmlinux struct ipv6_pinfo { ... /* --- cacheline 2 boundary (128 bytes) --- */ struct ipv6_fl_socklist * ipv6_fl_list; /* 128 8 */ /* size: 136, cachelines: 3, members: 23 */ Let's move ipv6_fl_list from struct ipv6_pinfo to struct inet_sock to save a full cache line for {tcp6,udp6,raw6}_sock. Now, struct ipv6_pinfo is 128 bytes, and {tcp6,udp6,raw6}_sock have 64 bytes less, while {tcp,udp,raw}_sock retain the same size. Before: # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}' RAWv6 1408 UDPv6 1472 TCPv6 2560 RAW 1152 UDP 1280 TCP 2368 After: # grep -E "^(RAW|UDP[^L\-]|TCP)" /proc/slabinfo | awk '{print $1, "\t", $4}' RAWv6 1344 UDPv6 1408 TCPv6 2496 RAW 1152 UDP 1280 TCP 2368 Also, ipv6_fl_list and inet_flags (SNDFLOW bit) are placed in the same cache line. $ pahole -C inet_sock vmlinux ... /* --- cacheline 11 boundary (704 bytes) was 56 bytes ago --- */ struct ipv6_pinfo * pinet6; /* 760 8 */ /* --- cacheline 12 boundary (768 bytes) --- */ struct ipv6_fl_socklist * ipv6_fl_list; /* 768 8 */ unsigned long inet_flags; /* 776 8 */ Doc churn is due to the insufficient Type column (only 1 space short). 
Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251014224210.2964778-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 - include/net/inet_sock.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 43b7bb828738..7294e4e89b79 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -271,7 +271,6 @@ struct ipv6_pinfo { struct ipv6_mc_socklist __rcu *ipv6_mc_list; struct ipv6_ac_socklist *ipv6_ac_list; - struct ipv6_fl_socklist __rcu *ipv6_fl_list; }; /* We currently use available bits from inet_sk(sk)->inet_flags, diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index 1086256549fa..b6ec08072533 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -214,6 +214,7 @@ struct inet_sock { struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; + struct ipv6_fl_socklist __rcu *ipv6_fl_list; #endif /* Socket demultiplex comparisons on incoming packets. */ #define inet_daddr sk.__sk_common.skc_daddr -- cgit v1.2.3 From 9c4609225ec1cb551006d6a03c7c4ad8cb5584c0 Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Wed, 15 Oct 2025 10:02:34 +0800 Subject: rculist: Add hlist_nulls_replace_rcu() and hlist_nulls_replace_init_rcu() Add two functions to atomically replace RCU-protected hlist_nulls entries. 
Keep using WRITE_ONCE() to assign values to ->next and ->pprev, as mentioned in the patch below: commit efd04f8a8b45 ("rcu: Use WRITE_ONCE() for assignments to ->next for rculist_nulls") commit 860c8802ace1 ("rcu: Use WRITE_ONCE() for assignments to ->pprev for hlist_nulls") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Frederic Weisbecker Reviewed-by: Eric Dumazet Signed-off-by: Xuanqiang Luo Link: https://patch.msgid.link/20251015020236.431822-2-xuanqiang.luo@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/rculist_nulls.h | 59 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 89186c499dd4..c26cb83ca071 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -52,6 +52,13 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n) #define hlist_nulls_next_rcu(node) \ (*((struct hlist_nulls_node __rcu __force **)&(node)->next)) +/** + * hlist_nulls_pprev_rcu - returns the dereferenced pprev of @node. + * @node: element of the list. + */ +#define hlist_nulls_pprev_rcu(node) \ + (*((struct hlist_nulls_node __rcu __force **)(node)->pprev)) + /** * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. @@ -152,6 +159,58 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); } +/** + * hlist_nulls_replace_rcu - replace an old entry by a new one + * @old: the element to be replaced + * @new: the new element to insert + * + * Description: + * Replace the old entry with the new one in a RCU-protected hlist_nulls, while + * permitting racing traversals. 
+ * + * The caller must take whatever precautions are necessary (such as holding + * appropriate locks) to avoid racing with another list-mutation primitive, such + * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same + * list. However, it is perfectly legal to run concurrently with the _rcu + * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu(). + */ +static inline void hlist_nulls_replace_rcu(struct hlist_nulls_node *old, + struct hlist_nulls_node *new) +{ + struct hlist_nulls_node *next = old->next; + + WRITE_ONCE(new->next, next); + WRITE_ONCE(new->pprev, old->pprev); + rcu_assign_pointer(hlist_nulls_pprev_rcu(new), new); + if (!is_a_nulls(next)) + WRITE_ONCE(next->pprev, &new->next); +} + +/** + * hlist_nulls_replace_init_rcu - replace an old entry by a new one and + * initialize the old + * @old: the element to be replaced + * @new: the new element to insert + * + * Description: + * Replace the old entry with the new one in a RCU-protected hlist_nulls, while + * permitting racing traversals, and reinitialize the old entry. + * + * Note: @old must be hashed. + * + * The caller must take whatever precautions are necessary (such as holding + * appropriate locks) to avoid racing with another list-mutation primitive, such + * as hlist_nulls_add_head_rcu() or hlist_nulls_del_rcu(), running on this same + * list. However, it is perfectly legal to run concurrently with the _rcu + * list-traversal primitives, such as hlist_nulls_for_each_entry_rcu(). + */ +static inline void hlist_nulls_replace_init_rcu(struct hlist_nulls_node *old, + struct hlist_nulls_node *new) +{ + hlist_nulls_replace_rcu(old, new); + WRITE_ONCE(old->pprev, NULL); +} + /** * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type * @tpos: the type * to use as a loop cursor. 
-- cgit v1.2.3 From 1532ed0d0753c83e72595f785f82b48c28bbe5dc Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Wed, 15 Oct 2025 10:02:35 +0800 Subject: inet: Avoid ehash lookup race in inet_ehash_insert() Since ehash lookups are lockless, if one CPU performs a lookup while another concurrently deletes and inserts (removing reqsk and inserting sk), the lookup may fail to find the socket and an RST may be sent. The call trace map is drawn as follows: CPU 0 CPU 1 ----- ----- inet_ehash_insert() spin_lock() sk_nulls_del_node_init_rcu(osk) __inet_lookup_established() (lookup failed) __sk_nulls_add_node_rcu(sk, list) spin_unlock() As both deletion and insertion operate on the same ehash chain, this patch introduces a new sk_nulls_replace_node_init_rcu() helper function to implement atomic replacement. Fixes: 5e0724d027f0 ("tcp/dccp: fix hashdance race for passive sessions") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Jiayuan Chen Signed-off-by: Xuanqiang Luo Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251015020236.431822-3-xuanqiang.luo@linux.dev Signed-off-by: Jakub Kicinski --- include/net/sock.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 30ac2eb4ef9b..335d0da82d79 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -856,6 +856,19 @@ static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) return rc; } +static inline bool sk_nulls_replace_node_init_rcu(struct sock *old, + struct sock *new) +{ + if (sk_hashed(old)) { + hlist_nulls_replace_init_rcu(&old->sk_nulls_node, + &new->sk_nulls_node); + __sock_put(old); + return true; + } + + return false; +} + static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); -- cgit v1.2.3 From 37a183d3b7cdb873e7f5f9daef1ad6d8f7c95fb7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 14 Oct 2025 14:58:36 -0700 Subject: tcp: Convert tcp-md5 to use MD5 library 
instead of crypto_ahash Make tcp-md5 use the MD5 library API (added in 6.18) instead of the crypto_ahash API. This is much simpler and also more efficient: - The library API just operates on struct md5_ctx. Just allocate this struct on the stack instead of using a pool of pre-allocated crypto_ahash and ahash_request objects. - The library API accepts standard pointers and doesn't require scatterlists. So, for hashing the headers just use an on-stack buffer instead of a pool of pre-allocated kmalloc'ed scratch buffers. - The library API never fails. Therefore, checking for MD5 hashing errors is no longer necessary. Update tcp_v4_md5_hash_skb(), tcp_v6_md5_hash_skb(), tcp_v4_md5_hash_hdr(), tcp_v6_md5_hash_hdr(), tcp_md5_hash_key(), tcp_sock_af_ops::calc_md5_hash, and tcp_request_sock_ops::calc_md5_hash to return void instead of int. - The library API provides direct access to the MD5 code, eliminating unnecessary overhead such as indirect function calls and scatterlist management. Microbenchmarks of tcp_v4_md5_hash_skb() on x86_64 show a speedup from 7518 to 7041 cycles (6% fewer) with skb->len == 1440, or from 1020 to 678 cycles (33% fewer) with skb->len == 140. Since tcp_sigpool_hash_skb_data() can no longer be used, add a function tcp_md5_hash_skb_data() which is specialized to MD5. Of course, to the extent that this duplicates any code, it's well worth it. To preserve the existing behavior of TCP-MD5 support being disabled when the kernel is booted with "fips=1", make tcp_md5_do_add() check fips_enabled itself. Previously it relied on the error from crypto_alloc_ahash("md5") being bubbled up. I don't know for sure that this is actually needed, but this preserves the existing behavior. Tested with bidirectional TCP-MD5, both IPv4 and IPv6, between a kernel that includes this commit and a kernel that doesn't include this commit. (Side note: please don't use TCP-MD5! It's cryptographically weak. 
But as long as Linux supports it, it might as well be implemented properly.) Signed-off-by: Eric Biggers Link: https://patch.msgid.link/20251014215836.115616-1-ebiggers@kernel.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 1e547138f4fb..67fdd2523d92 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1898,13 +1898,6 @@ struct tcp6_pseudohdr { __be32 protocol; /* including padding */ }; -union tcp_md5sum_block { - struct tcp4_pseudohdr ip4; -#if IS_ENABLED(CONFIG_IPV6) - struct tcp6_pseudohdr ip6; -#endif -}; - /* * struct tcp_sigpool - per-CPU pool of ahash_requests * @scratch: per-CPU temporary area, that can be used between @@ -1939,8 +1932,8 @@ int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c); void tcp_sigpool_end(struct tcp_sigpool *c); size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len); /* - functions */ -int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, - const struct sock *sk, const struct sk_buff *skb); +void tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen); @@ -1999,13 +1992,10 @@ static inline void tcp_md5_destruct_sock(struct sock *sk) } #endif -int tcp_md5_alloc_sigpool(void); -void tcp_md5_release_sigpool(void); -void tcp_md5_add_sigpool(void); -extern int tcp_md5_sigpool_id; - -int tcp_md5_hash_key(struct tcp_sigpool *hp, - const struct tcp_md5sig_key *key); +struct md5_ctx; +void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, + unsigned int header_len); +void tcp_md5_hash_key(struct md5_ctx *ctx, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void 
tcp_fastopen_cache_get(struct sock *sk, u16 *mss, @@ -2355,7 +2345,7 @@ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); - int (*calc_md5_hash)(char *location, + void (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); @@ -2383,7 +2373,7 @@ struct tcp_request_sock_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); - int (*calc_md5_hash) (char *location, + void (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); -- cgit v1.2.3 From d52bb3daad3f28403676dff31fa0577bdaf8e7c6 Mon Sep 17 00:00:00 2001 From: Takashi Sakamoto Date: Sat, 18 Oct 2025 12:55:31 +0900 Subject: firewire: core: handle device quirk of TASCAM FW-1884/FW-1804/FW-1082 TASCAM FW-1884/FW-1804/FW-1082 is too lazy to respond to asynchronous requests at S400. The asynchronous transaction often results in a timeout. This is a problematic quirk. This commit adds support for the quirk. When the new quirk flag is identified, the transaction speed is configured at S200. Link: https://lore.kernel.org/r/20251018035532.287124-4-o-takashi@sakamocchi.jp Signed-off-by: Takashi Sakamoto --- include/linux/firewire.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/firewire.h b/include/linux/firewire.h index f1d8734c0ec6..6143b7d28eac 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -179,6 +179,9 @@ enum fw_device_quirk { // MOTU Audio Express transfers acknowledge packet with 0x10 for pending state. FW_DEVICE_QUIRK_ACK_PACKET_WITH_INVALID_PENDING_CODE = BIT(2), + + // TASCAM FW-1082/FW-1804/FW-1884 often freezes when receiving S400 packets. 
+ FW_DEVICE_QUIRK_UNSTABLE_AT_S400 = BIT(3), }; enum fw_device_state { -- cgit v1.2.3 From b57100a3d9ced8c2b78e87d313f514a3338d016e Mon Sep 17 00:00:00 2001 From: Malaya Kumar Rout Date: Tue, 14 Oct 2025 01:00:27 +0530 Subject: PM: console: Fix memory allocation error handling in pm_vt_switch_required() The pm_vt_switch_required() function fails silently when memory allocation fails, offering no indication to callers that the operation was unsuccessful. This behavior prevents drivers from handling allocation errors correctly or implementing retry mechanisms. By ensuring that failures are reported back to the caller, drivers can make informed decisions, improve robustness, and avoid unexpected behavior during critical power management operations. Change the function signature to return an integer error code and modify the implementation to return -ENOMEM when kmalloc() fails. Update both the function declaration and the inline stub in include/linux/pm.h to maintain consistency across CONFIG_VT_CONSOLE_SLEEP configurations. The function now returns: - 0 on success (including when updating existing entries) - -ENOMEM when memory allocation fails This change improves error reporting without breaking existing callers, as the current callers in drivers/video/fbdev/core/fbmem.c already ignore the return value, making this a backward-compatible improvement. Reviewed-by: Lyude Paul Signed-off-by: Malaya Kumar Rout Reviewed-by: Dhruva Gole Reviewed-by: Lyude Paul Link: https://patch.msgid.link/20251013193028.89570-1-mrout@redhat.com Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..a72e42eec130 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -25,11 +25,12 @@ extern void (*pm_power_off)(void); struct device; /* we have a circular dep with device.h */ #ifdef CONFIG_VT_CONSOLE_SLEEP -extern void pm_vt_switch_required(struct device *dev, bool required); +extern int pm_vt_switch_required(struct device *dev, bool required); extern void pm_vt_switch_unregister(struct device *dev); #else -static inline void pm_vt_switch_required(struct device *dev, bool required) +static inline int pm_vt_switch_required(struct device *dev, bool required) { + return 0; } static inline void pm_vt_switch_unregister(struct device *dev) { -- cgit v1.2.3 From 8b9cd112f1ac8d72244b189654e693012ea8dfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Thu, 9 Oct 2025 10:31:27 +0100 Subject: soc: samsung: gs101-pmu: implement access tables for read and write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accessing non-existent PMU registers causes an SError, halting the system. Implement read and write access tables for the gs101-PMU to specify which registers are read- and/or writable to avoid that SError. 
Reviewed-by: Sam Protsenko Signed-off-by: André Draszik Link: https://patch.msgid.link/20251009-gs101-pmu-regmap-tables-v2-3-2d64f5261952@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/soc/samsung/exynos-regs-pmu.h | 343 +++++++++++++++++++++++++++- 1 file changed, 335 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 71e0c09a49eb..532c6c2d1195 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -672,14 +672,341 @@ /* For Tensor GS101 */ /* PMU ALIVE */ -#define GS101_SYSIP_DAT0 (0x810) -#define GS101_CPU0_INFORM (0x860) -#define GS101_CPU_INFORM(cpu) \ - (GS101_CPU0_INFORM + (cpu*4)) -#define GS101_SYSTEM_CONFIGURATION (0x3A00) -#define GS101_EINT_WAKEUP_MASK (0x3A80) -#define GS101_PHY_CTRL_USB20 (0x3EB0) -#define GS101_PHY_CTRL_USBDP (0x3EB4) +#define GS101_OM_STAT 0x0000 +#define GS101_VERSION 0x0004 +#define GS101_PORESET_CHECK 0x0008 +#define GS101_OTP_STATUS 0x000c +#define GS101_SYSTEM_INFO 0x0010 +#define GS101_IDLE_IP(n) (0x03e0 + ((n) & 3) * 4) +#define GS101_IDLE_IP_MASK(n) (0x03f0 + ((n) & 3) * 4) +#define GS101_SLC_CH_OFFSET(ch) (0x0400 + ((ch) & 3) * 0x10) +#define GS101_DATARAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x00) +#define GS101_TAGRAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x04) +#define GS101_LRURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x08) +#define GS101_PPMPURAM_STATE_SLC_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x0c) +#define GS101_DATARAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x40) +#define GS101_TAGRAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x44) +#define GS101_LRURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x48) +#define GS101_PPMPURAM_INFORM_SCL_CH(ch) (GS101_SLC_CH_OFFSET(ch) + 0x4c) +#define GS101_INFORM0 0x0800 +#define GS101_INFORM1 0x0804 +#define GS101_INFORM2 0x0808 +#define GS101_INFORM3 0x080c +#define 
GS101_SYSIP_DAT(n) (0x0810 + ((n) & 3) * 4) +#define GS101_PWR_HOLD_HW_TRIP 0x0820 +#define GS101_PWR_HOLD_SW_TRIP 0x0824 +#define GS101_GSA_INFORM(n) (0x0830 + ((n) & 1) * 4) +#define GS101_INFORM4 0x0840 +#define GS101_INFORM5 0x0844 +#define GS101_INFORM6 0x0848 +#define GS101_INFORM7 0x084c +#define GS101_INFORM8 0x0850 +#define GS101_INFORM9 0x0854 +#define GS101_INFORM10 0x0858 +#define GS101_INFORM11 0x085c +#define GS101_CPU_INFORM(cpu) (0x0860 + ((cpu) & 7) * 4) +#define GS101_IROM_INFORM 0x0880 +#define GS101_IROM_CPU_INFORM(cpu) (0x0890 + ((cpu) & 7) * 4) +#define GS101_PMU_SPARE(n) (0x0900 + ((n) & 3) * 4) +#define GS101_IROM_DATA_REG(n) (0x0980 + ((n) & 3) * 4) +#define GS101_IROM_PWRMODE 0x0990 +#define GS101_DREX_CALIBRATION(n) (0x09a0 + ((n) & 7) * 4) + +#define GS101_CLUSTER0_OFFSET 0x1000 +#define GS101_CLUSTER1_OFFSET 0x1300 +#define GS101_CLUSTER2_OFFSET 0x1500 +#define GS101_CLUSTER_CPU_OFFSET(cl, cpu) ((cl) + ((cpu) * 0x80)) +#define GS101_CLUSTER_CPU_CONFIGURATION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x00) +#define GS101_CLUSTER_CPU_STATUS(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x04) +#define GS101_CLUSTER_CPU_STATES(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x08) +#define GS101_CLUSTER_CPU_OPTION(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x0c) +#define GS101_CLUSTER_CPU_OUT(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x20) +#define GS101_CLUSTER_CPU_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x24) +#define GS101_CLUSTER_CPU_INT_IN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x40) +#define GS101_CLUSTER_CPU_INT_EN(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x44) +#define GS101_CLUSTER_CPU_INT_TYPE(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x48) +#define GS101_CLUSTER_CPU_INT_DIR(cl, cpu) \ + (GS101_CLUSTER_CPU_OFFSET(cl, cpu) + 0x4c) + +#define GS101_CLUSTER_NONCPU_OFFSET(cl) (0x1200 + ((cl) * 0x200)) +#define GS101_CLUSTER_NONCPU_CONFIGURATION(cl) \ + 
(GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x00) +#define GS101_CLUSTER_NONCPU_STATUS(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x04) +#define GS101_CLUSTER_NONCPU_STATES(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x08) +#define GS101_CLUSTER_NONCPU_OPTION(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x0c) +#define GS101_CLUSTER_NONCPU_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x20) +#define GS101_CLUSTER_NONCPU_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x24) +#define GS101_CLUSTER_NONCPU_INT_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x40) +#define GS101_CLUSTER_NONCPU_INT_EN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x44) +#define GS101_CLUSTER_NONCPU_INT_TYPE(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x48) +#define GS101_CLUSTER_NONCPU_INT_DIR(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x4c) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x60) +#define GS101_CLUSTER_NONCPU_DUALRAIL_POS_OUT(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x64) +#define GS101_CLUSTER_NONCPU_DUALRAIL_CTRL_IN(cl) \ + (GS101_CLUSTER_NONCPU_OFFSET(cl) + 0x6c) +#define GS101_CLUSTER0_NONCPU_DSU_PCH \ + (GS101_CLUSTER_NONCPU_OFFSET(0) + 0x80) + +#define GS101_SUBBBLK_OFFSET_ALIVE 0x1800 +#define GS101_SUBBBLK_OFFSET_AOC 0x1880 +#define GS101_SUBBBLK_OFFSET_APM 0x1900 +#define GS101_SUBBBLK_OFFSET_CMU 0x1980 +#define GS101_SUBBBLK_OFFSET_BUS0 0x1a00 +#define GS101_SUBBBLK_OFFSET_BUS1 0x1a80 +#define GS101_SUBBBLK_OFFSET_BUS2 0x1b00 +#define GS101_SUBBBLK_OFFSET_CORE 0x1b80 +#define GS101_SUBBBLK_OFFSET_EH 0x1c00 +#define GS101_SUBBBLK_OFFSET_CPUCL0 0x1c80 +#define GS101_SUBBBLK_OFFSET_CPUCL1 0x1d00 +#define GS101_SUBBBLK_OFFSET_CPUCL2 0x1d80 +#define GS101_SUBBBLK_OFFSET_G3D 0x1e00 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_CPUCL0 0x1e80 +#define GS101_SUBBBLK_OFFSET_EMBEDDED_G3D 0x2000 +#define GS101_SUBBBLK_OFFSET_HSI0 0x2080 +#define GS101_SUBBBLK_OFFSET_HSI1 0x2100 +#define GS101_SUBBBLK_OFFSET_HSI2 0x2180 +#define GS101_SUBBBLK_OFFSET_DPU 
0x2200 +#define GS101_SUBBBLK_OFFSET_DISP 0x2280 +#define GS101_SUBBBLK_OFFSET_G2D 0x2300 +#define GS101_SUBBBLK_OFFSET_MFC 0x2380 +#define GS101_SUBBBLK_OFFSET_CSIS 0x2400 +#define GS101_SUBBBLK_OFFSET_PDP 0x2480 +#define GS101_SUBBBLK_OFFSET_DNS 0x2500 +#define GS101_SUBBBLK_OFFSET_G3AA 0x2580 +#define GS101_SUBBBLK_OFFSET_IPP 0x2600 +#define GS101_SUBBBLK_OFFSET_ITP 0x2680 +#define GS101_SUBBBLK_OFFSET_MCSC 0x2700 +#define GS101_SUBBBLK_OFFSET_GDC 0x2780 +#define GS101_SUBBBLK_OFFSET_TNR 0x2800 +#define GS101_SUBBBLK_OFFSET_BO 0x2880 +#define GS101_SUBBBLK_OFFSET_TPU 0x2900 +#define GS101_SUBBBLK_OFFSET_MIF0 0x2980 +#define GS101_SUBBBLK_OFFSET_MIF1 0x2a00 +#define GS101_SUBBBLK_OFFSET_MIF2 0x2a80 +#define GS101_SUBBBLK_OFFSET_MIF3 0x2b00 +#define GS101_SUBBBLK_OFFSET_MISC 0x2b80 +#define GS101_SUBBBLK_OFFSET_PERIC0 0x2c00 +#define GS101_SUBBBLK_OFFSET_PERIC1 0x2c80 +#define GS101_SUBBBLK_OFFSET_S2D 0x2d00 +#define GS101_SUBBLK_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CTRL(blk) ((blk) + 0x10) +#define GS101_SUBBLK_OUT(blk) ((blk) + 0x20) +#define GS101_SUBBLK_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_INT_DIR(blk) ((blk) + 0x4c) +#define GS101_SUBBLK_MEMORY_OUT(blk) ((blk) + 0x60) +#define GS101_SUBBLK_MEMORY_IN(blk) ((blk) + 0x64) + +#define GS101_SUBBBLK_CPU_OFFSET_APM 0x3000 +#define GS101_SUBBBLK_CPU_OFFSET_DBGCORE 0x3080 +#define GS101_SUBBBLK_CPU_OFFSET_SSS 0x3100 +#define GS101_SUBBLK_CPU_CONFIGURATION(blk) ((blk) + 0x00) +#define GS101_SUBBLK_CPU_STATUS(blk) ((blk) + 0x04) +#define GS101_SUBBLK_CPU_STATES(blk) ((blk) + 0x08) +#define GS101_SUBBLK_CPU_OPTION(blk) ((blk) + 0x0c) +#define GS101_SUBBLK_CPU_OUT(blk) ((blk) + 0x20) +#define 
GS101_SUBBLK_CPU_IN(blk) ((blk) + 0x24) +#define GS101_SUBBLK_CPU_INT_IN(blk) ((blk) + 0x40) +#define GS101_SUBBLK_CPU_INT_EN(blk) ((blk) + 0x44) +#define GS101_SUBBLK_CPU_INT_TYPE(blk) ((blk) + 0x48) +#define GS101_SUBBLK_CPU_INT_DIR(blk) ((blk) + 0x4c) + +#define GS101_MIF_CONFIGURATION 0x3800 +#define GS101_MIF_STATUS 0x3804 +#define GS101_MIF_STATES 0x3808 +#define GS101_MIF_OPTION 0x380c +#define GS101_MIF_CTRL 0x3810 +#define GS101_MIF_OUT 0x3820 +#define GS101_MIF_IN 0x3824 +#define GS101_MIF_INT_IN 0x3840 +#define GS101_MIF_INT_EN 0x3844 +#define GS101_MIF_INT_TYPE 0x3848 +#define GS101_MIF_INT_DIR 0x384c +#define GS101_TOP_CONFIGURATION 0x3900 +#define GS101_TOP_STATUS 0x3904 +#define GS101_TOP_STATES 0x3908 +#define GS101_TOP_OPTION 0x390c +#define GS101_TOP_OUT 0x3920 +#define GS101_TOP_IN 0x3924 +#define GS101_TOP_INT_IN 0x3940 +#define GS101_TOP_INT_EN 0x3944 +#define GS101_TOP_INT_TYPE 0x3948 +#define GS101_TOP_INT_DIR 0x394c +#define GS101_WAKEUP_STAT 0x3950 +#define GS101_WAKEUP2_STAT 0x3954 +#define GS101_WAKEUP2_INT_IN 0x3960 +#define GS101_WAKEUP2_INT_EN 0x3964 +#define GS101_WAKEUP2_INT_TYPE 0x3968 +#define GS101_WAKEUP2_INT_DIR 0x396c +#define GS101_SYSTEM_CONFIGURATION 0x3a00 +#define GS101_SYSTEM_STATUS 0x3a04 +#define GS101_SYSTEM_STATES 0x3a08 +#define GS101_SYSTEM_OPTION 0x3a0c +#define GS101_SYSTEM_CTRL 0x3a10 +#define GS101_SPARE_CTRL 0x3a14 +#define GS101_USER_DEFINED_OUT 0x3a18 +#define GS101_SYSTEM_OUT 0x3a20 +#define GS101_SYSTEM_IN 0x3a24 +#define GS101_SYSTEM_INT_IN 0x3a40 +#define GS101_SYSTEM_INT_EN 0x3a44 +#define GS101_SYSTEM_INT_TYPE 0x3a48 +#define GS101_SYSTEM_INT_DIR 0x3a4c +#define GS101_EINT_INT_IN 0x3a50 +#define GS101_EINT_INT_EN 0x3a54 +#define GS101_EINT_INT_TYPE 0x3a58 +#define GS101_EINT_INT_DIR 0x3a5c +#define GS101_EINT2_INT_IN 0x3a60 +#define GS101_EINT2_INT_EN 0x3a64 +#define GS101_EINT2_INT_TYPE 0x3a68 +#define GS101_EINT2_INT_DIR 0x3a6c +#define GS101_EINT3_INT_IN 0x3a70 +#define GS101_EINT3_INT_EN 0x3a74 
+#define GS101_EINT3_INT_TYPE 0x3a78 +#define GS101_EINT3_INT_DIR 0x3a7c +#define GS101_EINT_WAKEUP_MASK 0x3a80 +#define GS101_EINT_WAKEUP_MASK2 0x3a84 +#define GS101_EINT_WAKEUP_MASK3 0x3a88 +#define GS101_USER_DEFINED_INT_IN 0x3a90 +#define GS101_USER_DEFINED_INT_EN 0x3a94 +#define GS101_USER_DEFINED_INT_TYPE 0x3a98 +#define GS101_USER_DEFINED_INT_DIR 0x3a9c +#define GS101_SCAN2DRAM_INT_IN 0x3aa0 +#define GS101_SCAN2DRAM_INT_EN 0x3aa4 +#define GS101_SCAN2DRAM_INT_TYPE 0x3aa8 +#define GS101_SCAN2DRAM_INT_DIR 0x3aac +#define GS101_HCU_START 0x3ab0 +#define GS101_CUSTOM_OUT 0x3ac0 +#define GS101_CUSTOM_IN 0x3ac4 +#define GS101_CUSTOM_INT_IN 0x3ad0 +#define GS101_CUSTOM_INT_EN 0x3ad4 +#define GS101_CUSTOM_INT_TYPE 0x3ad8 +#define GS101_CUSTOM_INT_DIR 0x3adc +#define GS101_ACK_LAST_CPU 0x3afc +#define GS101_HCU_R(n) (0x3b00 + ((n) & 3) * 4) +#define GS101_HCU_SP 0x3b14 +#define GS101_HCU_PC 0x3b18 +#define GS101_PMU_RAM_CTRL 0x3b20 +#define GS101_APM_HCU_CTRL 0x3b24 +#define GS101_APM_NMI_ENABLE 0x3b30 +#define GS101_DBGCORE_NMI_ENABLE 0x3b34 +#define GS101_HCU_NMI_ENABLE 0x3b38 +#define GS101_PWR_HOLD_WDT_ENABLE 0x3b3c +#define GS101_NMI_SRC_IN 0x3b40 +#define GS101_RST_STAT 0x3b44 +#define GS101_RST_STAT_PMU 0x3b48 +#define GS101_HPM_INT_IN 0x3b60 +#define GS101_HPM_INT_EN 0x3b64 +#define GS101_HPM_INT_TYPE 0x3b68 +#define GS101_HPM_INT_DIR 0x3b6c +#define GS101_S2D_AUTH 0x3b70 +#define GS101_BOOT_STAT 0x3b74 +#define GS101_PMLINK_OUT 0x3c00 +#define GS101_PMLINK_AOC_OUT 0x3c04 +#define GS101_PMLINK_AOC_CTRL 0x3c08 +#define GS101_TCXO_BUF_CTRL 0x3c10 +#define GS101_ADD_CTRL 0x3c14 +#define GS101_HCU_TIMEOUT_RESET 0x3c20 +#define GS101_HCU_TIMEOUT_SCAN2DRAM 0x3c24 +#define GS101_TIMER(n) (0x3c80 + ((n) & 3) * 4) +#define GS101_PPC_MIF(n) (0x3c90 + ((n) & 3) * 4) +#define GS101_PPC_CORE 0x3ca0 +#define GS101_PPC_EH 0x3ca4 +#define GS101_PPC_CPUCL1_0 0x3ca8 +#define GS101_PPC_CPUCL1_1 0x3cac +#define GS101_EXT_REGULATOR_MIF_DURATION 0x3cb0 +#define 
GS101_EXT_REGULATOR_TOP_DURATION 0x3cb4 +#define GS101_EXT_REGULATOR_CPUCL2_DURATION 0x3cb8 +#define GS101_EXT_REGULATOR_CPUCL1_DURATION 0x3cbc +#define GS101_EXT_REGULATOR_G3D_DURATION 0x3cc0 +#define GS101_EXT_REGULATOR_TPU_DURATION 0x3cc4 +#define GS101_TCXO_DURATION 0x3cc8 +#define GS101_BURNIN_CTRL 0x3cd0 +#define GS101_JTAG_DBG_DET 0x3cd4 +#define GS101_MMC_CONWKUP_CTRL 0x3cd8 +#define GS101_USBDPPHY0_USBDP_WAKEUP 0x3cdc +#define GS101_TMU_TOP_TRIP 0x3ce0 +#define GS101_TMU_SUB_TRIP 0x3ce4 +#define GS101_MEMORY_CEN 0x3d00 +#define GS101_MEMORY_PGEN 0x3d04 +#define GS101_MEMORY_RET 0x3d08 +#define GS101_MEMORY_PGEN_FEEDBACK 0x3d0c +#define GS101_MEMORY_SMX 0x3d10 +#define GS101_MEMORY_SMX_FEEDBACK 0x3d14 +#define GS101_SLC_PCH_CHANNEL 0x3d20 +#define GS101_SLC_PCH_CB 0x3d24 +#define GS101_FORCE_NOMC 0x3d3c +#define GS101_FORCE_BOOST 0x3d4c +#define GS101_PMLINK_SLC_REQ 0x3d50 +#define GS101_PMLINK_SLC_ACK 0x3d54 +#define GS101_PMLINK_SLC_BUSY 0x3d58 +#define GS101_BOOTSYNC_OUT 0x3d80 +#define GS101_BOOTSYNC_IN 0x3d84 +#define GS101_SCAN_READY_OUT 0x3d88 +#define GS101_SCAN_READY_IN 0x3d8c +#define GS101_GSA_RESTORE 0x3d90 +#define GS101_ALIVE_OTP_LATCH 0x3d94 +#define GS101_DEBUG_OVERRIDE 0x3d98 +#define GS101_WDT_OPTION 0x3d9c +#define GS101_AOC_WDT_CFG 0x3da0 +#define GS101_CTRL_SECJTAG_ALIVE 0x3da4 +#define GS101_CTRL_DIV_PLL_ALV_DIVLOW 0x3e00 +#define GS101_CTRL_MUX_CLK_APM_REFSRC_AUTORESTORE 0x3e04 +#define GS101_CTRL_MUX_CLK_APM_REFSRC 0x3e08 +#define GS101_CTRL_MUX_CLK_APM_REF 0x3e0c +#define GS101_CTRL_MUX_PLL_ALV_DIV4 0x3e10 +#define GS101_CTRL_PLL_ALV_DIV4 0x3e14 +#define GS101_CTRL_OSCCLK_APMGSA 0x3e18 +#define GS101_CTRL_BLK_AOC_CLKS 0x3e1c +#define GS101_CTRL_PLL_ALV_LOCK 0x3e20 +#define GS101_CTRL_CLKDIV__CLKRTC 0x3e24 +#define GS101_CTRL_SOC32K 0x3e30 +#define GS101_CTRL_STM_PMU 0x3e34 +#define GS101_CTRL_PMU_DEBUG 0x3e38 +#define GS101_CTRL_DEBUG_UART 0x3e3c +#define GS101_CTRL_TCK 0x3e40 +#define GS101_CTRL_SBU_SW_EN 0x3e44 +#define 
GS101_PAD_CTRL_CLKOUT0 0x3e80 +#define GS101_PAD_CTRL_CLKOUT1 0x3e84 +#define GS101_PAD_CTRL_APM_24MOUT_0 0x3e88 +#define GS101_PAD_CTRL_APM_24MOUT_1 0x3e8c +#define GS101_PAD_CTRL_IO_FORCE_RETENTION 0x3e90 +#define GS101_PAD_CTRL_APACTIVE_n 0x3e94 +#define GS101_PAD_CTRL_TCXO_ON 0x3e98 +#define GS101_PAD_CTRL_PWR_HOLD 0x3e9c +#define GS101_PAD_CTRL_RESETO_n 0x3ea0 +#define GS101_PAD_CTRL_WRESETO_n 0x3ea4 +#define GS101_PHY_CTRL_USB20 0x3eb0 +#define GS101_PHY_CTRL_USBDP 0x3eb4 +#define GS101_PHY_CTRL_MIPI_DCPHY_M4M4 0x3eb8 +#define GS101_PHY_CTRL_MIPI_DCPHY_S4S4S4S4 0x3ebc +#define GS101_PHY_CTRL_PCIE_GEN4_0 0x3ec0 +#define GS101_PHY_CTRL_PCIE_GEN4_1 0x3ec4 +#define GS101_PHY_CTRL_UFS 0x3ec8 /* PMU INTR GEN */ #define GS101_GRP1_INTR_BID_UPEND (0x0108) -- cgit v1.2.3 From edd548dc64a699d71ea4f537f815044e763d01e1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 12:13:23 -0700 Subject: firmware: qcom: tzmem: fix qcom_tzmem_policy kernel-doc Fix kernel-doc warnings by using correct kernel-doc syntax and formatting to prevent warnings: Warning: include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_STATIC' not described in enum 'qcom_tzmem_policy' Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_MULTIPLIER' not described in enum 'qcom_tzmem_policy' Warning: ../include/linux/firmware/qcom/qcom_tzmem.h:25 Enum value 'QCOM_TZMEM_POLICY_ON_DEMAND' not described in enum 'qcom_tzmem_policy' Fixes: 84f5a7b67b61 ("firmware: qcom: add a dedicated TrustZone buffer allocator") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20251017191323.1820167-1-rdunlap@infradead.org Signed-off-by: Bjorn Andersson --- include/linux/firmware/qcom/qcom_tzmem.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/firmware/qcom/qcom_tzmem.h b/include/linux/firmware/qcom/qcom_tzmem.h index 48ac0e5454c7..23173e0c3ddd 100644 --- 
a/include/linux/firmware/qcom/qcom_tzmem.h +++ b/include/linux/firmware/qcom/qcom_tzmem.h @@ -17,11 +17,20 @@ struct qcom_tzmem_pool; * enum qcom_tzmem_policy - Policy for pool growth. */ enum qcom_tzmem_policy { - /**< Static pool, never grow above initial size. */ + /** + * @QCOM_TZMEM_POLICY_STATIC: Static pool, + * never grow above initial size. + */ QCOM_TZMEM_POLICY_STATIC = 1, - /**< When out of memory, add increment * current size of memory. */ + /** + * @QCOM_TZMEM_POLICY_MULTIPLIER: When out of memory, + * add increment * current size of memory. + */ QCOM_TZMEM_POLICY_MULTIPLIER, - /**< When out of memory add as much as is needed until max_size. */ + /** + * @QCOM_TZMEM_POLICY_ON_DEMAND: When out of memory + * add as much as is needed until max_size. + */ QCOM_TZMEM_POLICY_ON_DEMAND, }; -- cgit v1.2.3 From d742ebcfe524dc54023f7c520d2ed2e4b7203c19 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 14 Oct 2025 04:28:04 +0000 Subject: ASoC: soc.h: remove snd_soc_kcontrol_component() All drivers are now using snd_kcontrol_chip() instead of snd_soc_kcontrol_component() to get the component. Remove snd_soc_kcontrol_component(). Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87bjmam7jf.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index ddc508ff7b9b..1aebf14fcf80 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1305,22 +1305,6 @@ static inline unsigned int snd_soc_enum_item_to_val(const struct soc_enum *e, return e->values[item]; } -/** - * snd_soc_kcontrol_component() - Returns the component that registered the - * control - * @kcontrol: The control for which to get the component - * - * Note: This function will work correctly if the control has been registered - * for a component. 
With snd_soc_add_codec_controls() or via table based - * setup for either a CODEC or component driver. Otherwise the behavior is - * undefined. - */ -static inline struct snd_soc_component *snd_soc_kcontrol_component( - struct snd_kcontrol *kcontrol) -{ - return snd_kcontrol_chip(kcontrol); -} - int snd_soc_util_init(void); void snd_soc_util_exit(void); -- cgit v1.2.3 From a703a4c2a3280835003d4d0eb8845bac0f1a6ef1 Mon Sep 17 00:00:00 2001 From: Meenakshi Aggarwal Date: Mon, 6 Oct 2025 09:17:52 +0200 Subject: KEYS: trusted: caam based protected key - CAAM supports two types of protected keys: -- Plain key encrypted with ECB -- Plain key encrypted with CCM Due to robustness, default encryption used for protected key is CCM. - Generate protected key blob and add it to trusted key payload. This is done as part of sealing operation, which is triggered when below two operations are requested: -- new key generation -- load key, Signed-off-by: Pankaj Gupta Signed-off-by: Meenakshi Aggarwal Signed-off-by: Herbert Xu --- include/soc/fsl/caam-blob.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/soc/fsl/caam-blob.h b/include/soc/fsl/caam-blob.h index 937cac52f36d..922f7ec3e231 100644 --- a/include/soc/fsl/caam-blob.h +++ b/include/soc/fsl/caam-blob.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (C) 2020 Pengutronix, Ahmad Fatoum + * Copyright 2024-2025 NXP */ #ifndef __CAAM_BLOB_GEN @@ -12,11 +13,34 @@ #define CAAM_BLOB_KEYMOD_LENGTH 16 #define CAAM_BLOB_OVERHEAD (32 + 16) #define CAAM_BLOB_MAX_LEN 4096 +#define CAAM_ENC_ALGO_CCM 0x1 +#define CAAM_ENC_ALGO_ECB 0x2 +#define CAAM_NONCE_SIZE 6 +#define CAAM_ICV_SIZE 6 +#define CAAM_CCM_OVERHEAD (CAAM_NONCE_SIZE + CAAM_ICV_SIZE) struct caam_blob_priv; +/** + * struct caam_pkey_info - information for CAAM protected key + * @is_pkey: flag to identify, if the key is protected. 
+ * @key_enc_algo: identifies the algorithm, ccm or ecb + * @plain_key_sz: size of plain key. + * @key_buf: contains key data + */ +struct caam_pkey_info { + u8 is_pkey; + u8 key_enc_algo; + u16 plain_key_sz; + u8 key_buf[]; +} __packed; + +/* sizeof struct caam_pkey_info */ +#define CAAM_PKEY_HEADER 4 + /** * struct caam_blob_info - information for CAAM blobbing + * @pkey_info: pointer to keep protected key information * @input: pointer to input buffer (must be DMAable) * @input_len: length of @input buffer in bytes. * @output: pointer to output buffer (must be DMAable) @@ -26,6 +50,8 @@ struct caam_blob_priv; * May not exceed %CAAM_BLOB_KEYMOD_LENGTH */ struct caam_blob_info { + struct caam_pkey_info pkey_info; + void *input; size_t input_len; -- cgit v1.2.3 From aa653654ee67f9cbbebb7d4c18f360ad4fef3180 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sat, 11 Oct 2025 09:48:55 +0800 Subject: rhashtable: use likely for rhashtable lookup Sometimes, the result of the rhashtable_lookup() is expected to be found. Therefore, we can use likely() for such cases. The following new functions are introduced, which will use likely or unlikely during the lookup: rhashtable_lookup_likely rhltable_lookup_likely A micro-benchmark was run for these new functions: looking up an existing entry repeatedly 100000000 times, rhashtable_lookup_likely() gets a ~30% speedup.
Signed-off-by: Menglong Dong Signed-off-by: Herbert Xu --- include/linux/rhashtable.h | 70 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 05a221ce79a6..08e664b21f5a 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -355,12 +355,25 @@ static inline void rht_unlock(struct bucket_table *tbl, local_irq_restore(flags); } -static inline struct rhash_head *__rht_ptr( - struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) +enum rht_lookup_freq { + RHT_LOOKUP_NORMAL, + RHT_LOOKUP_LIKELY, +}; + +static __always_inline struct rhash_head *__rht_ptr( + struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt, + const enum rht_lookup_freq freq) { - return (struct rhash_head *) - ((unsigned long)p & ~BIT(0) ?: - (unsigned long)RHT_NULLS_MARKER(bkt)); + unsigned long p_val = (unsigned long)p & ~BIT(0); + + BUILD_BUG_ON(!__builtin_constant_p(freq)); + + if (freq == RHT_LOOKUP_LIKELY) + return (struct rhash_head *) + (likely(p_val) ? p_val : (unsigned long)RHT_NULLS_MARKER(bkt)); + else + return (struct rhash_head *) + (p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* @@ -370,10 +383,17 @@ static inline struct rhash_head *__rht_ptr( * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. 
*/ +static __always_inline struct rhash_head *__rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt, + const enum rht_lookup_freq freq) +{ + return __rht_ptr(rcu_dereference_all(*bkt), bkt, freq); +} + static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference_all(*bkt), bkt); + return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr( @@ -381,13 +401,15 @@ static inline struct rhash_head *rht_ptr( struct bucket_table *tbl, unsigned int hash) { - return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); + return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt, + RHT_LOOKUP_NORMAL); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { - return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); + return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt, + RHT_LOOKUP_NORMAL); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, @@ -588,7 +610,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, /* Internal function, do not use. */ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, - const struct rhashtable_params params) + const struct rhashtable_params params, + const enum rht_lookup_freq freq) { struct rhashtable_compare_arg arg = { .ht = ht, @@ -599,12 +622,13 @@ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhash_head *he; unsigned int hash; + BUILD_BUG_ON(!__builtin_constant_p(freq)); tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { - rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { + rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) { if (params.obj_cmpfn ? 
params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) @@ -643,11 +667,22 @@ static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { - struct rhash_head *he = __rhashtable_lookup(ht, key, params); + struct rhash_head *he = __rhashtable_lookup(ht, key, params, + RHT_LOOKUP_NORMAL); return he ? rht_obj(ht, he) : NULL; } +static __always_inline void *rhashtable_lookup_likely( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(ht, key, params, + RHT_LOOKUP_LIKELY); + + return likely(he) ? rht_obj(ht, he) : NULL; +} + /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table @@ -693,11 +728,22 @@ static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { - struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, + RHT_LOOKUP_NORMAL); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } +static __always_inline struct rhlist_head *rhltable_lookup_likely( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) +{ + struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, + RHT_LOOKUP_LIKELY); + + return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL; +} + /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). 
-- cgit v1.2.3 From 83c4e3c39b2b55afe56ed0d14b93b5f219350c81 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 10 Oct 2025 12:46:31 +0000 Subject: dt-bindings: firmware: google,gs101-acpm-ipc: add ACPM clocks The firmware exposes clocks that can be controlled via the Alive Clock and Power Manager (ACPM) interface. Make the ACPM node a clock provider by adding the mandatory "#clock-cells" property, which allows devices to reference its clock outputs. Signed-off-by: Tudor Ambarus Reviewed-by: Rob Herring (Arm) Reviewed-by: Peter Griffin Tested-by: Peter Griffin # on gs101-oriole Link: https://patch.msgid.link/20251010-acpm-clk-v6-1-321ee8826fd4@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/dt-bindings/clock/google,gs101-acpm.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 include/dt-bindings/clock/google,gs101-acpm.h (limited to 'include') diff --git a/include/dt-bindings/clock/google,gs101-acpm.h b/include/dt-bindings/clock/google,gs101-acpm.h new file mode 100644 index 000000000000..e2ba89e09fa6 --- /dev/null +++ b/include/dt-bindings/clock/google,gs101-acpm.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright 2025 Linaro Ltd. + * + * Device Tree binding constants for Google gs101 ACPM clock controller. 
+ */ + +#ifndef _DT_BINDINGS_CLOCK_GOOGLE_GS101_ACPM_H +#define _DT_BINDINGS_CLOCK_GOOGLE_GS101_ACPM_H + +#define GS101_CLK_ACPM_DVFS_MIF 0 +#define GS101_CLK_ACPM_DVFS_INT 1 +#define GS101_CLK_ACPM_DVFS_CPUCL0 2 +#define GS101_CLK_ACPM_DVFS_CPUCL1 3 +#define GS101_CLK_ACPM_DVFS_CPUCL2 4 +#define GS101_CLK_ACPM_DVFS_G3D 5 +#define GS101_CLK_ACPM_DVFS_G3DL2 6 +#define GS101_CLK_ACPM_DVFS_TPU 7 +#define GS101_CLK_ACPM_DVFS_INTCAM 8 +#define GS101_CLK_ACPM_DVFS_TNR 9 +#define GS101_CLK_ACPM_DVFS_CAM 10 +#define GS101_CLK_ACPM_DVFS_MFC 11 +#define GS101_CLK_ACPM_DVFS_DISP 12 +#define GS101_CLK_ACPM_DVFS_BO 13 + +#endif /* _DT_BINDINGS_CLOCK_GOOGLE_GS101_ACPM_H */ -- cgit v1.2.3 From 84a222d1b369ba83f8947948670f775367e653f1 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Fri, 10 Oct 2025 12:46:32 +0000 Subject: firmware: exynos-acpm: add DVFS protocol Add ACPM DVFS protocol handler. It constructs DVFS messages that the APM firmware can understand. Signed-off-by: Tudor Ambarus Reviewed-by: Peter Griffin Tested-by: Peter Griffin # on gs101-oriole Link: https://patch.msgid.link/20251010-acpm-clk-v6-2-321ee8826fd4@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/firmware/samsung/exynos-acpm-protocol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index f628bf1862c2..b1e95435240f 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -13,6 +13,15 @@ struct acpm_handle; struct device_node; +struct acpm_dvfs_ops { + int (*set_rate)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, unsigned int clk_id, + unsigned long rate); + unsigned long (*get_rate)(const struct acpm_handle *handle, + unsigned int acpm_chan_id, + unsigned int clk_id); +}; + struct acpm_pmic_ops { int (*read_reg)(const struct acpm_handle *handle, unsigned int 
acpm_chan_id, u8 type, u8 reg, u8 chan, @@ -32,6 +41,7 @@ struct acpm_pmic_ops { }; struct acpm_ops { + struct acpm_dvfs_ops dvfs_ops; struct acpm_pmic_ops pmic_ops; }; -- cgit v1.2.3 From 7f3779a3ac3e474d043f0a2b77dd6e6bb020c577 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Aug 2025 17:52:43 +0000 Subject: mm/filemap: Add NUMA mempolicy support to filemap_alloc_folio() Add a mempolicy parameter to filemap_alloc_folio() to enable NUMA-aware page cache allocations. This will be used by upcoming changes to support NUMA policies in guest-memfd, where guest memory needs to be allocated according to the NUMA policy specified by the VMM. All existing users pass NULL, maintaining current behavior. Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-4-shivankg@amd.com Signed-off-by: Sean Christopherson --- include/linux/pagemap.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..f1d0610210f7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -654,9 +654,11 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy); #else -static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *policy) { return folio_alloc_noprof(gfp, order); } @@ -667,7 +669,7 @@ static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int o static inline struct page *__page_cache_alloc(gfp_t gfp) { - return
&filemap_alloc_folio(gfp, 0, NULL)->page; } static inline gfp_t readahead_gfp_mask(struct address_space *x) -- cgit v1.2.3 From 16a542e22339cd5e73e56a956bbd335c7bd7c08c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 27 Aug 2025 17:52:44 +0000 Subject: mm/filemap: Extend __filemap_get_folio() to support NUMA memory policies Extend __filemap_get_folio() to support NUMA memory policies by renaming the implementation to __filemap_get_folio_mpol() and adding a mempolicy parameter. The original function becomes a static inline wrapper that passes NULL for the mempolicy. This infrastructure will enable future support for NUMA-aware page cache allocations in guest_memfd memory backend KVM guests. Reviewed-by: Pankaj Gupta Reviewed-by: Vlastimil Babka Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/r/20250827175247.83322-5-shivankg@amd.com Signed-off-by: Sean Christopherson --- include/linux/pagemap.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f1d0610210f7..a17fabbc0269 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -755,11 +755,17 @@ static inline fgf_t fgf_set_order(size_t size) } void *filemap_get_entry(struct address_space *mapping, pgoff_t index); -struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp); +struct folio *__filemap_get_folio_mpol(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp, struct mempolicy *policy); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, fgf_t fgp_flags, gfp_t gfp); +static inline struct folio *__filemap_get_folio(struct address_space *mapping, + pgoff_t index, fgf_t fgf_flags, gfp_t gfp) +{ + return __filemap_get_folio_mpol(mapping, index, fgf_flags, gfp, NULL); +} + /** * 
write_begin_get_folio - Get folio for write_begin with flags. * @iocb: The kiocb passed from write_begin (may be NULL). -- cgit v1.2.3 From a63ca4236e6799cf4343f9aec9d92afdfa582446 Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Thu, 16 Oct 2025 10:28:44 -0700 Subject: KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes guest_memfd's inode represents memory the guest_memfd is providing. guest_memfd's file represents a struct kvm's view of that memory. Using a custom inode allows customization of the inode teardown process via callbacks. For example, ->evict_inode() allows customization of the truncation process on file close, and ->destroy_inode() and ->free_inode() allow customization of the inode freeing process. Customizing the truncation process allows flexibility in management of guest_memfd memory and customization of the inode freeing process allows proper cleanup of memory metadata stored on the inode. Memory metadata is more appropriately stored on the inode (as opposed to the file), since the metadata is for the memory and is not unique to a specific binding and struct kvm. 
Acked-by: David Hildenbrand Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Ackerley Tng Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: drop helpers, open code logic in __kvm_gmem_create()] Link: https://lore.kernel.org/r/20251016172853.52451-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/uapi/linux/magic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index bb575f3ab45e..638ca21b7a90 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -103,5 +103,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ +#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ #endif /* __LINUX_MAGIC_H__ */ -- cgit v1.2.3 From be180c847a6db6646d7bb4740a1d73f6f67d1030 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 19 Oct 2025 20:43:20 -0700 Subject: RDMA/uverbs: fix some kernel-doc warnings Fix 49 kernel-doc warnings in ib_verbs.h: - Add struct short description for rdma_stat_desc, rdma_hw_stats. - Fix kernel-doc format for struct members (use ':' instead of '-') for several structs. - Don't use "/**" kernel-doc notation for struct members in ib_device_ops (most members are not documented and most of the kernel-doc was not formatted correctly). - Spell function parameters correctly in ib_dma_map_sgtable_attrs(), ib_device_try_get(), rdma_roce_rescan_device(). - Add kernel-doc for the function parameter in rdma_flow_label_to_udp_sport(). 
Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251020034320.3011094-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 99 +++++++++++++++++++++++++------------------------ 1 file changed, 50 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6139223e92e4..0a85af610b6b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -586,10 +586,10 @@ enum ib_stat_flag { }; /** - * struct rdma_stat_desc - * @name - The name of the counter - * @flags - Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL - * @priv - Driver private information; Core code should not use + * struct rdma_stat_desc - description of one rdma stat/counter + * @name: The name of the counter + * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL + * @priv: Driver private information; Core code should not use */ struct rdma_stat_desc { const char *name; @@ -598,24 +598,24 @@ struct rdma_stat_desc { }; /** - * struct rdma_hw_stats - * @lock - Mutex to protect parallel write access to lifespan and values + * struct rdma_hw_stats - collection of hardware stats and their management + * @lock: Mutex to protect parallel write access to lifespan and values * of counters, which are 64bits and not guaranteed to be written * atomicaly on 32bits systems. - * @timestamp - Used by the core code to track when the last update was - * @lifespan - Used by the core code to determine how old the counters + * @timestamp: Used by the core code to track when the last update was + * @lifespan: Used by the core code to determine how old the counters * should be before being updated again. Stored in jiffies, defaults * to 10 milliseconds, drivers can override the default be specifying * their own value during their allocation routine. 
- * @descs - Array of pointers to static descriptors used for the counters + * @descs: Array of pointers to static descriptors used for the counters * in directory. - * @is_disabled - A bitmap to indicate each counter is currently disabled + * @is_disabled: A bitmap to indicate each counter is currently disabled * or not. - * @num_counters - How many hardware counters there are. If name is + * @num_counters: How many hardware counters there are. If name is * shorter than this number, a kernel oops will result. Driver authors * are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@name) < num_counters) * in their code to prevent this. - * @value - Array of u64 counters that are accessed by the sysfs code and + * @value: Array of u64 counters that are accessed by the sysfs code and * filled in by the drivers get_stats routine */ struct rdma_hw_stats { @@ -2405,7 +2405,7 @@ struct ib_device_ops { int (*modify_port)(struct ib_device *device, u32 port_num, int port_modify_mask, struct ib_port_modify *port_modify); - /** + /* * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this * structure to avoid cache line misses when accessing struct ib_device @@ -2415,7 +2415,7 @@ struct ib_device_ops { struct ib_port_immutable *immutable); enum rdma_link_layer (*get_link_layer)(struct ib_device *device, u32 port_num); - /** + /* * When calling get_netdev, the HW vendor's driver should return the * net device of device @device at port @port_num or NULL if such * a net device doesn't exist. 
The vendor driver should call dev_hold @@ -2425,7 +2425,7 @@ struct ib_device_ops { */ struct net_device *(*get_netdev)(struct ib_device *device, u32 port_num); - /** + /* * rdma netdev operation * * Driver implementing alloc_rdma_netdev or rdma_netdev_get_params @@ -2439,14 +2439,14 @@ struct ib_device_ops { int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, struct rdma_netdev_alloc_params *params); - /** + /* * query_gid should be return GID value for @device, when @port_num * link layer is either IB or iWarp. It is no-op if @port_num port * is RoCE link layer. */ int (*query_gid)(struct ib_device *device, u32 port_num, int index, union ib_gid *gid); - /** + /* * When calling add_gid, the HW vendor's driver should add the gid * of device of port at gid index available at @attr. Meta-info of * that gid (for example, the network device related to this gid) is @@ -2460,7 +2460,7 @@ struct ib_device_ops { * roce_gid_table is used. */ int (*add_gid)(const struct ib_gid_attr *attr, void **context); - /** + /* * When calling del_gid, the HW vendor's driver should delete the * gid of device @device at gid index gid_index of port port_num * available in @attr. @@ -2475,7 +2475,7 @@ struct ib_device_ops { struct ib_udata *udata); void (*dealloc_ucontext)(struct ib_ucontext *context); int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma); - /** + /* * This will be called once refcount of an entry in mmap_xa reaches * zero. The type of the memory that was mapped may differ between * entries and is opaque to the rdma_user_mmap interface. 
@@ -2516,12 +2516,12 @@ struct ib_device_ops { int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); - /** + /* * pre_destroy_cq - Prevent a cq from generating any new work * completions, but not free any kernel resources */ int (*pre_destroy_cq)(struct ib_cq *cq); - /** + /* * post_destroy_cq - Free all kernel resources */ void (*post_destroy_cq)(struct ib_cq *cq); @@ -2615,7 +2615,7 @@ struct ib_device_ops { struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset); - /** + /* * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and * fill in the driver initialized data. The struct is kfree()'ed by * the sysfs core when the device is removed. A lifespan of -1 in the @@ -2624,7 +2624,7 @@ struct ib_device_ops { struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device); struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device, u32 port_num); - /** + /* * get_hw_stats - Fill in the counter value(s) in the stats struct. * @index - The index in the value array we wish to have updated, or * num_counters if we want all stats updated @@ -2639,14 +2639,14 @@ struct ib_device_ops { int (*get_hw_stats)(struct ib_device *device, struct rdma_hw_stats *stats, u32 port, int index); - /** + /* * modify_hw_stat - Modify the counter configuration * @enable: true/false when enable/disable a counter * Return codes - 0 on success or error code otherwise. */ int (*modify_hw_stat)(struct ib_device *device, u32 port, unsigned int counter_index, bool enable); - /** + /* * Allows rdma drivers to add their own restrack attributes. 
*/ int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr); @@ -2682,39 +2682,39 @@ struct ib_device_ops { u8 pdata_len); int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog); int (*iw_destroy_listen)(struct iw_cm_id *cm_id); - /** + /* * counter_bind_qp - Bind a QP to a counter. * @counter - The counter to be bound. If counter->id is zero then * the driver needs to allocate a new counter and set counter->id */ int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp, u32 port); - /** + /* * counter_unbind_qp - Unbind the qp from the dynamically-allocated * counter and bind it onto the default one */ int (*counter_unbind_qp)(struct ib_qp *qp, u32 port); - /** + /* * counter_dealloc -De-allocate the hw counter */ int (*counter_dealloc)(struct rdma_counter *counter); - /** + /* * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in * the driver initialized data. */ struct rdma_hw_stats *(*counter_alloc_stats)( struct rdma_counter *counter); - /** + /* * counter_update_stats - Query the stats value of this counter */ int (*counter_update_stats)(struct rdma_counter *counter); - /** + /* * counter_init - Initialize the driver specific rdma counter struct. */ void (*counter_init)(struct rdma_counter *counter); - /** + /* * Allows rdma drivers to add their own restrack attributes * dumped via 'rdma stat' iproute2 command. */ @@ -2730,25 +2730,25 @@ struct ib_device_ops { */ int (*get_numa_node)(struct ib_device *dev); - /** + /* * add_sub_dev - Add a sub IB device */ struct ib_device *(*add_sub_dev)(struct ib_device *parent, enum rdma_nl_dev_type type, const char *name); - /** + /* * del_sub_dev - Delete a sub IB device */ void (*del_sub_dev)(struct ib_device *sub_dev); - /** + /* * ufile_cleanup - Attempt to cleanup ubojects HW resources inside * the ufile. 
*/ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); - /** + /* * report_port_event - Drivers need to implement this if they have * some private stuff to handle when link status changes. */ @@ -3157,8 +3157,8 @@ static inline u32 rdma_start_port(const struct ib_device *device) /** * rdma_for_each_port - Iterate over all valid port numbers of the IB device - * @device - The struct ib_device * to iterate over - * @iter - The unsigned int to store the port number + * @device: The struct ib_device * to iterate over + * @iter: The unsigned int to store the port number */ #define rdma_for_each_port(device, iter) \ for (iter = rdma_start_port(device + \ @@ -3524,7 +3524,7 @@ static inline bool rdma_core_cap_opa_port(struct ib_device *device, /** * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. * @device: Device - * @port_num: Port number + * @port: Port number * @mtu: enum value of MTU * * Return the MTU size supported by the port as an integer value. Will return @@ -3542,7 +3542,7 @@ static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port, /** * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. * @device: Device - * @port_num: Port number + * @port: Port number * @attr: port attribute * * Return the MTU size supported by the port as an integer value. @@ -3919,7 +3919,7 @@ static inline int ib_destroy_qp(struct ib_qp *qp) /** * ib_open_qp - Obtain a reference to an existing sharable QP. - * @xrcd - XRC domain + * @xrcd: XRC domain * @qp_open_attr: Attributes identifying the QP to open. * * Returns a reference to a sharable QP. 
@@ -4273,9 +4273,9 @@ static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev, /** * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses * @dev: The device for which the DMA addresses are to be created - * @sg: The sg_table object describing the buffer + * @sgt: The sg_table object describing the buffer * @direction: The direction of the DMA - * @attrs: Optional DMA attributes for the map operation + * @dma_attrs: Optional DMA attributes for the map operation */ static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev, struct sg_table *sgt, @@ -4419,8 +4419,8 @@ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, /** * ib_update_fast_reg_key - updates the key portion of the fast_reg MR * R_Key and L_Key. - * @mr - struct ib_mr pointer to be updated. - * @newkey - new key to be used. + * @mr: struct ib_mr pointer to be updated. + * @newkey: new key to be used. */ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) { @@ -4431,7 +4431,7 @@ static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey) /** * ib_inc_rkey - increments the key portion of the given rkey. Can be used * for calculating a new rkey for type 2 memory windows. - * @rkey - the rkey to increment. + * @rkey: the rkey to increment. */ static inline u32 ib_inc_rkey(u32 rkey) { @@ -4525,7 +4525,7 @@ int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, /** * ib_device_try_get: Hold a registration lock - * device: The device to lock + * @dev: The device to lock * * A device under an active registration lock cannot become unregistered. It * is only possible to obtain a registration lock on a device that is fully @@ -4832,7 +4832,7 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
* - * @device: the rdma device + * @ibdev: the rdma device */ void rdma_roce_rescan_device(struct ib_device *ibdev); void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port); @@ -4885,7 +4885,7 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) /** * ibdev_to_node - return the NUMA node for a given ib_device - * @dev: device to get the NUMA node for. + * @ibdev: device to get the NUMA node for. */ static inline int ibdev_to_node(struct ib_device *ibdev) { @@ -4923,6 +4923,7 @@ static inline struct net *rdma_dev_net(struct ib_device *device) /** * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based * on the flow_label + * @fl: flow_label value * * This function will convert the 20 bit flow_label input to a valid RoCE v2 * UDP src port 14 bit value. All RoCE V2 drivers should use this same -- cgit v1.2.3 From 7be20254a743be4f02414b9d56cc3fe5f84e6500 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Sep 2025 04:25:22 -0600 Subject: io_uring: unify task_work cancelation checks Rather than do per-tw checking, which needs to dip into the task_struct for checking flags, do it upfront before running task_work. This places a 'cancel' member in io_tw_token_t, which is assigned before running task_work for that given ctx. This is both more efficient in doing it upfront rather than for every task_work, and it means that io_should_terminate_tw() can be made private in io_uring.c rather than need to be called by various callbacks of task_work. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c2ea6280901d..25ee982eb435 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -474,6 +474,7 @@ struct io_ring_ctx { * ONLY core io_uring.c should instantiate this struct. 
*/ struct io_tw_state { + bool cancel; }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ typedef struct io_tw_state io_tw_token_t; -- cgit v1.2.3 From c5ebcc80fcf7d2c6ed917371f024d2da5bce9128 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:07:27 -0700 Subject: iio: adc: qcom-vadc-common: fix vadc_scale_fn_type kernel-doc Fix multiple warnings in enum vadc_scale_fn_type by adding a leading '@' to the kernel-doc descriptions. Fixed 14 warnings in this one enum, such as: Warning: include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_DEFAULT' not described in enum 'vadc_scale_fn_type' Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_THERM_100K_PULLUP' not described in enum 'vadc_scale_fn_type' Warning: ../include/linux/iio/adc/qcom-vadc-common.h:123 Enum value 'SCALE_PMIC_THERM' not described in enum 'vadc_scale_fn_type' Also prevent the warning on SCALE_HW_CALIB_INVALID by marking it "private:" so that kernel-doc notation is not needed for it. This leaves only one warning here, which I don't know the appropriate description of: qcom-vadc-common.h:125: warning: Enum value 'SCALE_HW_CALIB_PMIC_THERM_PM7' not described in enum 'vadc_scale_fn_type' Signed-off-by: Randy Dunlap Signed-off-by: Jonathan Cameron --- include/linux/iio/adc/qcom-vadc-common.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/iio/adc/qcom-vadc-common.h b/include/linux/iio/adc/qcom-vadc-common.h index aa21b032e861..3bf4c49726a7 100644 --- a/include/linux/iio/adc/qcom-vadc-common.h +++ b/include/linux/iio/adc/qcom-vadc-common.h @@ -83,27 +83,27 @@ struct vadc_linear_graph { /** * enum vadc_scale_fn_type - Scaling function to convert ADC code to * physical scaled units for the channel. - * SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV). - * SCALE_THERM_100K_PULLUP: Returns temperature in millidegC. 
+ * @SCALE_DEFAULT: Default scaling to convert raw adc code to voltage (uV). + * @SCALE_THERM_100K_PULLUP: Returns temperature in millidegC. * Uses a mapping table with 100K pullup. - * SCALE_PMIC_THERM: Returns result in milli degree's Centigrade. - * SCALE_XOTHERM: Returns XO thermistor voltage in millidegC. - * SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp - * SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to + * @SCALE_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_XOTHERM: Returns XO thermistor voltage in millidegC. + * @SCALE_PMI_CHG_TEMP: Conversion for PMI CHG temp + * @SCALE_HW_CALIB_DEFAULT: Default scaling to convert raw adc code to * voltage (uV) with hardware applied offset/slope values to adc code. - * SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using + * @SCALE_HW_CALIB_THERM_100K_PULLUP: Returns temperature in millidegC using * lookup table. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using + * @SCALE_HW_CALIB_XOTHERM: Returns XO thermistor voltage in millidegC using * 100k pullup. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using + * @SCALE_HW_CALIB_THERM_100K_PU_PM7: Returns temperature in millidegC using * lookup table for PMIC7. The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. * The hardware applies offset/slope to adc code. - * SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. + * @SCALE_HW_CALIB_PMIC_THERM: Returns result in milli degree's Centigrade. * The hardware applies offset/slope to adc code. This is for PMIC7. 
- * SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5 + * @SCALE_HW_CALIB_PM5_CHG_TEMP: Returns result in millidegrees for PMIC5 * charger temperature. - * SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5 + * @SCALE_HW_CALIB_PM5_SMB_TEMP: Returns result in millidegrees for PMIC5 * SMB1390 temperature. */ enum vadc_scale_fn_type { @@ -120,6 +120,7 @@ enum vadc_scale_fn_type { SCALE_HW_CALIB_PMIC_THERM_PM7, SCALE_HW_CALIB_PM5_CHG_TEMP, SCALE_HW_CALIB_PM5_SMB_TEMP, + /* private: */ SCALE_HW_CALIB_INVALID, }; -- cgit v1.2.3 From ca82a7ea2299b4586af1f77daee66ee781202320 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Fri, 19 Sep 2025 14:42:50 -0700 Subject: iomap: simplify iomap_iter_advance() Most callers of iomap_iter_advance() do not need the remaining length returned. Get rid of the extra iomap_length() call that iomap_iter_advance() does. Signed-off-by: Joanne Koong Reviewed-by: Brian Foster Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- include/linux/iomap.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 73dceabc21c8..4469b2318b08 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -245,7 +245,7 @@ struct iomap_iter { }; int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops); -int iomap_iter_advance(struct iomap_iter *iter, u64 *count); +int iomap_iter_advance(struct iomap_iter *iter, u64 count); /** * iomap_length_trim - trimmed length of the current iomap iteration @@ -282,9 +282,7 @@ static inline u64 iomap_length(const struct iomap_iter *iter) */ static inline int iomap_iter_advance_full(struct iomap_iter *iter) { - u64 length = iomap_length(iter); - - return iomap_iter_advance(iter, &length); + return iomap_iter_advance(iter, iomap_length(iter)); } /** -- cgit v1.2.3 From dc816f8d925cac34922ea73abd94ae23a96cacac Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 1 
Oct 2025 01:53:14 +0200 Subject: fs: assert ->i_lock held in __iget() Also remove the now redundant comment. Signed-off-by: Mateusz Guzik Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..ac62b9d10b00 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3378,11 +3378,9 @@ static inline bool is_zero_ino(ino_t ino) return (u32)ino == 0; } -/* - * inode->i_lock must be held - */ static inline void __iget(struct inode *inode) { + lockdep_assert_held(&inode->i_lock); atomic_inc(&inode->i_count); } -- cgit v1.2.3 From 31e332b911fca54df467d264d7e2a2ef9317f3ca Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 6 Oct 2025 01:15:26 +0200 Subject: fs: add missing fences to I_NEW handling Suppose there are 2 CPUs racing inode hash lookup func (say ilookup5()) and unlock_new_inode(). In principle the latter can clear the I_NEW flag before prior stores into the inode were made visible. The former can in turn observe I_NEW is cleared and proceed to use the inode, while possibly reading from not-yet-published areas. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/writeback.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..e1e1231a6830 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -194,6 +194,10 @@ static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), !(READ_ONCE(inode->i_state) & I_NEW)); + /* + * Pairs with routines clearing I_NEW. 
+ */ + smp_rmb(); } #ifdef CONFIG_CGROUP_WRITEBACK -- cgit v1.2.3 From af6023e2ce0a3d4d948885d464b0ddca4b8b1fdf Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:15 +0200 Subject: fs: move wait_on_inode() from writeback.h to fs.h The only consumer outside of fs/inode.c is gfs2 and it already includes fs.h in the relevant file. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++++ include/linux/writeback.h | 11 ----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index ac62b9d10b00..b35014ba681b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -949,6 +949,16 @@ static inline void inode_fake_hash(struct inode *inode) hlist_add_fake(&inode->i_hash); } +static inline void wait_on_inode(struct inode *inode) +{ + wait_var_event(inode_state_wait_address(inode, __I_NEW), + !(READ_ONCE(inode->i_state) & I_NEW)); + /* + * Pairs with routines clearing I_NEW. + */ + smp_rmb(); +} + /* * inode->i_rwsem nesting subclasses for the lock validator: * diff --git a/include/linux/writeback.h b/include/linux/writeback.h index e1e1231a6830..06195c2a535b 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,17 +189,6 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); -/* writeback.h requires fs.h; it, too, is not included from here. */ -static inline void wait_on_inode(struct inode *inode) -{ - wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); - /* - * Pairs with routines clearing I_NEW. 
- */ - smp_rmb(); -} - #ifdef CONFIG_CGROUP_WRITEBACK #include -- cgit v1.2.3 From cb5db358ab5769cbd3e8e864f14af321126cccdb Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:16 +0200 Subject: fs: spell out fenced ->i_state accesses with explicit smp_wmb/smp_rmb The incoming helpers don't ship with _release/_acquire variants, for the time being anyway. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/backing-dev.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3e64f14739dd..065cba5dc111 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -277,10 +277,11 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) rcu_read_lock(); /* - * Paired with store_release in inode_switch_wbs_work_fn() and + * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; + cookie->locked = inode->i_state & I_WB_SWITCH; + smp_rmb(); if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); -- cgit v1.2.3 From d8753f788ab4916341d9fab81795be9f2f49c264 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:17 +0200 Subject: fs: provide accessors for ->i_state Open-coded accesses prevent asserting they are done correctly. One obvious aspect is locking, but significantly more can be checked. For example it can be detected when the code is clearing flags which are already missing, or is setting flags when it is illegal (e.g., I_FREEING when ->i_count > 0). In order to keep things manageable this patchset merely gets the thing off the ground with only lockdep checks baked in. Current consumers can be trivially converted. Suppose flags I_A and I_B are to be handled.
If ->i_lock is held, then: state = inode->i_state => state = inode_state_read(inode) inode->i_state |= (I_A | I_B) => inode_state_set(inode, I_A | I_B) inode->i_state &= ~(I_A | I_B) => inode_state_clear(inode, I_A | I_B) inode->i_state = I_A | I_B => inode_state_assign(inode, I_A | I_B) If ->i_lock is not held or only held conditionally: state = inode->i_state => state = inode_state_read_once(inode) inode->i_state |= (I_A | I_B) => inode_state_set_raw(inode, I_A | I_B) inode->i_state &= ~(I_A | I_B) => inode_state_clear_raw(inode, I_A | I_B) inode->i_state = I_A | I_B => inode_state_assign_raw(inode, I_A | I_B) The "_once" vs "_raw" discrepancy stems from the read variant differing by READ_ONCE as opposed to just lockdep checks. Finally, if you want to atomically clear flags and set new ones, the following: state = inode->i_state; state &= ~I_A; state |= I_B; inode->i_state = state; turns into: inode_state_replace(inode, I_A, I_B); Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index b35014ba681b..909eb1e68637 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -759,7 +759,7 @@ enum inode_state_bits { /* reserved wait address bit 3 */ }; -enum inode_state_flags_t { +enum inode_state_flags_enum { I_NEW = (1U << __I_NEW), I_SYNC = (1U << __I_SYNC), I_LRU_ISOLATING = (1U << __I_LRU_ISOLATING), @@ -843,7 +843,7 @@ struct inode { #endif /* Misc */ - enum inode_state_flags_t i_state; + enum inode_state_flags_enum i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -902,6 +902,80 @@ struct inode { void *i_private; /* fs or device private pointer */ } __randomize_layout; +/* + * i_state handling + * + * We hide all of it behind helpers so that we can validate consumers. 
+ */ +static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) +{ + return READ_ONCE(inode->i_state); +} + +static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) +{ + lockdep_assert_held(&inode->i_lock); + return inode->i_state; +} + +static inline void inode_state_set_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state, inode->i_state | flags); +} + +static inline void inode_state_set(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_set_raw(inode, flags); +} + +static inline void inode_state_clear_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state, inode->i_state & ~flags); +} + +static inline void inode_state_clear(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_clear_raw(inode, flags); +} + +static inline void inode_state_assign_raw(struct inode *inode, + enum inode_state_flags_enum flags) +{ + WRITE_ONCE(inode->i_state, flags); +} + +static inline void inode_state_assign(struct inode *inode, + enum inode_state_flags_enum flags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace_raw(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + enum inode_state_flags_enum flags; + flags = inode->i_state; + flags &= ~clearflags; + flags |= setflags; + inode_state_assign_raw(inode, flags); +} + +static inline void inode_state_replace(struct inode *inode, + enum inode_state_flags_enum clearflags, + enum inode_state_flags_enum setflags) +{ + lockdep_assert_held(&inode->i_lock); + inode_state_replace_raw(inode, clearflags, setflags); +} + static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { VFS_WARN_ON_INODE(strlen(link) != linklen, inode); -- cgit 
v1.2.3 From f5aa78e2be066f3801785094f1b55a3114fe461a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:19 +0200 Subject: Manual conversion to use ->i_state accessors of all places not covered by coccinelle Nothing to look at apart from iput_final(). Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/backing-dev.h | 2 +- include/linux/fs.h | 6 +++--- include/linux/writeback.h | 2 +- include/trace/events/writeback.h | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 065cba5dc111..0c8342747cab 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -280,7 +280,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) * Paired with a release fence in inode_do_switch_wbs() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ - cookie->locked = inode->i_state & I_WB_SWITCH; + cookie->locked = inode_state_read_once(inode) & I_WB_SWITCH; smp_rmb(); if (unlikely(cookie->locked)) diff --git a/include/linux/fs.h b/include/linux/fs.h index 909eb1e68637..77b6486dcae7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1026,7 +1026,7 @@ static inline void inode_fake_hash(struct inode *inode) static inline void wait_on_inode(struct inode *inode) { wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(READ_ONCE(inode->i_state) & I_NEW)); + !(inode_state_read_once(inode) & I_NEW)); /* * Pairs with routines clearing I_NEW. 
*/ @@ -2719,8 +2719,8 @@ static inline int icount_read(const struct inode *inode) */ static inline bool inode_is_dirtytime_only(struct inode *inode) { - return (inode->i_state & (I_DIRTY_TIME | I_NEW | - I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; + return (inode_state_read_once(inode) & + (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 06195c2a535b..102071ffedcb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -227,7 +227,7 @@ static inline void inode_attach_wb(struct inode *inode, struct folio *folio) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { - WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); + WARN_ON_ONCE(!(inode_state_read_once(inode) & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index c08aff044e80..311a341e6fe4 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -120,7 +120,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template, /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->flags = flags; ), @@ -748,7 +748,7 @@ TRACE_EVENT(writeback_sb_inodes_requeue, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), @@ -787,7 +787,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template, strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state 
= inode_state_read_once(inode); __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; @@ -839,7 +839,7 @@ DECLARE_EVENT_CLASS(writeback_inode_template, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->state = inode->i_state; + __entry->state = inode_state_read_once(inode); __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), -- cgit v1.2.3 From 2ed81b4bef9b74ae0f095ad4667dbe2ae0b86a91 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 9 Oct 2025 09:59:28 +0200 Subject: fs: make plain ->i_state access fail to compile ... to make sure all accesses are properly validated. Merely renaming the var to __i_state still lets the compiler make the following suggestion: error: 'struct inode' has no member named 'i_state'; did you mean '__i_state'? Unfortunately some people will add the __'s and call it a day. In order to make it harder to mess up in this way, hide it behind a struct. The resulting error message should be convincing in terms of checking what to do: error: invalid operands to binary & (have 'struct inode_state_flags' and 'int') Of course people determined to do a plain access can still do it, but nothing can be done for that case. Signed-off-by: Mateusz Guzik Signed-off-by: Christian Brauner --- include/linux/fs.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 77b6486dcae7..21c73df3ce75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -785,6 +785,13 @@ enum inode_state_flags_enum { #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) +/* + * Use inode_state_read() & friends to access. 
+ */ +struct inode_state_flags { + enum inode_state_flags_enum __state; +}; + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -843,7 +850,7 @@ struct inode { #endif /* Misc */ - enum inode_state_flags_enum i_state; + struct inode_state_flags i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; @@ -909,19 +916,19 @@ struct inode { */ static inline enum inode_state_flags_enum inode_state_read_once(struct inode *inode) { - return READ_ONCE(inode->i_state); + return READ_ONCE(inode->i_state.__state); } static inline enum inode_state_flags_enum inode_state_read(struct inode *inode) { lockdep_assert_held(&inode->i_lock); - return inode->i_state; + return inode->i_state.__state; } static inline void inode_state_set_raw(struct inode *inode, enum inode_state_flags_enum flags) { - WRITE_ONCE(inode->i_state, inode->i_state | flags); + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state | flags); } static inline void inode_state_set(struct inode *inode, @@ -934,7 +941,7 @@ static inline void inode_state_set(struct inode *inode, static inline void inode_state_clear_raw(struct inode *inode, enum inode_state_flags_enum flags) { - WRITE_ONCE(inode->i_state, inode->i_state & ~flags); + WRITE_ONCE(inode->i_state.__state, inode->i_state.__state & ~flags); } static inline void inode_state_clear(struct inode *inode, @@ -947,7 +954,7 @@ static inline void inode_state_clear(struct inode *inode, static inline void inode_state_assign_raw(struct inode *inode, enum inode_state_flags_enum flags) { - WRITE_ONCE(inode->i_state, flags); + WRITE_ONCE(inode->i_state.__state, flags); } static inline void inode_state_assign(struct inode *inode, @@ -962,7 +969,7 @@ static inline void inode_state_replace_raw(struct inode *inode, enum inode_state_flags_enum setflags) { enum inode_state_flags_enum flags; - flags = inode->i_state; + flags = inode->i_state.__state; flags &= ~clearflags; flags |= setflags; 
inode_state_assign_raw(inode, flags); -- cgit v1.2.3 From 1888635532fbbd6be4a4368621085c3a197279f8 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 16:53:15 +0800 Subject: writeback: Wake up waiting tasks when finishing the writeback of a chunk. Writing back a large number of pages can take a lot of time. This issue is exacerbated when the underlying device is slow or subject to block layer rate limiting, which in turn triggers unexpected hung task warnings. We can trigger a wake-up once a chunk has been written back and the waiting time for writeback exceeds half of sysctl_hung_task_timeout_secs. This action allows the hung task detector to be aware of the writeback progress, thereby eliminating these unexpected hung task warnings. This patch has passed the xfstests 'check -g quick' test based on ext4, with no additional failures introduced. Signed-off-by: Julian Sun Reviewed-by: Jan Kara Suggested-by: Peter Zijlstra Signed-off-by: Christian Brauner --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..c8aa749790b1 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -63,6 +63,7 @@ enum wb_reason { struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; + unsigned long progress_stamp; /* The jiffies when slow progress is detected */ }; #define __WB_COMPLETION_INIT(_waitq) \ -- cgit v1.2.3 From d6e6215907640801b1f407dc9e871b19ca5a3805 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Tue, 30 Sep 2025 15:18:29 +0800 Subject: writeback: Add logging for slow writeback (exceeds sysctl_hung_task_timeout_secs) When a writeback work lasts for sysctl_hung_task_timeout_secs, we want to identify that there are tasks waiting for a long time - this helps us pinpoint potential issues. Additionally, recording the starting jiffies is useful when debugging a crashed vmcore.
Signed-off-by: Julian Sun Signed-off-by: Christian Brauner --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c8aa749790b1..610ef62b6a32 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -64,6 +64,7 @@ struct wb_completion { atomic_t cnt; wait_queue_head_t *waitq; unsigned long progress_stamp; /* The jiffies when slow progress is detected */ + unsigned long wait_start; /* The jiffies when waiting for the writeback work to finish */ }; #define __WB_COMPLETION_INIT(_waitq) \ -- cgit v1.2.3 From a00f3dea0352a5fb0b67b84c72daeb6563f8e67f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 9 Oct 2025 21:25:56 +0200 Subject: ACPI: PM: s2idle: Drop acpi_get_lps0_constraint() Drop unused function acpi_get_lps0_constraint(). No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/5032801.GXAFRqVoOG@rafael.j.wysocki --- include/linux/acpi.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..252768d007c7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1146,12 +1146,7 @@ struct acpi_s2idle_dev_ops { #if defined(CONFIG_SUSPEND) && defined(CONFIG_X86) int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg); void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg); -int acpi_get_lps0_constraint(struct acpi_device *adev); #else /* CONFIG_SUSPEND && CONFIG_X86 */ -static inline int acpi_get_lps0_constraint(struct device *dev) -{ - return ACPI_STATE_UNKNOWN; -} static inline int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) { return -ENODEV; -- cgit v1.2.3 From 370157293175a702036203faec3e0495b081f135 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2025 20:59:17 -0700 Subject: nl802154: fix some 
kernel-doc warnings Correct multiple kernel-doc warnings in nl802154.h: - Fix a typo on one enum name to avoid a kernel-doc warning. - Drop 2 enum descriptions that are no longer needed. - Mark 2 internal enums as "private:" so that kernel-doc is not needed for them. Warning: nl802154.h:239 Enum value 'NL802154_CAP_ATTR_MAX_MAXBE' not described in enum 'nl802154_wpan_phy_capability_attr' Warning: nl802154.h:239 Excess enum value '%NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL' description in 'nl802154_wpan_phy_capability_attr' Warning: nl802154.h:239 Excess enum value '%NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL' description in 'nl802154_wpan_phy_capability_attr' Warning: nl802154.h:369 Enum value '__NL802154_CCA_OPT_ATTR_AFTER_LAST' not described in enum 'nl802154_cca_opts' Warning: nl802154.h:369 Enum value 'NL802154_CCA_OPT_ATTR_MAX' not described in enum 'nl802154_cca_opts' Signed-off-by: Randy Dunlap Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251016035917.1148012-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/nl802154.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/nl802154.h b/include/net/nl802154.h index a994dea74596..442822746e92 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -191,14 +191,12 @@ enum nl802154_iftype { * @NL802154_CAP_ATTR_CHANNELS: a nested attribute for nl802154_channel_attr * @NL802154_CAP_ATTR_TX_POWERS: a nested attribute for * nl802154_wpan_phy_tx_power - * @NL802154_CAP_ATTR_MIN_CCA_ED_LEVEL: minimum value for cca_ed_level - * @NL802154_CAP_ATTR_MAX_CCA_ED_LEVEL: maximum value for cca_ed_level * @NL802154_CAP_ATTR_CCA_MODES: nl802154_cca_modes flags * @NL802154_CAP_ATTR_CCA_OPTS: nl802154_cca_opts flags * @NL802154_CAP_ATTR_MIN_MINBE: minimum of minbe value * @NL802154_CAP_ATTR_MAX_MINBE: maximum of minbe value * @NL802154_CAP_ATTR_MIN_MAXBE: minimum of maxbe value - * @NL802154_CAP_ATTR_MAX_MINBE: maximum of maxbe value + * 
@NL802154_CAP_ATTR_MAX_MAXBE: maximum of maxbe value * @NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS: minimum of csma backoff value * @NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS: maximum of csma backoffs value * @NL802154_CAP_ATTR_MIN_FRAME_RETRIES: minimum of frame retries value @@ -364,6 +362,7 @@ enum nl802154_cca_opts { NL802154_CCA_OPT_ENERGY_CARRIER_AND, NL802154_CCA_OPT_ENERGY_CARRIER_OR, + /* private: */ /* keep last */ __NL802154_CCA_OPT_ATTR_AFTER_LAST, NL802154_CCA_OPT_ATTR_MAX = __NL802154_CCA_OPT_ATTR_AFTER_LAST - 1 -- cgit v1.2.3 From 813882ae22756bcf9645d405e045c60e5aab0a93 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 16 Oct 2025 15:36:46 +0100 Subject: net: stmmac: remove broken PCS code Changing the netif_carrier_*() state behind phylink's back has always been prohibited because it messes up with phylinks state tracking, and means that phylink no longer guarantees to call the mac_link_down() and mac_link_up() methods at the appropriate times. This was later documented in the sfp-phylink network driver conversion guide. stmmac was converted to phylink in 2019, but nothing was done with the "PCS" code. Since then, apart from the updates as part of phylink development, nothing has happened with stmmac to improve its use of phylink, or even to address this point. A couple of years ago, a has_integrated_pcs boolean was added by Bart, which later became the STMMAC_FLAG_HAS_INTEGRATED_PCS flag, to avoid manipulating the netif_carrier_*() state. This flag is mis-named, because whenever the stmmac is synthesized for its native SGMII, TBI or RTBI interfaces, it has an "integrated PCS". This boolean/flag actually means "ignore the status from the integrated PCS". Discussing with Bart, the reasons for this are lost to the winds of time (which is why we should always document the reasons in the commit message.) RGMII also has in-band status, and the dwmac cores and stmmac code supports this but with one bug that saves the day. 
When dwmac cores are synthesised for RGMII only, they do not contain an integrated PCS, and so priv->dma_cap.pcs is clear, which prevents (incorrectly) the "RGMII PCS" being used, meaning we don't read the in-band status. However, a core synthesised for RGMII and also SGMII, TBI or RTBI will have this capability bit set, thus making these code paths reachable. The Jetson Xavier NX uses RGMII mode to talk to its PHY, and removing the incorrect check for priv->dma_cap.pcs reveals the theoretical issue with netif_carrier_*() manipulation is real: dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141) dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode 8021q: adding VLAN 0 to HW filter on device eth0 dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported Link is Up - 1000/Full Link is Down Link is Up - 1000/Full This looks good until one realises that the phylink "Link" status messages are missing, even when the RJ45 cable is reconnected. Nothing one can do results in the interface working. The interrupt handler (which prints those "Link is" messages) always wins over phylink's resolve worker, meaning phylink never calls the mac_link_up() nor mac_link_down() methods.
eth0 also sees no traffic received, and is unable to obtain a DHCP address: 3: eth0: mtu 1500 qdisc mq state UP group default qlen 1000 link/ether e6:d3:6a:e6:92:de brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped overrun mcast 0 0 0 0 0 0 TX: bytes packets errors dropped carrier collsns 27686 149 0 0 0 0 With the STMMAC_FLAG_HAS_INTEGRATED_PCS flag set, which disables the netif_carrier_*() manipulation, stmmac works normally: dwc-eth-dwmac 2490000.ethernet eth0: Register MEM_TYPE_PAGE_POOL RxQ-0 dwc-eth-dwmac 2490000.ethernet eth0: PHY [stmmac-0:00] driver [RTL8211F Gigabit Ethernet] (irq=141) dwc-eth-dwmac 2490000.ethernet eth0: No Safety Features support found dwc-eth-dwmac 2490000.ethernet eth0: IEEE 1588-2008 Advanced Timestamp supported dwc-eth-dwmac 2490000.ethernet eth0: registered PTP clock dwc-eth-dwmac 2490000.ethernet eth0: configuring for phy/rgmii-id link mode 8021q: adding VLAN 0 to HW filter on device eth0 dwc-eth-dwmac 2490000.ethernet eth0: Adding VLAN ID 0 is not supported Link is Up - 1000/Full dwc-eth-dwmac 2490000.ethernet eth0: Link is Up - 1Gbps/Full - flow control rx/tx and packets can be transferred. This clearly shows that when priv->hw->pcs is set, but STMMAC_FLAG_HAS_INTEGRATED_PCS is clear, the driver reliably fails. Discovering whether a platform falls into this is impossible, as it would require parsing all the dtsi and dts files to find out which use the stmmac driver and whether any of them use RGMII or SGMII, and it also depends on whether an external interface is being used. The kernel likely doesn't contain all dts files either. The only driver that sets this flag uses the qcom,sa8775p-ethqos compatible, and uses SGMII or 2500BASE-X, but these are saved from this problem by the incorrect check for priv->dma_cap.pcs. So, we have to assume that every other platform that uses SGMII with stmmac is using an external PCS. Moreover, ethtool output can be incorrect.
With the full-duplex link negotiated, ethtool reports: Speed: 1000Mb/s Duplex: Half because with dwmac4, the full-duplex bit is in bit 16 of the status, priv->xstats.pcs_duplex becomes BIT(16) for full duplex, but the ethtool ksettings duplex member is u8 - so becomes zero. Moreover, the supported, advertised and link partner modes are all "not reported". Finally, ksettings_set() won't be able to set the advertisement on a PHY if this PCS code is activated, which is incorrect when SGMII is used with a PHY. Thus, remove: 1. the incorrect netif_carrier_*() manipulation. 2. the broken ethtool ksettings code. Given that all uses of STMMAC_FLAG_HAS_INTEGRATED_PCS are now gone, remove the flag from stmmac.h and dwmac-qcom-ethqos.c. Reviewed-by: Andrew Lunn Signed-off-by: Russell King (Oracle) Tested-by: Maxime Chevallier Tested-by: Lad Prabhakar Link: https://patch.msgid.link/E1v9P5y-0000000AolC-1QWH@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fa1318bac06c..99022620457a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -171,7 +171,6 @@ struct dwmac4_addrs { u32 mtl_low_cred_offset; }; -#define STMMAC_FLAG_HAS_INTEGRATED_PCS BIT(0) #define STMMAC_FLAG_SPH_DISABLE BIT(1) #define STMMAC_FLAG_USE_PHY_WOL BIT(2) #define STMMAC_FLAG_HAS_SUN8I BIT(3) -- cgit v1.2.3 From d19f6451c6feefd6537b97efa5f3859681f243cb Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 16 Oct 2025 11:09:26 +0200 Subject: gpio: export gpiod_hwgpio() Reading the GPIO hardware number from a descriptor is a valid use-case outside of the GPIO core. Export the symbol to consumers of GPIO descriptors. 
Reviewed-by: Linus Walleij Reviewed-by: Andrew Jeffery Link: https://lore.kernel.org/r/20251016-aspeed-gpiolib-include-v1-2-31201c06d124@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 00df68c51405..994d46874d56 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -171,6 +171,8 @@ int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name); struct gpio_desc *gpio_to_desc(unsigned gpio); int desc_to_gpio(const struct gpio_desc *desc); +int gpiod_hwgpio(const struct gpio_desc *desc); + struct gpio_desc *fwnode_gpiod_get_index(struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, -- cgit v1.2.3 From 44472d1b83127e579c798ff92a07ae86d98b61b9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 6 Oct 2025 13:07:32 +0200 Subject: atomic: Skip alignment check for try_cmpxchg() old arg The 'old' argument in atomic_try_cmpxchg() and related functions is a pointer to a normal non-atomic integer number, which does not require to be naturally aligned, unlike the atomic_t/atomic64_t types themselves. In order to add an alignment check with CONFIG_DEBUG_ATOMIC into the normal instrument_atomic_read_write() helper, change this check to use the non-atomic instrument_read_write(), the same way that was done earlier for try_cmpxchg() in commit ec570320b09f ("locking/atomic: Correct (cmp)xchg() instrumentation"). This prevents warnings on m68k calling the 32-bit atomic_try_cmpxchg() with 16-bit aligned arguments as well as several more architectures including x86-32 when calling atomic64_try_cmpxchg() with 32-bit aligned u64 arguments. 
Reported-by: Finn Thain Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/all/cover.1757810729.git.fthain@linux-m68k.org/ --- include/linux/atomic/atomic-instrumented.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/atomic/atomic-instrumented.h b/include/linux/atomic/atomic-instrumented.h index 9409a6ddf3e0..37ab6314a9f7 100644 --- a/include/linux/atomic/atomic-instrumented.h +++ b/include/linux/atomic/atomic-instrumented.h @@ -1276,7 +1276,7 @@ atomic_try_cmpxchg(atomic_t *v, int *old, int new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg(v, old, new); } @@ -1298,7 +1298,7 @@ static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_acquire(v, old, new); } @@ -1321,7 +1321,7 @@ atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_release(v, old, new); } @@ -1343,7 +1343,7 @@ static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_try_cmpxchg_relaxed(v, old, new); } @@ -2854,7 +2854,7 @@ atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return 
raw_atomic64_try_cmpxchg(v, old, new); } @@ -2876,7 +2876,7 @@ static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_acquire(v, old, new); } @@ -2899,7 +2899,7 @@ atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_release(v, old, new); } @@ -2921,7 +2921,7 @@ static __always_inline bool atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic64_try_cmpxchg_relaxed(v, old, new); } @@ -4432,7 +4432,7 @@ atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { kcsan_mb(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg(v, old, new); } @@ -4454,7 +4454,7 @@ static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_acquire(v, old, new); } @@ -4477,7 +4477,7 @@ atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { kcsan_release(); instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_release(v, old, new); } @@ -4499,7 +4499,7 @@ static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, 
long new) { instrument_atomic_read_write(v, sizeof(*v)); - instrument_atomic_read_write(old, sizeof(*old)); + instrument_read_write(old, sizeof(*old)); return raw_atomic_long_try_cmpxchg_relaxed(v, old, new); } @@ -5050,4 +5050,4 @@ atomic_long_dec_if_positive(atomic_long_t *v) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ -// 8829b337928e9508259079d32581775ececd415b +// f618ac667f868941a84ce0ab2242f1786e049ed4 -- cgit v1.2.3 From cc39f3872c0865bef992b713338df369554fa9e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Oct 2025 22:11:54 +0200 Subject: seqlock: Introduce scoped_seqlock_read() The read_seqbegin/need_seqretry/done_seqretry API is cumbersome and error prone. With the new helper the "typical" code like int seq, nextseq; unsigned long flags; nextseq = 0; do { seq = nextseq; flags = read_seqbegin_or_lock_irqsave(&seqlock, &seq); // read-side critical section nextseq = 1; } while (need_seqretry(&seqlock, seq)); done_seqretry_irqrestore(&seqlock, seq, flags); can be rewritten as scoped_seqlock_read (&seqlock, ss_lock_irqsave) { // read-side critical section } Original idea by Oleg Nesterov; with contributions from Linus. 
Originally-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) --- include/linux/seqlock.h | 111 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5ce48eab7a2a..b7bcc4111e90 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1209,4 +1209,115 @@ done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } + +enum ss_state { + ss_done = 0, + ss_lock, + ss_lock_irqsave, + ss_lockless, +}; + +struct ss_tmp { + enum ss_state state; + unsigned long data; + spinlock_t *lock; + spinlock_t *lock_irqsave; +}; + +static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) +{ + if (sst->lock) + spin_unlock(sst->lock); + if (sst->lock_irqsave) + spin_unlock_irqrestore(sst->lock_irqsave, sst->data); +} + +extern void __scoped_seqlock_invalid_target(void); + +#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000 +/* + * For some reason some GCC-8 architectures (nios2, alpha) have trouble + * determining that the ss_done state is impossible in __scoped_seqlock_next() + * below. + */ +static inline void __scoped_seqlock_bug(void) { } +#else +/* + * Canary for compiler optimization -- if the compiler doesn't realize this is + * an impossible state, it very likely generates sub-optimal code here. 
+ */ +extern void __scoped_seqlock_bug(void); +#endif + +static inline void +__scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) +{ + switch (sst->state) { + case ss_done: + __scoped_seqlock_bug(); + return; + + case ss_lock: + case ss_lock_irqsave: + sst->state = ss_done; + return; + + case ss_lockless: + if (!read_seqretry(lock, sst->data)) { + sst->state = ss_done; + return; + } + break; + } + + switch (target) { + case ss_done: + __scoped_seqlock_invalid_target(); + return; + + case ss_lock: + sst->lock = &lock->lock; + spin_lock(sst->lock); + sst->state = ss_lock; + return; + + case ss_lock_irqsave: + sst->lock_irqsave = &lock->lock; + spin_lock_irqsave(sst->lock_irqsave, sst->data); + sst->state = ss_lock_irqsave; + return; + + case ss_lockless: + sst->data = read_seqbegin(lock); + return; + } +} + +#define __scoped_seqlock_read(_seqlock, _target, _s) \ + for (struct ss_tmp _s __cleanup(__scoped_seqlock_cleanup) = \ + { .state = ss_lockless, .data = read_seqbegin(_seqlock) }; \ + _s.state != ss_done; \ + __scoped_seqlock_next(&_s, _seqlock, _target)) + +/** + * scoped_seqlock_read (lock, ss_state) - execute the read side critical + * section without manual sequence + * counter handling or calls to other + * helpers + * @lock: pointer to seqlock_t protecting the data + * @ss_state: one of {ss_lock, ss_lock_irqsave, ss_lockless} indicating + * the type of critical read section + * + * Example: + * + * scoped_seqlock_read (&lock, ss_lock) { + * // read-side critical section + * } + * + * Starts with a lockless pass first. If it fails, restarts the critical + * section with the lock held.
+ */ +#define scoped_seqlock_read(_seqlock, _target) \ + __scoped_seqlock_read(_seqlock, _target, __UNIQUE_ID(seqlock)) + #endif /* __LINUX_SEQLOCK_H */ -- cgit v1.2.3 From 0e85936a9d492acf6ff9519a5f630a7fedb62f7f Mon Sep 17 00:00:00 2001 From: Jishnu Prakash Date: Wed, 24 Sep 2025 16:17:07 -0700 Subject: dt-bindings: power: qcom,rpmpd: add new RPMH levels Add constants for voltage levels: LOW_SVS_D2_1, LOW_SVS_D1_1 and LOW_SVS_L0. Signed-off-by: Jishnu Prakash Signed-off-by: Jingyi Wang Signed-off-by: Ulf Hansson --- include/dt-bindings/power/qcom,rpmhpd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/power/qcom,rpmhpd.h b/include/dt-bindings/power/qcom,rpmhpd.h index 73cceb88953f..50e7c886709d 100644 --- a/include/dt-bindings/power/qcom,rpmhpd.h +++ b/include/dt-bindings/power/qcom,rpmhpd.h @@ -33,11 +33,14 @@ #define RPMH_REGULATOR_LEVEL_RETENTION 16 #define RPMH_REGULATOR_LEVEL_MIN_SVS 48 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D3 50 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D2_1 51 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D2 52 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_D1_1 54 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D1 56 #define RPMH_REGULATOR_LEVEL_LOW_SVS_D0 60 #define RPMH_REGULATOR_LEVEL_LOW_SVS 64 #define RPMH_REGULATOR_LEVEL_LOW_SVS_P1 72 +#define RPMH_REGULATOR_LEVEL_LOW_SVS_L0 76 #define RPMH_REGULATOR_LEVEL_LOW_SVS_L1 80 #define RPMH_REGULATOR_LEVEL_LOW_SVS_L2 96 #define RPMH_REGULATOR_LEVEL_SVS 128 -- cgit v1.2.3 From 3ff9bcecce83f12169ab3e42671bd76554ca521a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Oct 2025 13:37:12 +0000 Subject: net: avoid extra access to sk->sk_wmem_alloc in sock_wfree() UDP TX packets destructor is sock_wfree(). It suffers from a cache line bouncing in sock_def_write_space_wfree(). Instead of reading sk->sk_wmem_alloc after we just did an atomic RMW on it, use __refcount_sub_and_test() to get the old value for free, and pass the new value to sock_def_write_space_wfree(). 
Add __sock_writeable() helper. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251017133712.2842665-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sock.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 5c564f114ae9..01ce231603db 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2607,12 +2607,16 @@ static inline struct page_frag *sk_page_frag(struct sock *sk) bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag); +static inline bool __sock_writeable(const struct sock *sk, int wmem_alloc) +{ + return wmem_alloc < (READ_ONCE(sk->sk_sndbuf) >> 1); +} /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { - return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); + return __sock_writeable(sk, refcount_read(&sk->sk_wmem_alloc)); } static inline gfp_t gfp_any(void) -- cgit v1.2.3 From ebaec90ec0b5850ab80ca017e7b63183adcca131 Mon Sep 17 00:00:00 2001 From: Samuel Kayode Date: Wed, 1 Oct 2025 11:42:38 -0400 Subject: mfd: pf1550: Add core driver for the PF1550 PMIC There are 3 sub-devices for which the drivers will be added in subsequent patches. 
Signed-off-by: Samuel Kayode Reviewed-by: Frank Li Tested-by: Sean Nyekjaer Link: https://patch.msgid.link/20251001-pf1550-v12-2-a3302aa41687@savoirfairelinux.com Signed-off-by: Lee Jones --- include/linux/mfd/pf1550.h | 273 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 include/linux/mfd/pf1550.h (limited to 'include') diff --git a/include/linux/mfd/pf1550.h b/include/linux/mfd/pf1550.h new file mode 100644 index 000000000000..7cb2340ff2bd --- /dev/null +++ b/include/linux/mfd/pf1550.h @@ -0,0 +1,273 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Declarations for the PF1550 PMIC + * + * Copyright (C) 2016 Freescale Semiconductor, Inc. + * Robin Gong + * + * Portions Copyright (c) 2025 Savoir-faire Linux Inc. + * Samuel Kayode + */ + +#ifndef __LINUX_MFD_PF1550_H +#define __LINUX_MFD_PF1550_H + +#include +#include + +enum pf1550_pmic_reg { + /* PMIC regulator part */ + PF1550_PMIC_REG_DEVICE_ID = 0x00, + PF1550_PMIC_REG_OTP_FLAVOR = 0x01, + PF1550_PMIC_REG_SILICON_REV = 0x02, + + PF1550_PMIC_REG_INT_CATEGORY = 0x06, + PF1550_PMIC_REG_SW_INT_STAT0 = 0x08, + PF1550_PMIC_REG_SW_INT_MASK0 = 0x09, + PF1550_PMIC_REG_SW_INT_SENSE0 = 0x0a, + PF1550_PMIC_REG_SW_INT_STAT1 = 0x0b, + PF1550_PMIC_REG_SW_INT_MASK1 = 0x0c, + PF1550_PMIC_REG_SW_INT_SENSE1 = 0x0d, + PF1550_PMIC_REG_SW_INT_STAT2 = 0x0e, + PF1550_PMIC_REG_SW_INT_MASK2 = 0x0f, + PF1550_PMIC_REG_SW_INT_SENSE2 = 0x10, + PF1550_PMIC_REG_LDO_INT_STAT0 = 0x18, + PF1550_PMIC_REG_LDO_INT_MASK0 = 0x19, + PF1550_PMIC_REG_LDO_INT_SENSE0 = 0x1a, + PF1550_PMIC_REG_TEMP_INT_STAT0 = 0x20, + PF1550_PMIC_REG_TEMP_INT_MASK0 = 0x21, + PF1550_PMIC_REG_TEMP_INT_SENSE0 = 0x22, + PF1550_PMIC_REG_ONKEY_INT_STAT0 = 0x24, + PF1550_PMIC_REG_ONKEY_INT_MASK0 = 0x25, + PF1550_PMIC_REG_ONKEY_INT_SENSE0 = 0x26, + PF1550_PMIC_REG_MISC_INT_STAT0 = 0x28, + PF1550_PMIC_REG_MISC_INT_MASK0 = 0x29, + PF1550_PMIC_REG_MISC_INT_SENSE0 = 0x2a, + + PF1550_PMIC_REG_COINCELL_CONTROL = 0x30, + + 
PF1550_PMIC_REG_SW1_VOLT = 0x32, + PF1550_PMIC_REG_SW1_STBY_VOLT = 0x33, + PF1550_PMIC_REG_SW1_SLP_VOLT = 0x34, + PF1550_PMIC_REG_SW1_CTRL = 0x35, + PF1550_PMIC_REG_SW1_CTRL1 = 0x36, + PF1550_PMIC_REG_SW2_VOLT = 0x38, + PF1550_PMIC_REG_SW2_STBY_VOLT = 0x39, + PF1550_PMIC_REG_SW2_SLP_VOLT = 0x3a, + PF1550_PMIC_REG_SW2_CTRL = 0x3b, + PF1550_PMIC_REG_SW2_CTRL1 = 0x3c, + PF1550_PMIC_REG_SW3_VOLT = 0x3e, + PF1550_PMIC_REG_SW3_STBY_VOLT = 0x3f, + PF1550_PMIC_REG_SW3_SLP_VOLT = 0x40, + PF1550_PMIC_REG_SW3_CTRL = 0x41, + PF1550_PMIC_REG_SW3_CTRL1 = 0x42, + PF1550_PMIC_REG_VSNVS_CTRL = 0x48, + PF1550_PMIC_REG_VREFDDR_CTRL = 0x4a, + PF1550_PMIC_REG_LDO1_VOLT = 0x4c, + PF1550_PMIC_REG_LDO1_CTRL = 0x4d, + PF1550_PMIC_REG_LDO2_VOLT = 0x4f, + PF1550_PMIC_REG_LDO2_CTRL = 0x50, + PF1550_PMIC_REG_LDO3_VOLT = 0x52, + PF1550_PMIC_REG_LDO3_CTRL = 0x53, + PF1550_PMIC_REG_PWRCTRL0 = 0x58, + PF1550_PMIC_REG_PWRCTRL1 = 0x59, + PF1550_PMIC_REG_PWRCTRL2 = 0x5a, + PF1550_PMIC_REG_PWRCTRL3 = 0x5b, + PF1550_PMIC_REG_SW1_PWRDN_SEQ = 0x5f, + PF1550_PMIC_REG_SW2_PWRDN_SEQ = 0x60, + PF1550_PMIC_REG_SW3_PWRDN_SEQ = 0x61, + PF1550_PMIC_REG_LDO1_PWRDN_SEQ = 0x62, + PF1550_PMIC_REG_LDO2_PWRDN_SEQ = 0x63, + PF1550_PMIC_REG_LDO3_PWRDN_SEQ = 0x64, + PF1550_PMIC_REG_VREFDDR_PWRDN_SEQ = 0x65, + + PF1550_PMIC_REG_STATE_INFO = 0x67, + PF1550_PMIC_REG_I2C_ADDR = 0x68, + PF1550_PMIC_REG_IO_DRV0 = 0x69, + PF1550_PMIC_REG_IO_DRV1 = 0x6a, + PF1550_PMIC_REG_RC_16MHZ = 0x6b, + PF1550_PMIC_REG_KEY = 0x6f, + + /* Charger part */ + PF1550_CHARG_REG_CHG_INT = 0x80, + PF1550_CHARG_REG_CHG_INT_MASK = 0x82, + PF1550_CHARG_REG_CHG_INT_OK = 0x84, + PF1550_CHARG_REG_VBUS_SNS = 0x86, + PF1550_CHARG_REG_CHG_SNS = 0x87, + PF1550_CHARG_REG_BATT_SNS = 0x88, + PF1550_CHARG_REG_CHG_OPER = 0x89, + PF1550_CHARG_REG_CHG_TMR = 0x8a, + PF1550_CHARG_REG_CHG_EOC_CNFG = 0x8d, + PF1550_CHARG_REG_CHG_CURR_CNFG = 0x8e, + PF1550_CHARG_REG_BATT_REG = 0x8f, + PF1550_CHARG_REG_BATFET_CNFG = 0x91, + PF1550_CHARG_REG_THM_REG_CNFG = 0x92, + 
PF1550_CHARG_REG_VBUS_INLIM_CNFG = 0x94, + PF1550_CHARG_REG_VBUS_LIN_DPM = 0x95, + PF1550_CHARG_REG_USB_PHY_LDO_CNFG = 0x96, + PF1550_CHARG_REG_DBNC_DELAY_TIME = 0x98, + PF1550_CHARG_REG_CHG_INT_CNFG = 0x99, + PF1550_CHARG_REG_THM_ADJ_SETTING = 0x9a, + PF1550_CHARG_REG_VBUS2SYS_CNFG = 0x9b, + PF1550_CHARG_REG_LED_PWM = 0x9c, + PF1550_CHARG_REG_FAULT_BATFET_CNFG = 0x9d, + PF1550_CHARG_REG_LED_CNFG = 0x9e, + PF1550_CHARG_REG_CHGR_KEY2 = 0x9f, + + PF1550_TEST_REG_FMRADDR = 0xc4, + PF1550_TEST_REG_FMRDATA = 0xc5, + PF1550_TEST_REG_KEY3 = 0xdf, + + PF1550_PMIC_REG_END = 0xff, +}; + +/* One-Time Programmable(OTP) memory */ +enum pf1550_otp_reg { + PF1550_OTP_SW1_SW2 = 0x1e, + PF1550_OTP_SW2_SW3 = 0x1f, +}; + +#define PF1550_DEVICE_ID 0x7c + +/* Keys for reading OTP */ +#define PF1550_OTP_PMIC_KEY 0x15 +#define PF1550_OTP_CHGR_KEY 0x50 +#define PF1550_OTP_TEST_KEY 0xab + +/* Supported charger modes */ +#define PF1550_CHG_BAT_OFF 1 +#define PF1550_CHG_BAT_ON 2 + +#define PF1550_CHG_PRECHARGE 0 +#define PF1550_CHG_CONSTANT_CURRENT 1 +#define PF1550_CHG_CONSTANT_VOL 2 +#define PF1550_CHG_EOC 3 +#define PF1550_CHG_DONE 4 +#define PF1550_CHG_TIMER_FAULT 6 +#define PF1550_CHG_SUSPEND 7 +#define PF1550_CHG_OFF_INV 8 +#define PF1550_CHG_BAT_OVER 9 +#define PF1550_CHG_OFF_TEMP 10 +#define PF1550_CHG_LINEAR_ONLY 12 +#define PF1550_CHG_SNS_MASK 0xf +#define PF1550_CHG_INT_MASK 0x51 + +#define PF1550_BAT_NO_VBUS 0 +#define PF1550_BAT_LOW_THAN_PRECHARG 1 +#define PF1550_BAT_CHARG_FAIL 2 +#define PF1550_BAT_HIGH_THAN_PRECHARG 4 +#define PF1550_BAT_OVER_VOL 5 +#define PF1550_BAT_NO_DETECT 6 +#define PF1550_BAT_SNS_MASK 0x7 + +#define PF1550_VBUS_UVLO BIT(2) +#define PF1550_VBUS_IN2SYS BIT(3) +#define PF1550_VBUS_OVLO BIT(4) +#define PF1550_VBUS_VALID BIT(5) + +#define PF1550_CHARG_REG_BATT_REG_CHGCV_MASK 0x3f +#define PF1550_CHARG_REG_BATT_REG_VMINSYS_SHIFT 6 +#define PF1550_CHARG_REG_BATT_REG_VMINSYS_MASK GENMASK(7, 6) +#define PF1550_CHARG_REG_THM_REG_CNFG_REGTEMP_SHIFT 2 +#define 
PF1550_CHARG_REG_THM_REG_CNFG_REGTEMP_MASK GENMASK(3, 2) + +#define PF1550_ONKEY_RST_EN BIT(7) + +/* DVS enable masks */ +#define OTP_SW1_DVS_ENB BIT(1) +#define OTP_SW2_DVS_ENB BIT(3) + +/* Top level interrupt masks */ +#define IRQ_REGULATOR (BIT(1) | BIT(2) | BIT(3) | BIT(4) | BIT(6)) +#define IRQ_ONKEY BIT(5) +#define IRQ_CHG BIT(0) + +/* Regulator interrupt masks */ +#define PMIC_IRQ_SW1_LS BIT(0) +#define PMIC_IRQ_SW2_LS BIT(1) +#define PMIC_IRQ_SW3_LS BIT(2) +#define PMIC_IRQ_SW1_HS BIT(0) +#define PMIC_IRQ_SW2_HS BIT(1) +#define PMIC_IRQ_SW3_HS BIT(2) +#define PMIC_IRQ_LDO1_FAULT BIT(0) +#define PMIC_IRQ_LDO2_FAULT BIT(1) +#define PMIC_IRQ_LDO3_FAULT BIT(2) +#define PMIC_IRQ_TEMP_110 BIT(0) +#define PMIC_IRQ_TEMP_125 BIT(1) + +/* Onkey interrupt masks */ +#define ONKEY_IRQ_PUSHI BIT(0) +#define ONKEY_IRQ_1SI BIT(1) +#define ONKEY_IRQ_2SI BIT(2) +#define ONKEY_IRQ_3SI BIT(3) +#define ONKEY_IRQ_4SI BIT(4) +#define ONKEY_IRQ_8SI BIT(5) + +/* Charger interrupt masks */ +#define CHARG_IRQ_BAT2SOCI BIT(1) +#define CHARG_IRQ_BATI BIT(2) +#define CHARG_IRQ_CHGI BIT(3) +#define CHARG_IRQ_VBUSI BIT(5) +#define CHARG_IRQ_DPMI BIT(6) +#define CHARG_IRQ_THMI BIT(7) + +enum pf1550_irq { + PF1550_IRQ_CHG, + PF1550_IRQ_REGULATOR, + PF1550_IRQ_ONKEY, +}; + +enum pf1550_pmic_irq { + PF1550_PMIC_IRQ_SW1_LS, + PF1550_PMIC_IRQ_SW2_LS, + PF1550_PMIC_IRQ_SW3_LS, + PF1550_PMIC_IRQ_SW1_HS, + PF1550_PMIC_IRQ_SW2_HS, + PF1550_PMIC_IRQ_SW3_HS, + PF1550_PMIC_IRQ_LDO1_FAULT, + PF1550_PMIC_IRQ_LDO2_FAULT, + PF1550_PMIC_IRQ_LDO3_FAULT, + PF1550_PMIC_IRQ_TEMP_110, + PF1550_PMIC_IRQ_TEMP_125, +}; + +enum pf1550_onkey_irq { + PF1550_ONKEY_IRQ_PUSHI, + PF1550_ONKEY_IRQ_1SI, + PF1550_ONKEY_IRQ_2SI, + PF1550_ONKEY_IRQ_3SI, + PF1550_ONKEY_IRQ_4SI, + PF1550_ONKEY_IRQ_8SI, +}; + +enum pf1550_charg_irq { + PF1550_CHARG_IRQ_BAT2SOCI, + PF1550_CHARG_IRQ_BATI, + PF1550_CHARG_IRQ_CHGI, + PF1550_CHARG_IRQ_VBUSI, + PF1550_CHARG_IRQ_THMI, +}; + +enum pf1550_regulators { + PF1550_SW1, + PF1550_SW2, + 
PF1550_SW3, + PF1550_VREFDDR, + PF1550_LDO1, + PF1550_LDO2, + PF1550_LDO3, +}; + +struct pf1550_ddata { + struct regmap_irq_chip_data *irq_data_regulator; + struct regmap_irq_chip_data *irq_data_charger; + struct regmap_irq_chip_data *irq_data_onkey; + struct regmap_irq_chip_data *irq_data; + struct regmap *regmap; + struct device *dev; + bool dvs1_enable; + bool dvs2_enable; + int irq; +}; + +#endif /* __LINUX_MFD_PF1550_H */ -- cgit v1.2.3 From f7d72d0b3f438b881dba16c7c00493f16e41a821 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 19 Oct 2025 20:21:30 +0000 Subject: bpf: save the start of functions in bpf_prog_aux Introduce a new subprog_start field in bpf_prog_aux. This field may be used by JIT compilers wanting to know the real absolute xlated offset of the function being jitted. The func_info[func_id] may have served this purpose, but func_info may be NULL, so JIT compilers can't rely on it. Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251019202145.3944697-3-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 204f9c759a41..3bda915cd7a8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1623,6 +1623,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + u32 subprog_start; struct btf *attach_btf; struct bpf_ctx_arg_aux *ctx_arg_info; void __percpu *priv_stack_ptr; -- cgit v1.2.3 From 44481e4925327d833f2e37c8741406e4cabfe054 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 19 Oct 2025 20:21:31 +0000 Subject: bpf: generalize and export map_get_next_key for arrays The kernel/bpf/array.c file defines the array_map_get_next_key() function which finds the next key for array maps. It actually doesn't use any map fields besides the generic max_entries field. 
Generalize it, and export as bpf_array_get_next_key() such that it can be re-used by other array-like maps. Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251019202145.3944697-4-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3bda915cd7a8..e53cda0aabb6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2107,6 +2107,12 @@ struct bpf_array { }; }; +/* + * The bpf_array_get_next_key() function may be used for all array-like + * maps, i.e., maps with u32 keys with range [0 ,..., max_entries) + */ +int bpf_array_get_next_key(struct bpf_map *map, void *key, void *next_key); + #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 -- cgit v1.2.3 From 2f69c5685427308d2f312646779313f3677536bc Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Sun, 19 Oct 2025 20:21:37 +0000 Subject: bpf: make bpf_insn_successors to return a pointer The bpf_insn_successors() function is used to return successors to a BPF instruction. So far, an instruction could have 0, 1 or 2 successors. Prepare the verifier code to introduction of instructions with more than 2 successors (namely, indirect jumps). To do this, introduce a new struct, struct bpf_iarray, containing an array of bpf instruction indexes and make bpf_insn_successors to return a pointer of that type. The storage for all instructions is allocated in the env->succ, which holds an array of size 2, to be used for all instructions. 
Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251019202145.3944697-10-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b57222a25a4a..c6eb68b6389c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -509,6 +509,15 @@ struct bpf_map_ptr_state { #define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ BPF_ALU_SANITIZE_DST) +/* + * An array of BPF instructions. + * Primary usage: return value of bpf_insn_successors. + */ +struct bpf_iarray { + int cnt; + u32 items[]; +}; + struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ @@ -828,6 +837,7 @@ struct bpf_verifier_env { /* array of pointers to bpf_scc_info indexed by SCC id */ struct bpf_scc_info **scc_info; u32 scc_cnt; + struct bpf_iarray *succ; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) @@ -1050,7 +1060,7 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_st struct bpf_subprog_info *bpf_find_containing_subprog(struct bpf_verifier_env *env, int off); int bpf_jmp_offset(struct bpf_insn *insn); -int bpf_insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]); +struct bpf_iarray *bpf_insn_successors(struct bpf_verifier_env *env, u32 idx); void bpf_fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask); bool bpf_calls_callback(struct bpf_verifier_env *env, int insn_idx); -- cgit v1.2.3 From 28098defc79fe7d29e6bfe4eb6312991f6bdc3d3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 17 Oct 2025 03:41:52 +0000 Subject: net: add a common function to compute features for upper devices Some high level software drivers need to compute features from lower devices. 
But each has its own implementation and may miss some of the feature computation. Let's use one common function to compute features for these kinds of devices. The new helper uses the current bond implementation as the reference one, as the latter already handles all the relevant aspects: netdev features, TSO limits and dst retention. Suggested-by: Paolo Abeni Signed-off-by: Hangbin Liu Reviewed-by: Sabrina Dubroca Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20251017034155.61990-2-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/netdev_features.h | 18 ++++++++++++++++++ include/linux/netdevice.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 7a01c518e573..93e4da7046a1 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -255,6 +255,24 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) NETIF_F_GSO_UDP_TUNNEL | \ NETIF_F_GSO_UDP_TUNNEL_CSUM) +/* virtual device features */ +#define MASTER_UPPER_DEV_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ + NETIF_F_GSO_ENCAP_ALL | \ + NETIF_F_HIGHDMA | NETIF_F_LRO) + +#define MASTER_UPPER_DEV_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \ + NETIF_F_GSO_PARTIAL) + +#define MASTER_UPPER_DEV_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ + NETIF_F_GSO_SOFTWARE) + +#define MASTER_UPPER_DEV_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ + NETIF_F_GSO_ESP) + +#define MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP) + static inline netdev_features_t netdev_base_features(netdev_features_t features) { features &= ~NETIF_F_ONE_FOR_ALL; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a687444b27..7f5aad5cc9a1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5304,6 +5304,7 @@ static inline
netdev_features_t netdev_add_tso_features(netdev_features_t featur int __netdev_update_features(struct net_device *dev); void netdev_update_features(struct net_device *dev); void netdev_change_features(struct net_device *dev); +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header); void netif_stacked_transfer_operstate(const struct net_device *rootdev, struct net_device *dev); -- cgit v1.2.3 From ce085ecdba23a5d5462877d884ecff3ffceaad22 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Oct 2025 15:04:25 -0700 Subject: scsi: core: Do not declare scsi_cmnd pointers const This change allows removing multiple casts and hence improves type checking by the compiler. Cc: Hannes Reinecke Suggested-by: John Garry Signed-off-by: Bart Van Assche Reviewed-by: John Garry Link: https://patch.msgid.link/20251014220426.3690007-1-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/scsi/scsi_dbg.h | 4 ++-- include/scsi/scsi_device.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/scsi/scsi_dbg.h b/include/scsi/scsi_dbg.h index bd29cdb513a5..efcdc78530d5 100644 --- a/include/scsi/scsi_dbg.h +++ b/include/scsi/scsi_dbg.h @@ -11,11 +11,11 @@ extern size_t __scsi_format_command(char *, size_t, const unsigned char *, size_t); extern void scsi_print_sense_hdr(const struct scsi_device *, const char *, const struct scsi_sense_hdr *); -extern void scsi_print_sense(const struct scsi_cmnd *); +extern void scsi_print_sense(struct scsi_cmnd *); extern void __scsi_print_sense(const struct scsi_device *, const char *name, const unsigned char *sense_buffer, int sense_len); -extern void scsi_print_result(const struct scsi_cmnd *, const char *, int); +extern void scsi_print_result(struct scsi_cmnd *, const char *, int); #ifdef CONFIG_SCSI_CONSTANTS extern bool scsi_opcode_sa_name(int, int, const char **, const char **); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h 
index 6d6500148c4b..4c106342c4ae 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -313,8 +313,8 @@ sdev_prefix_printk(const char *, const struct scsi_device *, const char *, #define sdev_printk(l, sdev, fmt, a...) \ sdev_prefix_printk(l, sdev, NULL, fmt, ##a) -__printf(3, 4) void -scmd_printk(const char *, const struct scsi_cmnd *, const char *, ...); +__printf(3, 4) void scmd_printk(const char *, struct scsi_cmnd *, const char *, + ...); #define scmd_dbg(scmd, fmt, a...) \ do { \ -- cgit v1.2.3 From bfe0d22f12559f44bf27ae88b9c4a9f8fdae65d0 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Thu, 16 Oct 2025 10:32:31 +0800 Subject: scsi: ufs: core: Update CQ Entry to UFS 4.1 format Update the completion queue (CQ) entry format according to the UFS 4.1 specification. UFS 4.1 introduces new members in reserved record DW5. Also refine DW4 with detailed members defined in UFS 4.0. Modify the code to incorporate these changes by updating the overall_status in the CQ entry structure. Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20251016023507.1000664-2-peter.wang@mediatek.com Signed-off-by: Martin K. 
Petersen --- include/ufs/ufshci.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h index e64b70132101..bfc5401a9a0a 100644 --- a/include/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -569,10 +569,26 @@ struct cq_entry { __le16 prd_table_offset; /* DW 4 */ - __le32 status; + u8 overall_status; + u8 extended_error_code; + __le16 reserved_1; - /* DW 5-7 */ - __le32 reserved[3]; + /* DW 5 */ + u8 task_tag; + u8 lun; +#if defined(__BIG_ENDIAN) + u8 ext_iid:4; + u8 iid:4; +#elif defined(__LITTLE_ENDIAN) + u8 iid:4; + u8 ext_iid:4; +#else +#error +#endif + u8 reserved_2; + + /* DW 6-7 */ + __le32 reserved_3[2]; }; static_assert(sizeof(struct cq_entry) == 32); -- cgit v1.2.3 From f8e82ae65eaf347fb8924a1d9c544da7bcb9f798 Mon Sep 17 00:00:00 2001 From: "Bao D. Nguyen" Date: Mon, 13 Oct 2025 12:38:15 -0700 Subject: scsi: ufs: core: Remove UFS_DEVICE_QUIRK_DELAY_AFTER_LPM quirk After the UFS device VCC is turned off, all the UFS device manufacturers require a period of power-off time before the VCC can be turned on again. This requirement has been confirmed with all the UFS device manufacturer's datasheets. Remove the UFS_DEVICE_QUIRK_DELAY_AFTER_LPM quirk in the UFS core driver and implement a universal delay that is required by all the UFS device manufacturers. In addition, remove the support for this quirk in the platform drivers. Signed-off-by: Bao D. Nguyen Reviewed-by: Peter Wang Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/25f134d5a42e8b8365be64d512d1bb5fc2bce6ff.1760383740.git.quic_nguyenb@quicinc.com Signed-off-by: Martin K. 
Petersen --- include/ufs/ufs_quirks.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/ufs/ufs_quirks.h b/include/ufs/ufs_quirks.h index 83563247c36c..e9c59ec1ceae 100644 --- a/include/ufs/ufs_quirks.h +++ b/include/ufs/ufs_quirks.h @@ -100,13 +100,6 @@ struct ufs_dev_quirk { */ #define UFS_DEVICE_QUIRK_SUPPORT_EXTENDED_FEATURES (1 << 10) -/* - * Some UFS devices require delay after VCC power rail is turned-off. - * Enable this quirk to introduce 5ms delays after VCC power-off during - * suspend flow. - */ -#define UFS_DEVICE_QUIRK_DELAY_AFTER_LPM (1 << 11) - /* * Some ufs devices may need more time to be in hibern8 before exiting. * Enable this quirk to give it an additional 100us. -- cgit v1.2.3 From 4760b639b43c107c8bfccd658478bbb3152fa56f Mon Sep 17 00:00:00 2001 From: "Bao D. Nguyen" Date: Mon, 13 Oct 2025 12:38:16 -0700 Subject: scsi: ufs: core: Replace hard coded vcc-off delay with a variable After the UFS device VCC is powered off, all the UFS device manufacturers require a minimum of 1ms of power-off time before VCC can be powered on again. This requirement has been verified with all the UFS device manufacturer's datasheets. Replace the hard coded 5ms delay with a variable with a default setting of 2ms to improve the system resume latency. The platform drivers can override this setting as needed. Signed-off-by: Bao D. Nguyen Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Link: https://patch.msgid.link/72fa649406a0bf02271575b7d58f22c968aa5d7e.1760383740.git.quic_nguyenb@quicinc.com Signed-off-by: Martin K. 
Petersen --- include/ufs/ufshcd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 9425cfd9d00e..752640a3e25f 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1117,6 +1117,8 @@ struct ufs_hba { int critical_health_count; atomic_t dev_lvl_exception_count; u64 dev_lvl_exception_id; + + u32 vcc_off_delay_us; }; /** -- cgit v1.2.3 From ca4709843b7e72f96976cd6b35bca148a4071673 Mon Sep 17 00:00:00 2001 From: David Yang Date: Fri, 17 Oct 2025 14:08:54 +0800 Subject: net: dsa: tag_yt921x: add support for Motorcomm YT921x tags Add support for Motorcomm YT921x tags, which includes a proper configurable ethertype field (default to 0x9988). Signed-off-by: David Yang Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20251017060859.326450-3-mmyangfl@gmail.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 ++ include/uapi/linux/if_ether.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index d73ea0880066..67762fdaf3c7 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -55,6 +55,7 @@ struct tc_action; #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 #define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 +#define DSA_TAG_PROTO_YT921X_VALUE 30 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -87,6 +88,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_RZN1_A5PSW = DSA_TAG_PROTO_RZN1_A5PSW_VALUE, DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, + DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE, }; struct dsa_switch; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 69e0457eb200..cfd200c87e5e 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -114,6 +114,7 @@ #define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY 
REGISTERED ID ] */ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_YT921X 0x9988 /* Motorcomm YT921x DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_DSA_8021Q 0xDADB /* Fake VLAN Header for DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_DSA_A5PSW 0xE001 /* A5PSW Tag Value [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3 From 7162536410768ec6b219524c36d3a871ff97adf8 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Wed, 24 Sep 2025 17:43:23 +0800 Subject: scsi: ufs: host: mediatek: Correct clock scaling with PM QoS flow Correct clock scaling with PM QoS during suspend and resume. Ensure PM QoS is released during suspend if scaling up and re-applied after resume. This prevents performance issues and maintains proper power management. Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Acked-by: Chun-Hung Wu Link: https://patch.msgid.link/20250924094527.2992256-2-peter.wang@mediatek.com Signed-off-by: Martin K. 
Petersen --- include/ufs/ufshcd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 9425cfd9d00e..ce7301d63c5c 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1487,5 +1487,6 @@ int ufshcd_write_ee_control(struct ufs_hba *hba); int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, const u16 *other_mask, u16 set, u16 clr); void ufshcd_force_error_recovery(struct ufs_hba *hba); +void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on); #endif /* End of Header */ -- cgit v1.2.3 From 1fd05367d5b1a5edd3d14c966a5f510e5b8a0c5e Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Wed, 24 Sep 2025 17:43:26 +0800 Subject: scsi: ufs: host: mediatek: Adjust sync length for FASTAUTO mode Set the sync length for FASTAUTO G1 mode in the UFS Mediatek driver. This ensures the sync length meets minimum values for high-speed gears, improving stability during power mode changes. Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Acked-by: Chun-Hung Wu Link: https://patch.msgid.link/20250924094527.2992256-5-peter.wang@mediatek.com Signed-off-by: Martin K. 
Petersen --- include/ufs/unipro.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h index 360e1245fb40..498ec9028b3c 100644 --- a/include/ufs/unipro.h +++ b/include/ufs/unipro.h @@ -111,6 +111,9 @@ #define PA_TXLINKSTARTUPHS 0x1544 #define PA_AVAILRXDATALANES 0x1540 #define PA_MINRXTRAILINGCLOCKS 0x1543 +#define PA_TXHSG1SYNCLENGTH 0x1552 +#define PA_TXHSG2SYNCLENGTH 0x1554 +#define PA_TXHSG3SYNCLENGTH 0x1556 #define PA_LOCAL_TX_LCC_ENABLE 0x155E #define PA_ACTIVETXDATALANES 0x1560 #define PA_CONNECTEDTXDATALANES 0x1561 @@ -160,7 +163,9 @@ #define PA_PACPFRAMECOUNT 0x15C0 #define PA_PACPERRORCOUNT 0x15C1 #define PA_PHYTESTCONTROL 0x15C2 -#define PA_TXHSADAPTTYPE 0x15D4 +#define PA_TXHSG4SYNCLENGTH 0x15D0 +#define PA_TXHSADAPTTYPE 0x15D4 +#define PA_TXHSG5SYNCLENGTH 0x15D6 /* Adpat type for PA_TXHSADAPTTYPE attribute */ #define PA_REFRESH_ADAPT 0x00 -- cgit v1.2.3 From 9b2b03b36168bcda298546b121d6ecc530d01d25 Mon Sep 17 00:00:00 2001 From: Peter Wang Date: Wed, 24 Sep 2025 17:43:28 +0800 Subject: scsi: ufs: host: mediatek: Remove duplicate function Remove the duplicate ufs_mtk_us_to_ahit() function in the UFS Mediatek driver and export the existing ufshcd_us_to_ahit() function for shared use. This change reduces redundancy and maintains consistency across the codebase. Signed-off-by: Peter Wang Reviewed-by: Bart Van Assche Acked-by: Chun-Hung Wu Link: https://patch.msgid.link/20250924094527.2992256-7-peter.wang@mediatek.com Signed-off-by: Martin K. 
Petersen --- include/ufs/ufshcd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index ce7301d63c5c..4b3a8daf8e0b 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1488,5 +1488,6 @@ int ufshcd_update_ee_control(struct ufs_hba *hba, u16 *mask, const u16 *other_mask, u16 set, u16 clr); void ufshcd_force_error_recovery(struct ufs_hba *hba); void ufshcd_pm_qos_update(struct ufs_hba *hba, bool on); +u32 ufshcd_us_to_ahit(unsigned int timer); #endif /* End of Header */ -- cgit v1.2.3 From 6837c006d4e72d6add451411bcf407e0dea4ad25 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Tue, 21 Oct 2025 15:24:47 +0000 Subject: firmware: exynos-acpm: add empty method to allow compile test Provide empty method for devm_acpm_get_by_node() if we aren't building in the CONFIG_EXYNOS_ACPM_PROTOCOL. This allows to test-build the CONFIG_EXYNOS_ACPM_CLK code. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202510211905.RgfWkgss-lkp@intel.com/ Fixes: 40498a742053 ("clk: samsung: add Exynos ACPM clock driver") Signed-off-by: Tudor Ambarus Link: https://patch.msgid.link/20251021-fix-acpm-clk-build-test-v1-1-236a3d6db7f5@linaro.org Signed-off-by: Krzysztof Kozlowski --- include/linux/firmware/samsung/exynos-acpm-protocol.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/samsung/exynos-acpm-protocol.h b/include/linux/firmware/samsung/exynos-acpm-protocol.h index b1e95435240f..2091da965a5a 100644 --- a/include/linux/firmware/samsung/exynos-acpm-protocol.h +++ b/include/linux/firmware/samsung/exynos-acpm-protocol.h @@ -55,7 +55,16 @@ struct acpm_handle { struct device; +#if IS_ENABLED(CONFIG_EXYNOS_ACPM_PROTOCOL) const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, struct device_node *np); +#else + +static inline const struct acpm_handle *devm_acpm_get_by_node(struct device *dev, + struct device_node *np) +{ + 
return NULL; +} +#endif #endif /* __EXYNOS_ACPM_PROTOCOL_H */ -- cgit v1.2.3 From fdd00d79dc0e8a3f90be65d5060c55bb115c0f43 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 15 Oct 2025 20:35:43 -0700 Subject: ipack: fix ipack.h kernel-doc warnings Fix various kernel-doc warnings in ipack.h: - Remove an empty kernel-doc comment. - Add 2 missing struct short descriptions. - Fix a typo in a description. - Add a missing struct field description. - Add some missing Return descriptions. - Clarify one function short description. Warning: ../include/linux/ipack.h:73 Cannot find identifier on line: */ Warning: ../include/linux/ipack.h:74 Cannot find identifier on line: struct ipack_region { Warning: ../include/linux/ipack.h:75 Cannot find identifier on line: phys_addr_t start; Warning: ../include/linux/ipack.h:76 Cannot find identifier on line: size_t size; Warning: ../include/linux/ipack.h:77 Cannot find identifier on line: }; Warning: ../include/linux/ipack.h:78 Cannot find identifier on line: Warning: ../include/linux/ipack.h:79 Cannot find identifier on line: /** Warning: ipack.h:80 missing initial short description on line: * struct ipack_device Warning: ipack.h:163 missing initial short description on line: * struct ipack_bus_device Warning: ipack.h:130 struct member 'id_table' not described in 'ipack_driver' Warning: ipack.h:189 No description found for return value of 'ipack_bus_register' Warning: ipack.h:194 No description found for return value of 'ipack_bus_unregister' *** Warning: ipack.h:202 No description found for return value of 'ipack_driver_register' Warning: ipack.h:221 No description found for return value of 'ipack_device_init' Warning: ipack.h:236 No description found for return value of 'ipack_device_add' Warning: ipack.h:271 No description found for return value of 'ipack_get_carrier' Signed-off-by: Randy Dunlap Acked-by: Vaibhav Gupta Link: https://patch.msgid.link/20251016033543.1142049-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman 
--- include/linux/ipack.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ipack.h b/include/linux/ipack.h index 2c6936b8371f..455f6c2a1903 100644 --- a/include/linux/ipack.h +++ b/include/linux/ipack.h @@ -70,15 +70,13 @@ enum ipack_space { IPACK_SPACE_COUNT, }; -/** - */ struct ipack_region { phys_addr_t start; size_t size; }; /** - * struct ipack_device + * struct ipack_device - subsystem representation of an IPack device * * @slot: Slot where the device is plugged in the carrier board * @bus: ipack_bus_device where the device is plugged to. @@ -89,7 +87,7 @@ struct ipack_region { * * Warning: Direct access to mapped memory is possible but the endianness * is not the same with PCI carrier or VME carrier. The endianness is managed - * by the carrier board throught bus->ops. + * by the carrier board through bus->ops. */ struct ipack_device { unsigned int slot; @@ -124,6 +122,7 @@ struct ipack_driver_ops { * struct ipack_driver -- Specific data to each ipack device driver * * @driver: Device driver kernel representation + * @id_table: Device ID table for this driver * @ops: Callbacks provided by the IPack device driver */ struct ipack_driver { @@ -161,7 +160,7 @@ struct ipack_bus_ops { }; /** - * struct ipack_bus_device + * struct ipack_bus_device - IPack bus representation * * @dev: pointer to carrier device * @slots: number of slots available @@ -185,6 +184,8 @@ struct ipack_bus_device { * * The carrier board device should call this function to register itself as * available bus device in ipack. 
+ * + * Return: %NULL on error or &struct ipack_bus_device on success */ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots, const struct ipack_bus_ops *ops, @@ -192,6 +193,8 @@ struct ipack_bus_device *ipack_bus_register(struct device *parent, int slots, /** * ipack_bus_unregister -- unregister an ipack bus + * + * Return: %0 */ int ipack_bus_unregister(struct ipack_bus_device *bus); @@ -200,6 +203,8 @@ int ipack_bus_unregister(struct ipack_bus_device *bus); * * Called by a ipack driver to register itself as a driver * that can manage ipack devices. + * + * Return: zero on success or error code on failure. */ int ipack_driver_register(struct ipack_driver *edrv, struct module *owner, const char *name); @@ -215,7 +220,7 @@ void ipack_driver_unregister(struct ipack_driver *edrv); * function. The rest of the fields will be allocated and populated * during initalization. * - * Return zero on success or error code on failure. + * Return: zero on success or error code on failure. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use ipack_put_device() to give up the @@ -230,7 +235,7 @@ int ipack_device_init(struct ipack_device *dev); * Add a new IPack device. The call is done by the carrier driver * after calling ipack_device_init(). * - * Return zero on success or error code on failure. + * Return: zero on success or error code on failure. * * NOTE: _Never_ directly free @dev after calling this function, even * if it returned an error! Always use ipack_put_device() to give up the @@ -266,9 +271,11 @@ void ipack_put_device(struct ipack_device *dev); .device = (dev) /** - * ipack_get_carrier - it increase the carrier ref. counter of + * ipack_get_carrier - try to increase the carrier ref. counter of * the carrier module * @dev: mezzanine device which wants to get the carrier + * + * Return: true on success. 
*/ static inline int ipack_get_carrier(struct ipack_device *dev) { -- cgit v1.2.3 From 9fd2eb9e18a0a0b5a127937586388ed0181d9dac Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Sep 2025 15:42:40 +0200 Subject: cdx: make cdx_bus_type constant Now that the driver core can properly handle constant struct bus_type, move the cdx_bus_type variable to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. Cc: Nipun Gupta Cc: Nikhil Agarwal Acked-by: Nipun Gupta Signed-off-by: Greg Kroah-Hartman Link: https://patch.msgid.link/2025091439-sustained-acorn-4af4@gregkh Signed-off-by: Greg Kroah-Hartman --- include/linux/cdx/cdx_bus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index 79bb80e56790..b1ba97f6c9ad 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -234,7 +234,7 @@ int __must_check __cdx_driver_register(struct cdx_driver *cdx_driver, */ void cdx_driver_unregister(struct cdx_driver *cdx_driver); -extern struct bus_type cdx_bus_type; +extern const struct bus_type cdx_bus_type; /** * cdx_dev_reset - Reset CDX device -- cgit v1.2.3 From 61e606305672342858a647af3629d9dfcc4e4265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Barna=C5=9B?= Date: Fri, 19 Sep 2025 06:53:27 +0000 Subject: drivers: eisa: make eisa_bus_type const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because driver core can properly handle constant struct bus_type, move the eisa_bus_type to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Signed-off-by: Adrian Barnaś Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250919065327.672924-1-abarnas@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/eisa.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/eisa.h b/include/linux/eisa.h index 21a2ecc1e538..cf55630b595b 100644 --- a/include/linux/eisa.h +++ b/include/linux/eisa.h @@ -68,7 +68,7 @@ struct eisa_driver { /* These external functions are only available when EISA support is enabled. */ #ifdef CONFIG_EISA -extern struct bus_type eisa_bus_type; +extern const struct bus_type eisa_bus_type; int eisa_driver_register (struct eisa_driver *edrv); void eisa_driver_unregister (struct eisa_driver *edrv); -- cgit v1.2.3 From 8ce6b508f24b4ef3a78c2c0d92e67b9e324c4f7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20Barna=C5=9B?= Date: Fri, 19 Sep 2025 07:32:01 +0000 Subject: drivers: rapidio: make rio_bus_type const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because driver core can properly handle constant struct bus_type, move the rio_bus_type to be a constant structure as well, placing it into read-only memory which can not be modified at runtime. 
Signed-off-by: Adrian Barnaś Reviewed-by: Greg Kroah-Hartman Link: https://patch.msgid.link/20250919073201.751348-1-abarnas@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/rio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rio.h b/include/linux/rio.h index 3c29f40f3c94..2c29f21ba9e5 100644 --- a/include/linux/rio.h +++ b/include/linux/rio.h @@ -78,7 +78,7 @@ #define RIO_CTAG_RESRVD 0xfffe0000 /* Reserved */ #define RIO_CTAG_UDEVID 0x0001ffff /* Unique device identifier */ -extern struct bus_type rio_bus_type; +extern const struct bus_type rio_bus_type; extern struct class rio_mport_class; struct rio_mport; -- cgit v1.2.3 From cebd22dd3a0ac76e0e1f2f369bba710bc6b1dc66 Mon Sep 17 00:00:00 2001 From: Pei Xiao Date: Thu, 18 Sep 2025 10:54:02 +0800 Subject: platform: Use IOMEM_ERR_PTR for ioremap error returns Replace ERR_PTR() with IOMEM_ERR_PTR() in stubbed ioremap functions to maintain type consistency. The functions return void __iomem * pointers and IOMEM_ERR_PTR() provides proper type casting to avoid sparse warnings. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509060307.JubgnLhc-lkp@intel.com/ Signed-off-by: Pei Xiao Link: https://patch.msgid.link/320f2cc9ada5cb66845daa6bf259000b4cffd8b3.1758163939.git.xiaopei01@kylinos.cn Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..1d424fed1435 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -80,7 +80,7 @@ static inline void __iomem * devm_platform_get_and_ioremap_resource(struct platform_device *pdev, unsigned int index, struct resource **res) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } @@ -88,14 +88,14 @@ static inline void __iomem * devm_platform_ioremap_resource(struct platform_device *pdev, unsigned int index) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } static inline void __iomem * devm_platform_ioremap_resource_byname(struct platform_device *pdev, const char *name) { - return ERR_PTR(-EINVAL); + return IOMEM_ERR_PTR(-EINVAL); } #endif -- cgit v1.2.3 From 6d0ef68955d30be1e218caf160ec32eec23ebc6e Mon Sep 17 00:00:00 2001 From: Yunhui Cui Date: Tue, 23 Sep 2025 09:54:09 +0800 Subject: arch_topology: move parse_acpi_topology() to common code Currently, RISC-V lacks arch-specific registers for CPU topology properties and must get them from ACPI. Thus, parse_acpi_topology() is moved from arm64/ to drivers/ for RISC-V reuse. 
Signed-off-by: Yunhui Cui Reviewed-by: Sudeep Holla Link: https://patch.msgid.link/20250923015409.15983-2-cuiyunhui@bytedance.com Signed-off-by: Greg Kroah-Hartman --- include/linux/arch_topology.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..766ed9cf0e54 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -80,6 +80,11 @@ extern struct cpu_topology cpu_topology[NR_CPUS]; #define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_sibling) #define topology_cluster_cpumask(cpu) (&cpu_topology[cpu].cluster_sibling) #define topology_llc_cpumask(cpu) (&cpu_topology[cpu].llc_sibling) + +#ifndef arch_cpu_is_threaded +#define arch_cpu_is_threaded() (0) +#endif + void init_cpu_topology(void); void store_cpu_topology(unsigned int cpuid); const struct cpumask *cpu_coregroup_mask(int cpu); -- cgit v1.2.3 From f82890c98f3e3fd61983e9021354c632ecd47427 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Wed, 15 Oct 2025 04:30:13 +0000 Subject: tcpm: Parse and log AVS APDO The USB PD specification introduced new Adjustable Voltage Supply (AVS) types for both Standard Power Range (SPR) and Extended Power Range (EPR) sources. Add definitions to correctly parse and handle the new AVS APDO. Use bitfield macros to add inline helper functions to extract voltage, current, power, and peak current fields to parse and log the details of the new EPR AVS and SPR AVS APDO. 
Signed-off-by: Badhri Jagan Sridharan Reviewed-by: Amit Sunil Dhamne Reviewed-by: Kyle Tso Reviewed-by: RD Babiera Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20251015043017.3382908-1-badhri@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/pd.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/usb/pd.h b/include/linux/usb/pd.h index 3068c3084eb6..6ccd1b2af993 100644 --- a/include/linux/usb/pd.h +++ b/include/linux/usb/pd.h @@ -6,6 +6,7 @@ #ifndef __LINUX_USB_PD_H #define __LINUX_USB_PD_H +#include #include #include #include @@ -271,9 +272,11 @@ enum pd_pdo_type { enum pd_apdo_type { APDO_TYPE_PPS = 0, + APDO_TYPE_EPR_AVS = 1, + APDO_TYPE_SPR_AVS = 2, }; -#define PDO_APDO_TYPE_SHIFT 28 /* Only valid value currently is 0x0 - PPS */ +#define PDO_APDO_TYPE_SHIFT 28 #define PDO_APDO_TYPE_MASK 0x3 #define PDO_APDO_TYPE(t) ((t) << PDO_APDO_TYPE_SHIFT) @@ -297,6 +300,35 @@ enum pd_apdo_type { PDO_PPS_APDO_MIN_VOLT(min_mv) | PDO_PPS_APDO_MAX_VOLT(max_mv) | \ PDO_PPS_APDO_MAX_CURR(max_ma)) +/* + * Applicable only to EPR AVS APDO source cap as per + * Table 6.15 EPR Adjustable Voltage Supply APDO – Source + */ +#define PDO_EPR_AVS_APDO_PEAK_CURRENT GENMASK(27, 26) + +/* + * Applicable to both EPR AVS APDO source and sink cap as per + * Table 6.15 EPR Adjustable Voltage Supply APDO – Source + * Table 6.22 EPR Adjustable Voltage Supply APDO – Sink + */ +#define PDO_EPR_AVS_APDO_MAX_VOLT GENMASK(25, 17) /* 100mV unit */ +#define PDO_EPR_AVS_APDO_MIN_VOLT GENMASK(15, 8) /* 100mV unit */ +#define PDO_EPR_AVS_APDO_PDP GENMASK(7, 0) /* 1W unit */ + +/* + * Applicable only SPR AVS APDO source cap as per + * Table 6.14 SPR Adjustable Voltage Supply APDO – Source + */ +#define PDO_SPR_AVS_APDO_PEAK_CURRENT GENMASK(27, 26) + +/* + * Applicable to both SPR AVS APDO source and sink cap as per + * Table 6.14 SPR Adjustable Voltage Supply APDO – Source + 
* Table 6.21 SPR Adjustable Voltage Supply APDO – Sink + */ +#define PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR GENMASK(19, 10) /* 10mA unit */ +#define PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR GENMASK(9, 0) /* 10mA unit */ + static inline enum pd_pdo_type pdo_type(u32 pdo) { return (pdo >> PDO_TYPE_SHIFT) & PDO_TYPE_MASK; @@ -350,6 +382,41 @@ static inline unsigned int pdo_pps_apdo_max_current(u32 pdo) PDO_PPS_APDO_CURR_MASK) * 50; } +static inline unsigned int pdo_epr_avs_apdo_src_peak_current(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_PEAK_CURRENT, pdo); +} + +static inline unsigned int pdo_epr_avs_apdo_min_voltage_mv(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_MIN_VOLT, pdo) * 100; +} + +static inline unsigned int pdo_epr_avs_apdo_max_voltage_mv(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_MAX_VOLT, pdo) * 100; +} + +static inline unsigned int pdo_epr_avs_apdo_pdp_w(u32 pdo) +{ + return FIELD_GET(PDO_EPR_AVS_APDO_PDP, pdo); +} + +static inline unsigned int pdo_spr_avs_apdo_src_peak_current(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_PEAK_CURRENT, pdo); +} + +static inline unsigned int pdo_spr_avs_apdo_9v_to_15v_max_current_ma(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_9V_TO_15V_MAX_CURR, pdo) * 10; +} + +static inline unsigned int pdo_spr_avs_apdo_15v_to_20v_max_current_ma(u32 pdo) +{ + return FIELD_GET(PDO_SPR_AVS_APDO_15V_TO_20V_MAX_CURR, pdo) * 10; +} + /* RDO: Request Data Object */ #define RDO_OBJ_POS_SHIFT 28 #define RDO_OBJ_POS_MASK 0x7 -- cgit v1.2.3 From 832c8d3fce77cf03cc225fc555c1bffa1c547ba1 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Tue, 14 Oct 2025 18:06:47 +0200 Subject: usb: typec: ps883x: Add USB4 mode and TBT3 altmode support This chip can do some more than the driver currently describes. Add support for configuring it for various flavors of TBT3/USB4 operation.
Reviewed-by: Jack Pham Signed-off-by: Konrad Dybcio Reviewed-by: Heikki Krogerus Link: https://patch.msgid.link/20251014-topic-ps883x_usb4-v1-3-e6adb1a4296e@oss.qualcomm.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/typec_tbt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usb/typec_tbt.h b/include/linux/usb/typec_tbt.h index 55dcea12082c..0b570f1b8bc8 100644 --- a/include/linux/usb/typec_tbt.h +++ b/include/linux/usb/typec_tbt.h @@ -55,6 +55,7 @@ struct typec_thunderbolt_data { /* TBT3 Device Enter Mode VDO bits */ #define TBT_ENTER_MODE_CABLE_SPEED(s) TBT_SET_CABLE_SPEED(s) +#define TBT_ENTER_MODE_UNI_DIR_LSRX BIT(23) #define TBT_ENTER_MODE_ACTIVE_CABLE BIT(24) #endif /* __USB_TYPEC_TBT_H */ -- cgit v1.2.3 From 203dfbda03540f9a99341144a24877ee8b352189 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 25 Sep 2025 16:31:12 +0200 Subject: dt-bindings: power: Add support for MT8196 power controllers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for the power controllers found in the MediaTek MT8196 Chromebook SoC. This chip has three power controllers, two of which located in the SCP subsystems (where one can be directly controlled and the other can be controlled only through the HW Voter IP), and one located in the Multimedia HFRP subsystem, controllable only through the HW Voter IP. Acked-by: Rob Herring (Arm) Reviewed-by: Nícolas F. R. A. 
Prado Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Ulf Hansson --- include/dt-bindings/power/mediatek,mt8196-power.h | 58 +++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 include/dt-bindings/power/mediatek,mt8196-power.h (limited to 'include') diff --git a/include/dt-bindings/power/mediatek,mt8196-power.h b/include/dt-bindings/power/mediatek,mt8196-power.h new file mode 100644 index 000000000000..0f622a93c807 --- /dev/null +++ b/include/dt-bindings/power/mediatek,mt8196-power.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR MIT) */ +/* + * Copyright (c) 2025 Collabora Ltd + * AngeloGioacchino Del Regno + */ + +#ifndef _DT_BINDINGS_POWER_MT8196_POWER_H +#define _DT_BINDINGS_POWER_MT8196_POWER_H + +/* SCPSYS Secure Power Manager - Direct Control */ +#define MT8196_POWER_DOMAIN_MD 0 +#define MT8196_POWER_DOMAIN_CONN 1 +#define MT8196_POWER_DOMAIN_SSUSB_P0 2 +#define MT8196_POWER_DOMAIN_SSUSB_DP_PHY_P0 3 +#define MT8196_POWER_DOMAIN_SSUSB_P1 4 +#define MT8196_POWER_DOMAIN_SSUSB_P23 5 +#define MT8196_POWER_DOMAIN_SSUSB_PHY_P2 6 +#define MT8196_POWER_DOMAIN_PEXTP_MAC0 7 +#define MT8196_POWER_DOMAIN_PEXTP_MAC1 8 +#define MT8196_POWER_DOMAIN_PEXTP_MAC2 9 +#define MT8196_POWER_DOMAIN_PEXTP_PHY0 10 +#define MT8196_POWER_DOMAIN_PEXTP_PHY1 11 +#define MT8196_POWER_DOMAIN_PEXTP_PHY2 12 +#define MT8196_POWER_DOMAIN_AUDIO 13 +#define MT8196_POWER_DOMAIN_ADSP_TOP_DORMANT 14 +#define MT8196_POWER_DOMAIN_ADSP_INFRA 15 +#define MT8196_POWER_DOMAIN_ADSP_AO 16 + +/* SCPSYS Secure Power Manager - HW Voter */ +#define MT8196_POWER_DOMAIN_MM_PROC_DORMANT 0 +#define MT8196_POWER_DOMAIN_SSR 1 + +/* HFRPSYS MultiMedia Power Control (MMPC) - HW Voter */ +#define MT8196_POWER_DOMAIN_VDE0 0 +#define MT8196_POWER_DOMAIN_VDE1 1 +#define MT8196_POWER_DOMAIN_VDE_VCORE0 2 +#define MT8196_POWER_DOMAIN_VEN0 3 +#define MT8196_POWER_DOMAIN_VEN1 4 +#define MT8196_POWER_DOMAIN_VEN2 5 +#define MT8196_POWER_DOMAIN_DISP_VCORE 6 +#define 
MT8196_POWER_DOMAIN_DIS0_DORMANT 7 +#define MT8196_POWER_DOMAIN_DIS1_DORMANT 8 +#define MT8196_POWER_DOMAIN_OVL0_DORMANT 9 +#define MT8196_POWER_DOMAIN_OVL1_DORMANT 10 +#define MT8196_POWER_DOMAIN_DISP_EDPTX_DORMANT 11 +#define MT8196_POWER_DOMAIN_DISP_DPTX_DORMANT 12 +#define MT8196_POWER_DOMAIN_MML0_SHUTDOWN 13 +#define MT8196_POWER_DOMAIN_MML1_SHUTDOWN 14 +#define MT8196_POWER_DOMAIN_MM_INFRA0 15 +#define MT8196_POWER_DOMAIN_MM_INFRA1 16 +#define MT8196_POWER_DOMAIN_MM_INFRA_AO 17 +#define MT8196_POWER_DOMAIN_CSI_BS_RX 18 +#define MT8196_POWER_DOMAIN_CSI_LS_RX 19 +#define MT8196_POWER_DOMAIN_DSI_PHY0 20 +#define MT8196_POWER_DOMAIN_DSI_PHY1 21 +#define MT8196_POWER_DOMAIN_DSI_PHY2 22 + +#endif /* _DT_BINDINGS_POWER_MT8196_POWER_H */ -- cgit v1.2.3 From 295926ef36bb83d997f9c897b67fd1a0671db52e Mon Sep 17 00:00:00 2001 From: Finley Xiao Date: Fri, 17 Oct 2025 17:38:33 +0800 Subject: dt-bindings: power: rockchip: Add support for RV1126B Add power domain IDs for RV1126B SoC. Add a new compatible because register fields have changed. Signed-off-by: Finley Xiao Reviewed-by: Krzysztof Kozlowski Signed-off-by: Ulf Hansson --- .../power/rockchip,rv1126b-power-controller.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 include/dt-bindings/power/rockchip,rv1126b-power-controller.h (limited to 'include') diff --git a/include/dt-bindings/power/rockchip,rv1126b-power-controller.h b/include/dt-bindings/power/rockchip,rv1126b-power-controller.h new file mode 100644 index 000000000000..48ea87a4423c --- /dev/null +++ b/include/dt-bindings/power/rockchip,rv1126b-power-controller.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (c) 2024 Rockchip Electronics Co., Ltd. 
+ * Author: Finley Xiao + */ + +#ifndef __DT_BINDINGS_POWER_RV1126B_POWER_CONTROLLER_H__ +#define __DT_BINDINGS_POWER_RV1126B_POWER_CONTROLLER_H__ + +/* VD_NPU */ +#define RV1126B_PD_NPU 0 + +/* VD_LOGIC */ +#define RV1126B_PD_VDO 1 +#define RV1126B_PD_AIISP 2 + +#endif -- cgit v1.2.3 From 9025688bf6d427e553aca911308cd92e92634f51 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 20 Oct 2025 10:53:40 -0700 Subject: module: Fix device table module aliases Commit 6717e8f91db7 ("kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME") inadvertently broke module alias generation for modules which rely on MODULE_DEVICE_TABLE(). It removed the "kmod_" prefix from __KBUILD_MODNAME, which caused MODULE_DEVICE_TABLE() to generate a symbol name which no longer matched the format expected by handle_moddevtable() in scripts/mod/file2alias.c. As a result, modpost failed to find the device tables, leading to missing module aliases. Fix this by explicitly adding the "kmod_" string within the MODULE_DEVICE_TABLE() macro itself, restoring the symbol name to the format expected by file2alias.c. 
Fixes: 6717e8f91db7 ("kbuild: Remove 'kmod_' prefix from __KBUILD_MODNAME") Reported-by: Alexander Stein Reported-by: Marek Szyprowski Reported-by: Mark Brown Reported-by: Cosmin Tanislav Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Tested-by: Cosmin Tanislav Tested-by: Marek Szyprowski Tested-by: Mark Brown Tested-by: Alexander Stein Tested-by: Chen-Yu Tsai Tested-by: Anders Roxell Link: https://patch.msgid.link/e52ee3edf32874da645a9e037a7d77c69893a22a.1760982784.git.jpoimboe@kernel.org --- include/linux/module.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index e135cc79acee..d80c3ea57472 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -251,10 +251,11 @@ struct module_kobject *lookup_or_create_module_kobject(const char *name); */ #define __mod_device_table(type, name) \ __PASTE(__mod_device_table__, \ + __PASTE(kmod_, \ __PASTE(__KBUILD_MODNAME, \ __PASTE(__, \ __PASTE(type, \ - __PASTE(__, name))))) + __PASTE(__, name)))))) /* Creates an alias so file2alias.c can find device table. */ #define MODULE_DEVICE_TABLE(type, name) \ -- cgit v1.2.3 From 1cba30bf9fdd6c982708f3587f609a30c370d889 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 16 Oct 2025 11:09:38 -0700 Subject: io_uring: add support for IORING_SETUP_SQE_MIXED Normal rings support 64b SQEs for posting submissions, while certain features require the ring to be configured with IORING_SETUP_SQE128, as they need to convey more information per submission. This, in turn, makes ALL the SQEs be 128b in size. This is somewhat wasteful and inefficient, particularly when only certain SQEs need to be of the bigger variant. This adds support for setting up a ring with mixed SQE sizes, using IORING_SETUP_SQE_MIXED. When setup in this mode, SQEs posted to the ring may be either 64b or 128b in size. 
If a SQE is 128b in size, then opcode will be set to a variant to indicate that this is the case. Any other non-128b opcode will assume the SQ's default size. SQEs on these types of mixed rings may also utilize NOP with skip success set. This can happen if the ring is one (small) SQE entry away from wrapping, and an attempt is made to get a 128b SQE. As SQEs must be contiguous in the SQ ring, a 128b SQE cannot wrap the ring. For this case, a single NOP SQE should be inserted with the SKIP_SUCCESS flag set. The kernel will process this as a normal NOP and without posting a CQE. Signed-off-by: Keith Busch [axboe: {} style fix and assign sqe before opcode read] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 263bed13473e..04797a9b76bc 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -231,6 +231,12 @@ enum io_uring_sqe_flags_bit { */ #define IORING_SETUP_CQE_MIXED (1U << 18) +/* + * Allow both 64b and 128b SQEs. If a 128b SQE is posted, it will have + * a 128b opcode. + */ +#define IORING_SETUP_SQE_MIXED (1U << 19) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -295,6 +301,8 @@ enum io_uring_op { IORING_OP_READV_FIXED, IORING_OP_WRITEV_FIXED, IORING_OP_PIPE, + IORING_OP_NOP128, + IORING_OP_URING_CMD128, /* this goes last, obviously */ IORING_OP_LAST, -- cgit v1.2.3 From 5c5028ee594ce5f907ca6ad1c32cca6a15098464 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 20 Oct 2025 13:47:15 -0700 Subject: block: rename min_segment_size Despite its name, the block layer is fine with segments smaller than the "min_segment_size" limit. The value is an optimization limit indicating the largest segment that can be used without considering boundary limits. Smaller segments can take a fast path, so give it a name that reflects that: max_fast_segment_size.
Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 70b671a9a7f7..99be263b31ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -378,7 +378,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; - unsigned int min_segment_size; + unsigned int max_fast_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- cgit v1.2.3 From 159e85110891ebc12500d02d4bf214b1d203e305 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Oct 2025 13:43:18 +0300 Subject: ACPI: property: Make acpi_get_next_subnode() static acpi_get_next_subnode() is only used in drivers/acpi/property.c. Remove its prototype from include/linux/acpi.h and make it static. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Reviewed-by: Laurent Pinchart Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251001104320.1272752-2-sakari.ailus@linux.intel.com Signed-off-by: Rafael J. 
Wysocki --- include/linux/acpi.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..703323b9fe0c 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1349,9 +1349,6 @@ acpi_data_add_props(struct acpi_device_data *data, const guid_t *guid, int acpi_node_prop_get(const struct fwnode_handle *fwnode, const char *propname, void **valptr); -struct fwnode_handle *acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child); - struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, struct acpi_probe_entry *); @@ -1450,13 +1447,6 @@ static inline int acpi_node_prop_get(const struct fwnode_handle *fwnode, return -ENXIO; } -static inline struct fwnode_handle * -acpi_get_next_subnode(const struct fwnode_handle *fwnode, - struct fwnode_handle *child) -{ - return NULL; -} - static inline struct fwnode_handle * acpi_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) -- cgit v1.2.3 From 0d8627cc936de8ea04f3cc1e6921c63fb72cc199 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:06 +0200 Subject: blktrace: add definitions for blk_user_trace_setup2 Add definitions for a version 2 of the blk_user_trace_setup ioctl. This new ioctl will enable a different struct layout of the binary data passed to user-space when using a new version of the blktrace utility requesting the new struct layout. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ include/uapi/linux/fs.h | 1 + 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 1bfb635e309b..a6958708d477 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -129,6 +129,7 @@ enum { }; #define BLKTRACE_BDEV_SIZE 32 +#define BLKTRACE_BDEV_SIZE2 64 /* * User setup structure passed with BLKTRACESETUP @@ -143,4 +144,19 @@ struct blk_user_trace_setup { __u32 pid; }; +/* + * User setup structure passed with BLKTRACESETUP2 + */ +struct blk_user_trace_setup2 { + char name[BLKTRACE_BDEV_SIZE2]; /* output */ + __u64 act_mask; /* input */ + __u32 buf_size; /* input */ + __u32 buf_nr; /* input */ + __u64 start_lba; + __u64 end_lba; + __u32 pid; + __u32 flags; /* currently unused */ + __u64 reserved[11]; +}; + #endif /* _UAPIBLKTRACE_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index beb4c2d1e41c..957ce3343a4f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -300,6 +300,7 @@ struct file_attr { #define BLKGETDISKSEQ _IOR(0x12,128,__u64) /* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ +#define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From 113cbd62824afdf62d2f3f092809cf37cc7f1dd8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:07 +0200 Subject: blktrace: pass blk_user_trace2 to setup functions Pass struct blk_user_trace_setup2 to blktrace_setup_finalize(). This prepares for the incoming extension of the blktrace protocol with a 64bit act_mask. 
Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 122c62e561fc..05c8754456aa 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -14,11 +14,12 @@ #include struct blk_trace { + int version; int trace_state; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; - u16 act_mask; + u64 act_mask; u64 start_lba; u64 end_lba; u32 pid; -- cgit v1.2.3 From c44347d606260f36a81f6d8415a5af33cb3015fa Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:08 +0200 Subject: blktrace: add definitions for struct blk_io_trace2 Add definitions for the extended version of the blktrace protocol using a wider action type to be able to record new actions in the kernel. Reviewed-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index a6958708d477..9f9834d76e00 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -94,6 +94,7 @@ enum blktrace_notify { #define BLK_IO_TRACE_MAGIC 0x65617400 #define BLK_IO_TRACE_VERSION 0x07 +#define BLK_IO_TRACE2_VERSION 0x08 /* * The trace itself @@ -113,6 +114,21 @@ struct blk_io_trace { /* cgroup id will be stored here if exists */ }; +struct blk_io_trace2 { + __u32 magic; /* MAGIC << 8 | BLK_IO_TRACE2_VERSION */ + __u32 sequence; /* event number */ + __u64 time; /* in nanoseconds */ + __u64 sector; /* disk offset */ + __u32 bytes; /* transfer length */ + __u32 pid; /* who did it */ + __u64 action; /* what happened */ + __u32 device; /* device number */ + __u32 cpu; /* on what cpu did it happen */ + __u16 error; /* completion error */ + __u16 pdu_len; /* length of data after this trace */ + __u8 pad[12]; + /* cgroup id will be stored here if it exists */ +}; /* * The remap event */ -- cgit v1.2.3 From f9ee38bbf70fb20584625849a253c8652176fa66 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:12 +0200 Subject: blktrace: add block trace commands for zone operations Add block trace commands for zone operations. These commands can only be handled with version 2 of the blktrace protocol. For version 1, warn if a command that does not fit into the 16 bits reserved for the command in this version is passed in. Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 9f9834d76e00..190a3c5ab0a0 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -26,11 +26,20 @@ enum blktrace_cat { BLK_TC_DRV_DATA = 1 << 14, /* binary per-driver data */ BLK_TC_FUA = 1 << 15, /* fua requests */ - BLK_TC_END = 1 << 15, /* we've run out of bits! */ + BLK_TC_END_V1 = 1 << 15, /* we've run out of bits! */ + + BLK_TC_ZONE_APPEND = 1ull << 16, /* zone append */ + BLK_TC_ZONE_RESET = 1ull << 17, /* zone reset */ + BLK_TC_ZONE_RESET_ALL = 1ull << 18, /* zone reset all */ + BLK_TC_ZONE_FINISH = 1ull << 19, /* zone finish */ + BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ + BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ + + BLK_TC_END_V2 = 1ull << 21, }; #define BLK_TC_SHIFT (16) -#define BLK_TC_ACT(act) ((act) << BLK_TC_SHIFT) +#define BLK_TC_ACT(act) ((u64)(act) << BLK_TC_SHIFT) /* * Basic trace actions -- cgit v1.2.3 From 1c164fcc1b08e75f1cad1532718f09cddc0ddebe Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:13 +0200 Subject: blktrace: expose ZONE APPEND completions to blktrace Expose ZONE APPEND completions as a block trace completion action to blktrace. As tracing of zoned block commands needs the upper 32bit of the widened 64bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 190a3c5ab0a0..289872e51fc5 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -97,6 +97,9 @@ enum blktrace_notify { #define BLK_TA_ABORT (__BLK_TA_ABORT | BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TA_DRV_DATA (__BLK_TA_DRV_DATA | BLK_TC_ACT(BLK_TC_DRV_DATA)) +#define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ + BLK_TC_ACT(BLK_TC_ZONE_APPEND)) + #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_MESSAGE (__BLK_TN_MESSAGE | BLK_TC_ACT(BLK_TC_NOTIFY)) -- cgit v1.2.3 From 3f6722816a73e2017599d965683dbe71833afd7a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 22 Oct 2025 13:41:14 +0200 Subject: blktrace: trace zone write plugging operations Trace zone write plugging operations on block devices. As tracing of zoned block commands needs the upper 32bit of the widened 64bit action, only add traces to blktrace if user-space has requested version 2 of the blktrace protocol. Reviewed-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 289872e51fc5..30f3d2589365 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -62,6 +62,8 @@ enum blktrace_act { __BLK_TA_REMAP, /* bio was remapped */ __BLK_TA_ABORT, /* request aborted */ __BLK_TA_DRV_DATA, /* driver-specific binary data */ + __BLK_TA_ZONE_PLUG, /* zone write plug was plugged */ + __BLK_TA_ZONE_UNPLUG, /* zone write plug was unplugged */ __BLK_TA_CGROUP = 1 << 8, /* from a cgroup*/ }; @@ -99,6 +101,9 @@ enum blktrace_notify { #define BLK_TA_ZONE_APPEND (__BLK_TA_COMPLETE |\ BLK_TC_ACT(BLK_TC_ZONE_APPEND)) +#define BLK_TA_ZONE_PLUG (__BLK_TA_ZONE_PLUG | BLK_TC_ACT(BLK_TC_QUEUE)) +#define BLK_TA_ZONE_UNPLUG (__BLK_TA_ZONE_UNPLUG |\ + BLK_TC_ACT(BLK_TC_QUEUE)) #define BLK_TN_PROCESS (__BLK_TN_PROCESS | BLK_TC_ACT(BLK_TC_NOTIFY)) #define BLK_TN_TIMESTAMP (__BLK_TN_TIMESTAMP | BLK_TC_ACT(BLK_TC_NOTIFY)) -- cgit v1.2.3 From cbe5aeedecc72314c3a8fd0d41d9b270f576aee1 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:05 +0900 Subject: PM: EM: Assign a unique ID when creating a performance domain It is necessary to refer to a specific performance domain from userspace, for example when the energy model of a particular performance domain is updated. To this end, assign a unique ID to each performance domain to address it, and manage them in a global linked list to look up a specific one by matching ID. IDA is used for ID assignment, and the mutex is used to protect the global list from concurrent access. Note that the mutex (em_pd_list_mutex) is not supposed to be held while holding em_pd_mutex, to avoid ABBA deadlock.
Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-2-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 61d50571ad88..43aa6153dc57 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -54,6 +54,8 @@ struct em_perf_table { /** * struct em_perf_domain - Performance domain * @em_table: Pointer to the runtime modifiable em_perf_table + * @node: node in em_pd_list (in energy_model.c) + * @id: A unique ID number for each performance domain * @nr_perf_states: Number of performance states * @min_perf_state: Minimum allowed Performance State index * @max_perf_state: Maximum allowed Performance State index @@ -71,6 +73,8 @@ struct em_perf_table { */ struct em_perf_domain { struct em_perf_table __rcu *em_table; + struct list_head node; + int id; int nr_perf_states; int min_perf_state; int max_perf_state; -- cgit v1.2.3 From bd26631ccdfd11701fa29e665a7f041875ba9423 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Tue, 21 Oct 2025 07:09:07 +0900 Subject: PM: EM: Add em.yaml and autogen files Add a generic netlink spec in YAML format and autogenerate boilerplate code using ynl-regen.sh to introduce a generic netlink for the energy model. It allows a userspace program to read the performance domain and its energy model. It notifies the userspace program when a performance domain is created or deleted or its energy model is updated through a multicast interface. Specifically, it supports two commands: - EM_CMD_GET_PDS: Get the list of information for all performance domains. - EM_CMD_GET_PD_TABLE: Get the energy model table of a performance domain. Also, it supports three notification events: - EM_CMD_PD_CREATED: When a performance domain is created. - EM_CMD_PD_DELETED: When a performance domain is deleted. 
- EM_CMD_PD_UPDATED: When the energy model table of a performance domain is updated. Finally, update MAINTAINERS to include new files. Signed-off-by: Changwoo Min Reviewed-by: Lukasz Luba Link: https://patch.msgid.link/20251020220914.320832-4-changwoo@igalia.com Signed-off-by: Rafael J. Wysocki --- include/uapi/linux/energy_model.h | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 include/uapi/linux/energy_model.h (limited to 'include') diff --git a/include/uapi/linux/energy_model.h b/include/uapi/linux/energy_model.h new file mode 100644 index 000000000000..4ec4c0eabbbb --- /dev/null +++ b/include/uapi/linux/energy_model.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/em.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_ENERGY_MODEL_H +#define _UAPI_LINUX_ENERGY_MODEL_H + +#define EM_FAMILY_NAME "em" +#define EM_FAMILY_VERSION 1 + +enum { + EM_A_PDS_PD = 1, + + __EM_A_PDS_MAX, + EM_A_PDS_MAX = (__EM_A_PDS_MAX - 1) +}; + +enum { + EM_A_PD_PAD = 1, + EM_A_PD_PD_ID, + EM_A_PD_FLAGS, + EM_A_PD_CPUS, + + __EM_A_PD_MAX, + EM_A_PD_MAX = (__EM_A_PD_MAX - 1) +}; + +enum { + EM_A_PD_TABLE_PD_ID = 1, + EM_A_PD_TABLE_PS, + + __EM_A_PD_TABLE_MAX, + EM_A_PD_TABLE_MAX = (__EM_A_PD_TABLE_MAX - 1) +}; + +enum { + EM_A_PS_PAD = 1, + EM_A_PS_PERFORMANCE, + EM_A_PS_FREQUENCY, + EM_A_PS_POWER, + EM_A_PS_COST, + EM_A_PS_FLAGS, + + __EM_A_PS_MAX, + EM_A_PS_MAX = (__EM_A_PS_MAX - 1) +}; + +enum { + EM_CMD_GET_PDS = 1, + EM_CMD_GET_PD_TABLE, + EM_CMD_PD_CREATED, + EM_CMD_PD_UPDATED, + EM_CMD_PD_DELETED, + + __EM_CMD_MAX, + EM_CMD_MAX = (__EM_CMD_MAX - 1) +}; + +#define EM_MCGRP_EVENT "event" + +#endif /* _UAPI_LINUX_ENERGY_MODEL_H */ -- cgit v1.2.3 From e090dc10c65eac35dcdb7c1b9cd6adcf0b590d3a Mon Sep 17 00:00:00 2001 From: Luca Weiss Date: Fri, 19 Sep 2025 11:57:23 +0200 Subject: dt-bindings: clock: 
dispcc-sm6350: Add MDSS_CORE & MDSS_RSCC resets Add the indexes for two resets inside the dispcc on SM6350 SoC. Signed-off-by: Luca Weiss Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250919-sm6350-mdss-reset-v1-1-48dcac917c73@fairphone.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,dispcc-sm6350.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,dispcc-sm6350.h b/include/dt-bindings/clock/qcom,dispcc-sm6350.h index cb54aae2723e..61426a80e620 100644 --- a/include/dt-bindings/clock/qcom,dispcc-sm6350.h +++ b/include/dt-bindings/clock/qcom,dispcc-sm6350.h @@ -42,6 +42,10 @@ #define DISP_CC_SLEEP_CLK 31 #define DISP_CC_XO_CLK 32 +/* Resets */ +#define DISP_CC_MDSS_CORE_BCR 0 +#define DISP_CC_MDSS_RSCC_BCR 1 + /* GDSCs */ #define MDSS_GDSC 0 -- cgit v1.2.3 From 2238840342af8e8d37a9355f0a2ad4285c32f854 Mon Sep 17 00:00:00 2001 From: Jens Reidel Date: Fri, 19 Sep 2025 14:34:30 +0200 Subject: dt-bindings: clock: sm7150-dispcc: Add MDSS_CORE reset Add the index for a reset inside the dispcc on SM7150 SoC. 
Signed-off-by: Jens Reidel Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250919-sm7150-dispcc-fixes-v1-1-308ad47c5fce@mainlining.org Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm7150-dispcc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm7150-dispcc.h b/include/dt-bindings/clock/qcom,sm7150-dispcc.h index fc1fefe8fd72..1e4e6432d506 100644 --- a/include/dt-bindings/clock/qcom,sm7150-dispcc.h +++ b/include/dt-bindings/clock/qcom,sm7150-dispcc.h @@ -53,6 +53,9 @@ #define DISPCC_SLEEP_CLK 41 #define DISPCC_SLEEP_CLK_SRC 42 +/* DISPCC resets */ +#define DISPCC_MDSS_CORE_BCR 0 + /* DISPCC GDSCR */ #define MDSS_GDSC 0 -- cgit v1.2.3 From 2985e76c66e15a6953c77d0b924e3a78d495208e Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Tue, 14 Oct 2025 22:35:28 +0800 Subject: dt-bindings: interconnect: Add Qualcomm IPQ5424 NSSNOC IDs Add the NSSNOC master/slave ids for Qualcomm IPQ5424 network subsystem (NSS) hardware blocks. These will be used by the gcc-ipq5424 driver that provides the interconnect services by using the icc-clk framework. 
Acked-by: Rob Herring (Arm) Signed-off-by: Luo Jie Acked-by: Georgi Djakov Link: https://lore.kernel.org/r/20251014-qcom_ipq5424_nsscc-v7-3-081f4956be02@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/interconnect/qcom,ipq5424.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/interconnect/qcom,ipq5424.h b/include/dt-bindings/interconnect/qcom,ipq5424.h index afd7e0683a24..c5e0dec0b300 100644 --- a/include/dt-bindings/interconnect/qcom,ipq5424.h +++ b/include/dt-bindings/interconnect/qcom,ipq5424.h @@ -20,6 +20,26 @@ #define SLAVE_CNOC_PCIE3 15 #define MASTER_CNOC_USB 16 #define SLAVE_CNOC_USB 17 +#define MASTER_NSSNOC_NSSCC 18 +#define SLAVE_NSSNOC_NSSCC 19 +#define MASTER_NSSNOC_SNOC_0 20 +#define SLAVE_NSSNOC_SNOC_0 21 +#define MASTER_NSSNOC_SNOC_1 22 +#define SLAVE_NSSNOC_SNOC_1 23 +#define MASTER_NSSNOC_PCNOC_1 24 +#define SLAVE_NSSNOC_PCNOC_1 25 +#define MASTER_NSSNOC_QOSGEN_REF 26 +#define SLAVE_NSSNOC_QOSGEN_REF 27 +#define MASTER_NSSNOC_TIMEOUT_REF 28 +#define SLAVE_NSSNOC_TIMEOUT_REF 29 +#define MASTER_NSSNOC_XO_DCD 30 +#define SLAVE_NSSNOC_XO_DCD 31 +#define MASTER_NSSNOC_ATB 32 +#define SLAVE_NSSNOC_ATB 33 +#define MASTER_CNOC_LPASS_CFG 34 +#define SLAVE_CNOC_LPASS_CFG 35 +#define MASTER_SNOC_LPASS 36 +#define SLAVE_SNOC_LPASS 37 #define MASTER_CPU 0 #define SLAVE_L3 1 -- cgit v1.2.3 From 60c8b7569c10c4b2ad5645cd093ff4577487314b Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Tue, 14 Oct 2025 22:35:30 +0800 Subject: dt-bindings: clock: gcc-ipq5424: Add definition for GPLL0_OUT_AUX The GCC clock GPLL0_OUT_AUX is one of source clocks for IPQ5424 NSS clock controller. 
Acked-by: Rob Herring (Arm) Signed-off-by: Luo Jie Link: https://lore.kernel.org/r/20251014-qcom_ipq5424_nsscc-v7-5-081f4956be02@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,ipq5424-gcc.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,ipq5424-gcc.h b/include/dt-bindings/clock/qcom,ipq5424-gcc.h index c15ad16923bd..3ae33a0fa002 100644 --- a/include/dt-bindings/clock/qcom,ipq5424-gcc.h +++ b/include/dt-bindings/clock/qcom,ipq5424-gcc.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* * Copyright (c) 2018,2020 The Linux Foundation. All rights reserved. - * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. */ #ifndef _DT_BINDINGS_CLOCK_IPQ_GCC_IPQ5424_H @@ -152,5 +152,6 @@ #define GCC_PCIE3_RCHNG_CLK 143 #define GCC_IM_SLEEP_CLK 144 #define GCC_XO_CLK 145 +#define GPLL0_OUT_AUX 146 #endif -- cgit v1.2.3 From 06ac2566e73d9d9fa2be62315e182945f7934882 Mon Sep 17 00:00:00 2001 From: Luo Jie Date: Tue, 14 Oct 2025 22:35:32 +0800 Subject: dt-bindings: clock: qcom: Add NSS clock controller for IPQ5424 SoC NSS clock controller provides the clocks and resets to the networking blocks such as PPE (Packet Process Engine) and UNIPHY (PCS) on IPQ5424 devices. Add support for the compatible string "qcom,ipq5424-nsscc" based on the existing IPQ9574 NSS clock controller Device Tree binding. Additionally, update the clock names for PPE and NSS for newer SoC additions like IPQ5424 to use generic and reusable identifiers "nss" and "ppe" without the clock rate suffix. Also add master/slave ids for IPQ5424 networking interfaces, which is used by nss-ipq5424 driver for providing interconnect services using icc-clk framework. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Luo Jie Link: https://lore.kernel.org/r/20251014-qcom_ipq5424_nsscc-v7-7-081f4956be02@quicinc.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,ipq5424-nsscc.h | 65 +++++++++++++++++++++++++ include/dt-bindings/interconnect/qcom,ipq5424.h | 13 +++++ include/dt-bindings/reset/qcom,ipq5424-nsscc.h | 46 +++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,ipq5424-nsscc.h create mode 100644 include/dt-bindings/reset/qcom,ipq5424-nsscc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,ipq5424-nsscc.h b/include/dt-bindings/clock/qcom,ipq5424-nsscc.h new file mode 100644 index 000000000000..eeae0dc38042 --- /dev/null +++ b/include/dt-bindings/clock/qcom,ipq5424-nsscc.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_CLOCK_QCOM_IPQ5424_NSSCC_H +#define _DT_BINDINGS_CLOCK_QCOM_IPQ5424_NSSCC_H + +/* NSS_CC clocks */ +#define NSS_CC_CE_APB_CLK 0 +#define NSS_CC_CE_AXI_CLK 1 +#define NSS_CC_CE_CLK_SRC 2 +#define NSS_CC_CFG_CLK_SRC 3 +#define NSS_CC_DEBUG_CLK 4 +#define NSS_CC_EIP_BFDCD_CLK_SRC 5 +#define NSS_CC_EIP_CLK 6 +#define NSS_CC_NSS_CSR_CLK 7 +#define NSS_CC_NSSNOC_CE_APB_CLK 8 +#define NSS_CC_NSSNOC_CE_AXI_CLK 9 +#define NSS_CC_NSSNOC_EIP_CLK 10 +#define NSS_CC_NSSNOC_NSS_CSR_CLK 11 +#define NSS_CC_NSSNOC_PPE_CFG_CLK 12 +#define NSS_CC_NSSNOC_PPE_CLK 13 +#define NSS_CC_PORT1_MAC_CLK 14 +#define NSS_CC_PORT1_RX_CLK 15 +#define NSS_CC_PORT1_RX_CLK_SRC 16 +#define NSS_CC_PORT1_RX_DIV_CLK_SRC 17 +#define NSS_CC_PORT1_TX_CLK 18 +#define NSS_CC_PORT1_TX_CLK_SRC 19 +#define NSS_CC_PORT1_TX_DIV_CLK_SRC 20 +#define NSS_CC_PORT2_MAC_CLK 21 +#define NSS_CC_PORT2_RX_CLK 22 +#define NSS_CC_PORT2_RX_CLK_SRC 23 +#define NSS_CC_PORT2_RX_DIV_CLK_SRC 24 +#define NSS_CC_PORT2_TX_CLK 25 +#define NSS_CC_PORT2_TX_CLK_SRC 
26 +#define NSS_CC_PORT2_TX_DIV_CLK_SRC 27 +#define NSS_CC_PORT3_MAC_CLK 28 +#define NSS_CC_PORT3_RX_CLK 29 +#define NSS_CC_PORT3_RX_CLK_SRC 30 +#define NSS_CC_PORT3_RX_DIV_CLK_SRC 31 +#define NSS_CC_PORT3_TX_CLK 32 +#define NSS_CC_PORT3_TX_CLK_SRC 33 +#define NSS_CC_PORT3_TX_DIV_CLK_SRC 34 +#define NSS_CC_PPE_CLK_SRC 35 +#define NSS_CC_PPE_EDMA_CFG_CLK 36 +#define NSS_CC_PPE_EDMA_CLK 37 +#define NSS_CC_PPE_SWITCH_BTQ_CLK 38 +#define NSS_CC_PPE_SWITCH_CFG_CLK 39 +#define NSS_CC_PPE_SWITCH_CLK 40 +#define NSS_CC_PPE_SWITCH_IPE_CLK 41 +#define NSS_CC_UNIPHY_PORT1_RX_CLK 42 +#define NSS_CC_UNIPHY_PORT1_TX_CLK 43 +#define NSS_CC_UNIPHY_PORT2_RX_CLK 44 +#define NSS_CC_UNIPHY_PORT2_TX_CLK 45 +#define NSS_CC_UNIPHY_PORT3_RX_CLK 46 +#define NSS_CC_UNIPHY_PORT3_TX_CLK 47 +#define NSS_CC_XGMAC0_PTP_REF_CLK 48 +#define NSS_CC_XGMAC0_PTP_REF_DIV_CLK_SRC 49 +#define NSS_CC_XGMAC1_PTP_REF_CLK 50 +#define NSS_CC_XGMAC1_PTP_REF_DIV_CLK_SRC 51 +#define NSS_CC_XGMAC2_PTP_REF_CLK 52 +#define NSS_CC_XGMAC2_PTP_REF_DIV_CLK_SRC 53 + +#endif diff --git a/include/dt-bindings/interconnect/qcom,ipq5424.h b/include/dt-bindings/interconnect/qcom,ipq5424.h index c5e0dec0b300..07b786bee7d6 100644 --- a/include/dt-bindings/interconnect/qcom,ipq5424.h +++ b/include/dt-bindings/interconnect/qcom,ipq5424.h @@ -44,4 +44,17 @@ #define MASTER_CPU 0 #define SLAVE_L3 1 +#define MASTER_NSSNOC_PPE 0 +#define SLAVE_NSSNOC_PPE 1 +#define MASTER_NSSNOC_PPE_CFG 2 +#define SLAVE_NSSNOC_PPE_CFG 3 +#define MASTER_NSSNOC_NSS_CSR 4 +#define SLAVE_NSSNOC_NSS_CSR 5 +#define MASTER_NSSNOC_CE_AXI 6 +#define SLAVE_NSSNOC_CE_AXI 7 +#define MASTER_NSSNOC_CE_APB 8 +#define SLAVE_NSSNOC_CE_APB 9 +#define MASTER_NSSNOC_EIP 10 +#define SLAVE_NSSNOC_EIP 11 + #endif /* INTERCONNECT_QCOM_IPQ5424_H */ diff --git a/include/dt-bindings/reset/qcom,ipq5424-nsscc.h b/include/dt-bindings/reset/qcom,ipq5424-nsscc.h new file mode 100644 index 000000000000..9627e3b0ad30 --- /dev/null +++ b/include/dt-bindings/reset/qcom,ipq5424-nsscc.h 
@@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_RESET_QCOM_IPQ5424_NSSCC_H +#define _DT_BINDINGS_RESET_QCOM_IPQ5424_NSSCC_H + +#define NSS_CC_CE_APB_CLK_ARES 0 +#define NSS_CC_CE_AXI_CLK_ARES 1 +#define NSS_CC_DEBUG_CLK_ARES 2 +#define NSS_CC_EIP_CLK_ARES 3 +#define NSS_CC_NSS_CSR_CLK_ARES 4 +#define NSS_CC_NSSNOC_CE_APB_CLK_ARES 5 +#define NSS_CC_NSSNOC_CE_AXI_CLK_ARES 6 +#define NSS_CC_NSSNOC_EIP_CLK_ARES 7 +#define NSS_CC_NSSNOC_NSS_CSR_CLK_ARES 8 +#define NSS_CC_NSSNOC_PPE_CLK_ARES 9 +#define NSS_CC_NSSNOC_PPE_CFG_CLK_ARES 10 +#define NSS_CC_PORT1_MAC_CLK_ARES 11 +#define NSS_CC_PORT1_RX_CLK_ARES 12 +#define NSS_CC_PORT1_TX_CLK_ARES 13 +#define NSS_CC_PORT2_MAC_CLK_ARES 14 +#define NSS_CC_PORT2_RX_CLK_ARES 15 +#define NSS_CC_PORT2_TX_CLK_ARES 16 +#define NSS_CC_PORT3_MAC_CLK_ARES 17 +#define NSS_CC_PORT3_RX_CLK_ARES 18 +#define NSS_CC_PORT3_TX_CLK_ARES 19 +#define NSS_CC_PPE_BCR 20 +#define NSS_CC_PPE_EDMA_CLK_ARES 21 +#define NSS_CC_PPE_EDMA_CFG_CLK_ARES 22 +#define NSS_CC_PPE_SWITCH_BTQ_CLK_ARES 23 +#define NSS_CC_PPE_SWITCH_CLK_ARES 24 +#define NSS_CC_PPE_SWITCH_CFG_CLK_ARES 25 +#define NSS_CC_PPE_SWITCH_IPE_CLK_ARES 26 +#define NSS_CC_UNIPHY_PORT1_RX_CLK_ARES 27 +#define NSS_CC_UNIPHY_PORT1_TX_CLK_ARES 28 +#define NSS_CC_UNIPHY_PORT2_RX_CLK_ARES 29 +#define NSS_CC_UNIPHY_PORT2_TX_CLK_ARES 30 +#define NSS_CC_UNIPHY_PORT3_RX_CLK_ARES 31 +#define NSS_CC_UNIPHY_PORT3_TX_CLK_ARES 32 +#define NSS_CC_XGMAC0_PTP_REF_CLK_ARES 33 +#define NSS_CC_XGMAC1_PTP_REF_CLK_ARES 34 +#define NSS_CC_XGMAC2_PTP_REF_CLK_ARES 35 + +#endif -- cgit v1.2.3 From c88b6ee3ba3c7bf6386ea0e6de8111acc3d832bc Mon Sep 17 00:00:00 2001 From: Jingyi Wang Date: Wed, 24 Sep 2025 16:24:55 -0700 Subject: soc: qcom: llcc-qcom: Add support for Kaanapali Add system cache table and configs for Kaanapali SoC. 
Signed-off-by: Jingyi Wang Reviewed-by: Konrad Dybcio Link: https://lore.kernel.org/r/20250924-knp-llcc-v1-2-ae6a016e5138@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/llcc-qcom.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/soc/qcom/llcc-qcom.h b/include/linux/soc/qcom/llcc-qcom.h index 7a69210a250c..0287f9182c4d 100644 --- a/include/linux/soc/qcom/llcc-qcom.h +++ b/include/linux/soc/qcom/llcc-qcom.h @@ -74,7 +74,14 @@ #define LLCC_CAMSRTIP 73 #define LLCC_CAMRTRF 74 #define LLCC_CAMSRTRF 75 +#define LLCC_VIDEO_APV 83 +#define LLCC_COMPUTE1 87 +#define LLCC_CPUSS_OPP 88 #define LLCC_CPUSSMPAM 89 +#define LLCC_CAM_IPE_STROV 92 +#define LLCC_CAM_OFE_STROV 93 +#define LLCC_CPUSS_HEU 94 +#define LLCC_MDM_PNG_FIXED 100 /** * struct llcc_slice_desc - Cache slice descriptor -- cgit v1.2.3 From 67a4b6a89b99aff0883114e4ecba4b11aedc29a5 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 6 Feb 2025 16:44:10 -0500 Subject: lsm: split the init code out into lsm_init.c Continue to pull code out of security/security.c to help improve readability by pulling all of the LSM framework initialization code out into a new file. No code changes. 
Reviewed-by: Kees Cook Reviewed-by: John Johansen Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 79ec5a2bdcca..0112926ed923 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -170,11 +170,10 @@ struct lsm_info { __used __section(".early_lsm_info.init") \ __aligned(sizeof(unsigned long)) + /* DO NOT tamper with these variables outside of the LSM framework */ extern char *lsm_names; extern struct lsm_static_calls_table static_calls_table __ro_after_init; -extern struct lsm_info __start_lsm_info[], __end_lsm_info[]; -extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[]; /** * lsm_get_xattr_slot - Return the next available slot and increment the index -- cgit v1.2.3 From 9f9dc69e06ecbc61e7a50b823b82a78daf130dc0 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 12 Feb 2025 14:45:06 -0500 Subject: lsm: replace the name field with a pointer to the lsm_id struct Reduce the duplication between the lsm_id struct and the DEFINE_LSM() definition by linking the lsm_id struct directly into the individual LSM's DEFINE_LSM() instance. Linking the lsm_id into the LSM definition also allows us to simplify the security_add_hooks() function by removing the code which populates the lsm_idlist[] array and moving it into the normal LSM startup code where the LSM list is parsed and the individual LSMs are enabled, making for a cleaner implementation with less overhead at boot. 
Reviewed-by: Kees Cook Reviewed-by: John Johansen Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 0112926ed923..7343dd60b1d5 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -152,7 +152,7 @@ enum lsm_order { }; struct lsm_info { - const char *name; /* Required. */ + const struct lsm_id *id; enum lsm_order order; /* Optional: default is LSM_ORDER_MUTABLE */ unsigned long flags; /* Optional: flags describing LSM */ int *enabled; /* Optional: controlled by CONFIG_LSM */ -- cgit v1.2.3 From 250898ca335f337bc032a9693dc0a30a1cb85825 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 12 Feb 2025 15:36:51 -0500 Subject: lsm: rework lsm_active_cnt and lsm_idlist[] Move the LSM active count and lsm_id list declarations out of a header that is visible across the kernel and into a header that is limited to the LSM framework. This not only helps keep the include/linux headers smaller and cleaner, it helps prevent misuse of these variables. 
Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/security.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 92ac3f27b973..556890ea2e83 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -167,8 +167,6 @@ struct lsm_prop { }; extern const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1]; -extern u32 lsm_active_cnt; -extern const struct lsm_id *lsm_idlist[]; /* These functions are in security/commoncap.c */ extern int cap_capable(const struct cred *cred, struct user_namespace *ns, -- cgit v1.2.3 From 935d508d4d7ab9d19c603bd7eb2937249551d507 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Thu, 13 Feb 2025 17:34:12 -0500 Subject: lsm: get rid of the lsm_names list and do some cleanup The LSM currently has a lot of code to maintain a list of the currently active LSMs in a human readable string, with the only user being the "/sys/kernel/security/lsm" code. Let's drop all of that code and generate the string on first use and then cache it for subsequent use. 
Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 7343dd60b1d5..65a8227bece7 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -172,7 +172,6 @@ struct lsm_info { /* DO NOT tamper with these variables outside of the LSM framework */ -extern char *lsm_names; extern struct lsm_static_calls_table static_calls_table __ro_after_init; /** -- cgit v1.2.3 From 291271e691740003021cf5b48fa7cf7e3371eaa7 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 11 Feb 2025 17:49:11 -0500 Subject: lsm: cleanup the LSM blob size code Convert the lsm_blob_size fields to unsigned integers as there is no current need for them to be negative, change "lsm_set_blob_size()" to "lsm_blob_size_update()" to better reflect reality, and perform some other minor cleanups to the associated code. Reviewed-by: Kees Cook Reviewed-by: John Johansen Reviewed-by: Casey Schaufler Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 65a8227bece7..86e457aa8809 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -102,23 +102,23 @@ struct security_hook_list { * Security blob size or offset data. 
*/ struct lsm_blob_sizes { - int lbs_cred; - int lbs_file; - int lbs_ib; - int lbs_inode; - int lbs_sock; - int lbs_superblock; - int lbs_ipc; - int lbs_key; - int lbs_msg_msg; - int lbs_perf_event; - int lbs_task; - int lbs_xattr_count; /* number of xattr slots in new_xattrs array */ - int lbs_tun_dev; - int lbs_bdev; - int lbs_bpf_map; - int lbs_bpf_prog; - int lbs_bpf_token; + unsigned int lbs_cred; + unsigned int lbs_file; + unsigned int lbs_ib; + unsigned int lbs_inode; + unsigned int lbs_sock; + unsigned int lbs_superblock; + unsigned int lbs_ipc; + unsigned int lbs_key; + unsigned int lbs_msg_msg; + unsigned int lbs_perf_event; + unsigned int lbs_task; + unsigned int lbs_xattr_count; /* num xattr slots in new_xattrs array */ + unsigned int lbs_tun_dev; + unsigned int lbs_bdev; + unsigned int lbs_bpf_map; + unsigned int lbs_bpf_prog; + unsigned int lbs_bpf_token; }; /* -- cgit v1.2.3 From cdc028812f727907d1575cf454a5f01ddffa7750 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Tue, 11 Feb 2025 12:18:35 -0500 Subject: lsm: introduce an initcall mechanism into the LSM framework Currently the individual LSMs register their own initcalls, and while this should be harmless, it can be wasteful in the case where a LSM is disabled at boot as the initcall will still be executed. This patch introduces support for managing the initcalls in the LSM framework, and future patches will convert the existing LSMs over to this new mechanism. Only initcall types which are used by the current in-tree LSMs are supported, additional initcall types can easily be added in the future if needed. 
Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Reviewed-by: Mimi Zohar Signed-off-by: Paul Moore --- include/linux/lsm_hooks.h | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 86e457aa8809..b92008641242 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -151,13 +151,36 @@ enum lsm_order { LSM_ORDER_LAST = 1, /* This is only for integrity. */ }; +/** + * struct lsm_info - Define an individual LSM for the LSM framework. + * @id: LSM name/ID info + * @order: ordering with respect to other LSMs, optional + * @flags: descriptive flags, optional + * @blobs: LSM blob sharing, optional + * @enabled: controlled by CONFIG_LSM, optional + * @init: LSM specific initialization routine + * @initcall_pure: LSM callback for initcall_pure() setup, optional + * @initcall_early: LSM callback for early_initcall setup, optional + * @initcall_core: LSM callback for core_initcall() setup, optional + * @initcall_subsys: LSM callback for subsys_initcall() setup, optional + * @initcall_fs: LSM callback for fs_initcall setup, optional + * @initcall_device: LSM callback for device_initcall() setup, optional + * @initcall_late: LSM callback for late_initcall() setup, optional + */ struct lsm_info { const struct lsm_id *id; - enum lsm_order order; /* Optional: default is LSM_ORDER_MUTABLE */ - unsigned long flags; /* Optional: flags describing LSM */ - int *enabled; /* Optional: controlled by CONFIG_LSM */ - int (*init)(void); /* Required. */ - struct lsm_blob_sizes *blobs; /* Optional: for blob sharing.
*/ + enum lsm_order order; + unsigned long flags; + struct lsm_blob_sizes *blobs; + int *enabled; + int (*init)(void); + int (*initcall_pure)(void); + int (*initcall_early)(void); + int (*initcall_core)(void); + int (*initcall_subsys)(void); + int (*initcall_fs)(void); + int (*initcall_device)(void); + int (*initcall_late)(void); }; #define DEFINE_LSM(lsm) \ -- cgit v1.2.3 From dfa024bc3f67a97e1a975dd66b83af8b3845eb19 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Fri, 21 Feb 2025 11:53:29 -0500 Subject: lsm: add a LSM_STARTED_ALL notification event Add a new LSM notifier event, LSM_STARTED_ALL, which is fired once at boot when all of the LSMs have been started. Reviewed-by: Kees Cook Reviewed-by: Casey Schaufler Reviewed-by: John Johansen Signed-off-by: Paul Moore --- include/linux/security.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 556890ea2e83..eb36451ce41f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -85,6 +85,7 @@ struct timezone; enum lsm_event { LSM_POLICY_CHANGE, + LSM_STARTED_ALL, }; struct dm_verity_digest { -- cgit v1.2.3 From 094e94d13b606b820e3d1383e3a361f680ff023a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thi=C3=A9baud=20Weksteen?= Date: Thu, 18 Sep 2025 12:04:34 +1000 Subject: memfd,selinux: call security_inode_init_security_anon() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prior to this change, no security hooks were called at the creation of a memfd file. It means that, for SELinux as an example, it will receive the default type of the filesystem that backs the in-memory inode. In most cases, that would be tmpfs, but if MFD_HUGETLB is passed, it will be hugetlbfs. Both can be considered implementation details of memfd. It also means that it is not possible to differentiate between a file coming from memfd_create and a file coming from a standard tmpfs mount point. 
Additionally, no permission is validated at creation, which differs from the similar memfd_secret syscall. Call security_inode_init_security_anon during creation. This ensures that the file is setup similarly to other anonymous inodes. On SELinux, it means that the file will receive the security context of its task. The ability to limit fexecve on memfd has been of interest to avoid potential pitfalls where /proc/self/exe or similar would be executed [1][2]. Reuse the "execute_no_trans" and "entrypoint" access vectors, similarly to the file class. These access vectors may not make sense for the existing "anon_inode" class. Therefore, define and assign a new class "memfd_file" to support such access vectors. Guard these changes behind a new policy capability named "memfd_class". [1] https://crbug.com/1305267 [2] https://lore.kernel.org/lkml/20221215001205.51969-1-jeffxu@google.com/ Signed-off-by: Thiébaud Weksteen Reviewed-by: Stephen Smalley Tested-by: Stephen Smalley Acked-by: Hugh Dickins [PM: subj tweak] Signed-off-by: Paul Moore --- include/linux/memfd.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/memfd.h b/include/linux/memfd.h index 6f606d9573c3..cc74de3dbcfe 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -4,6 +4,8 @@ #include +#define MEMFD_ANON_NAME "[memfd]" + #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -- cgit v1.2.3 From 26ab9830beabda863766be4a79dc590c7645f4d9 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 21 Oct 2025 08:26:49 +0100 Subject: net: stmmac: replace has_xxxx with core_type Replace the has_gmac, has_gmac4 and has_xgmac ints, of which only one can be set when matching a core to its driver backend, with an enumerated type carrying the DWMAC core type. 
Tested-by: Maxime Chevallier Signed-off-by: Russell King (Oracle) Acked-by: Chen-Yu Tsai Reviewed-by: Maxime Chevallier Tested-by: Mohd Ayaan Anwar Reviewed-by: Bartosz Golaszewski Link: https://patch.msgid.link/E1vB6ld-0000000BIPy-2Qi4@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 99022620457a..151c81c560c8 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -171,6 +171,13 @@ struct dwmac4_addrs { u32 mtl_low_cred_offset; }; +enum dwmac_core_type { + DWMAC_CORE_MAC100, + DWMAC_CORE_GMAC, + DWMAC_CORE_GMAC4, + DWMAC_CORE_XGMAC, +}; + #define STMMAC_FLAG_SPH_DISABLE BIT(1) #define STMMAC_FLAG_USE_PHY_WOL BIT(2) #define STMMAC_FLAG_HAS_SUN8I BIT(3) @@ -186,6 +193,7 @@ struct dwmac4_addrs { #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(13) struct plat_stmmacenet_data { + enum dwmac_core_type core_type; int bus_id; int phy_addr; /* MAC ----- optional PCS ----- SerDes ----- optional PHY ----- Media @@ -219,7 +227,6 @@ struct plat_stmmacenet_data { struct stmmac_dma_cfg *dma_cfg; struct stmmac_safety_feature_cfg *safety_feat_cfg; int clk_csr; - int has_gmac; int enh_desc; int tx_coe; int rx_coe; @@ -282,10 +289,8 @@ struct plat_stmmacenet_data { struct reset_control *stmmac_rst; struct reset_control *stmmac_ahb_rst; struct stmmac_axi *axi; - int has_gmac4; int rss_en; int mac_port_sel_speed; - int has_xgmac; u8 vlan_fail_q; struct pci_dev *pdev; int int_snapshot_num; -- cgit v1.2.3 From 114573962a68a527835f2f1433a89bc2f9feac1b Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 21 Oct 2025 19:46:26 +0800 Subject: net/sched: Remove unused inline helper qdisc_from_priv() Since commit fb38306ceb9e ("net/sched: Retire ATM qdisc"), this is not used and can be removed. 
Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20251021114626.3148894-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 8a75c73fc555..c660ac871083 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -25,11 +25,6 @@ struct qdisc_walker { const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) -static inline struct Qdisc *qdisc_from_priv(void *priv) -{ - return container_of(priv, struct Qdisc, privdata); -} - /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth -- cgit v1.2.3 From 7958b4bb806c1af800ca23c8333a98231b3ab0b1 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 22 Oct 2025 15:23:42 +0200 Subject: pinctrl: pinmux: Add missing .function_is_gpio kerneldoc This callback was undocumented, add the docs. Reviewed-by: Bartosz Golaszewski Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinmux.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/pinctrl/pinmux.h b/include/linux/pinctrl/pinmux.h index 6db6c3e1ccc2..094bbe2fd6fd 100644 --- a/include/linux/pinctrl/pinmux.h +++ b/include/linux/pinctrl/pinmux.h @@ -35,6 +35,16 @@ struct pinctrl_gpio_range; * name can be used with the generic @pinctrl_ops to retrieve the * actual pins affected. The applicable groups will be returned in * @groups and the number of groups in @num_groups + * @function_is_gpio: determine if the indicated function selector passed + * corresponds to the GPIO function which is used by the accelerated GPIO + * functions @gpio_request_enable, @gpio_disable_free and + * @gpio_set_direction. When the pin control core can properly determine + * if a function is a GPIO function, it is easier to use the @strict mode + * on the pin controller. 
Since a single function is passed, this is + * only useful on pin controllers that use a specific function for GPIO, + * and that usually presupposes that a one-group-per-pin approach is + * used, so that a single function can be set on a single pin to turn + * it to GPIO mode. * @set_mux: enable a certain muxing function with a certain pin group. The * driver does not need to figure out whether enabling this function * conflicts some other use of the pins in that group, such collisions -- cgit v1.2.3 From 243ce64b2b371cdf2cbc39c9422cb3047cab6de7 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 13 Oct 2025 12:51:57 +0200 Subject: backlight: Do not include <linux/fb.h> in header file The backlight interfaces don't require anything from <linux/fb.h>, so don't include it. Signed-off-by: Thomas Zimmermann Reviewed-by: Daniel Thompson (RISCstar) Reviewed-by: Simona Vetter Link: https://patch.msgid.link/20251013105553.836715-1-tzimmermann@suse.de Signed-off-by: Lee Jones --- include/linux/backlight.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 10e626db7eee..f29a9ef1052e 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -10,7 +10,6 @@ #define _LINUX_BACKLIGHT_H #include -#include <linux/fb.h> #include #include -- cgit v1.2.3 From c2afdd73e5ba2146c7e8b43b2607da5d4b720d9d Mon Sep 17 00:00:00 2001 From: Avri Altman Date: Thu, 19 Jun 2025 11:56:19 +0300 Subject: mmc: core: Skip to set the default 200mA SD current limit Let's avoid updating the SD current limit when the maximum power is 200mA (0.72W) or less, as this is already the default value for the SD card. In this way we avoid sending an unnecessary command during initialization.
Signed-off-by: Avri Altman Signed-off-by: Ulf Hansson --- include/linux/mmc/card.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index ddcdf23d731c..e9e964c20e53 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -182,7 +182,6 @@ struct sd_switch_caps { #define SD_SET_CURRENT_LIMIT_400 1 #define SD_SET_CURRENT_LIMIT_600 2 #define SD_SET_CURRENT_LIMIT_800 3 -#define SD_SET_CURRENT_NO_CHANGE (-1) #define SD_MAX_CURRENT_200 (1 << SD_SET_CURRENT_LIMIT_200) #define SD_MAX_CURRENT_400 (1 << SD_SET_CURRENT_LIMIT_400) -- cgit v1.2.3 From 7b2c4224faa7bc6cdaf1fb6106ec7b46c63a28cb Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Oct 2025 13:00:55 -0700 Subject: scsi: ufs: core: Improve documentation in include/ufs/ufshci.h Make it easier to find the sections in the UFSHCI standard where these constants come from. Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251014200118.3390839-4-bvanassche@acm.org Signed-off-by: Martin K. 
Petersen --- include/ufs/ufshci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/ufs/ufshci.h b/include/ufs/ufshci.h index e64b70132101..ff96056b2ac3 100644 --- a/include/ufs/ufshci.h +++ b/include/ufs/ufshci.h @@ -83,12 +83,14 @@ enum { }; enum { + /* Submission Queue (SQ) Configuration Registers */ REG_SQATTR = 0x0, REG_SQLBA = 0x4, REG_SQUBA = 0x8, REG_SQDAO = 0xC, REG_SQISAO = 0x10, + /* Completion Queue (CQ) Configuration Registers */ REG_CQATTR = 0x20, REG_CQLBA = 0x24, REG_CQUBA = 0x28, @@ -96,6 +98,7 @@ enum { REG_CQISAO = 0x30, }; +/* Operation and Runtime Registers - Submission Queues and Completion Queues */ enum { REG_SQHP = 0x0, REG_SQTP = 0x4, -- cgit v1.2.3 From b3b0842bcb0696e25b1977238ce2907a4c02d8c4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Oct 2025 13:00:56 -0700 Subject: scsi: ufs: core: Change the type of uic_command::cmd_active Since uic_command::cmd_active is used as a boolean variable, change its type from 'int' into 'bool'. No functionality has been changed. Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251014200118.3390839-5-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 9425cfd9d00e..4d215a18522c 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -78,7 +78,7 @@ struct uic_command { const u32 argument1; u32 argument2; u32 argument3; - int cmd_active; + bool cmd_active; struct completion done; }; -- cgit v1.2.3 From b30006b5bec1dcba207bc42e7f7cd96a568acc27 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Oct 2025 13:00:58 -0700 Subject: scsi: ufs: core: Move the ufshcd_enable_intr() declaration ufshcd_enable_intr() is not exported and hence should not be declared in include/ufs/ufshcd.h. 
Fixes: 253757797973 ("scsi: ufs: core: Change MCQ interrupt enable flow") Signed-off-by: Bart Van Assche Reviewed-by: Peter Wang Link: https://patch.msgid.link/20251014200118.3390839-7-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 4d215a18522c..edfbc3a216be 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1295,7 +1295,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg) void ufshcd_enable_irq(struct ufs_hba *hba); void ufshcd_disable_irq(struct ufs_hba *hba); -void ufshcd_enable_intr(struct ufs_hba *hba, u32 intrs); int ufshcd_alloc_host(struct device *, struct ufs_hba **); int ufshcd_hba_enable(struct ufs_hba *hba); int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int); -- cgit v1.2.3 From 4da42aaa82d6e3fa2e822e6e771d031c2e20a6c7 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:54 -0300 Subject: printk: nbcon: Export console_is_usable The helper will be used on KDB code in the next commits. 
Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-1-866aac60a80e@suse.com Signed-off-by: Petr Mladek --- include/linux/console.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 8f10d0a85bb4..5c3a718c22fc 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -605,6 +606,48 @@ extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); + +/* + * Check if the given console is currently capable and allowed to print + * records. Note that this function does not consider the current context, + * which can also play a role in deciding if @con can be used to print + * records. + */ +static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) +{ + if (!(flags & CON_ENABLED)) + return false; + + if ((flags & CON_SUSPENDED)) + return false; + + if (flags & CON_NBCON) { + /* The write_atomic() callback is optional. */ + if (use_atomic && !con->write_atomic) + return false; + + /* + * For the !use_atomic case, @printk_kthreads_running is not + * checked because the write_thread() callback is also used + * via the legacy loop when the printer threads are not + * available. + */ + } else { + if (!con->write) + return false; + } + + /* + * Console drivers may assume that per-cpu resources have been + * allocated. So unless they're explicitly marked as being able to + * cope (CON_ANYTIME) don't call them until this CPU is officially up. 
+ */ + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) + return false; + + return true; +} + #else static inline void nbcon_cpu_emergency_enter(void) { } static inline void nbcon_cpu_emergency_exit(void) { } @@ -612,6 +655,8 @@ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } +static inline bool console_is_usable(struct console *con, short flags, + bool use_atomic) { return false; } #endif extern int console_set_on_cmdline; -- cgit v1.2.3 From 49f7d3054e84617395a37a058251c81320a3614a Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:55 -0300 Subject: printk: nbcon: Introduce KDB helpers These helpers will be used when calling console->write_atomic on KDB code in the next patch. It's basically the same implementation as nbcon_device_try_acquire, but using NBCON_PRIO_EMERGENCY when acquiring the context. If the acquire succeeds, the message and message length are assigned to nbcon_write_context so ->write_atomic can print the message. After release try to flush the console since there may be a backlog of messages in the ringbuffer. The kthread console printers do not get a chance to run while kdb is active. 
Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-2-866aac60a80e@suse.com Signed-off-by: Petr Mladek --- include/linux/console.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 5c3a718c22fc..9406342b27db 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -606,6 +606,9 @@ extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); +extern bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt); +extern void nbcon_kdb_release(struct nbcon_write_context *wctxt); /* * Check if the given console is currently capable and allowed to print @@ -655,6 +658,9 @@ static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } +static inline bool nbcon_kdb_try_acquire(struct console *con, + struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_kdb_release(struct console *con) { } static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) { return false; } #endif -- cgit v1.2.3 From 286b113d70007e932d18aa0acfce1a3f5b25d8d1 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:56 -0300 Subject: printk: nbcon: Allow KDB to acquire the NBCON context KDB can interrupt any console to execute the "mirrored printing" at any time, so add an exception to nbcon_context_try_acquire_direct to allow to get the 
context if the current CPU is the same as kdb_printf_cpu. This change will be necessary for the next patch, which fixes kdb_msg_write to work with NBCON consoles by calling ->write_atomic on such consoles. But to print it first needs to acquire the ownership of the console, so nbcon_context_try_acquire_direct is fixed here. Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Reviewed-by: Petr Mladek Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-3-866aac60a80e@suse.com [pmladek@suse.com: Fix compilation with !CONFIG_KGDB_KDB.] Signed-off-by: Petr Mladek --- include/linux/kdb.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index ecbf819deeca..741c58e86431 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -14,6 +14,7 @@ */ #include +#include /* Shifted versions of the command enable bits are be used if the command * has no arguments (see kdb_check_flags). This allows commands, such as @@ -207,11 +208,26 @@ static inline const char *kdb_walk_kallsyms(loff_t *pos) /* Dynamic kdb shell command registration */ extern int kdb_register(kdbtab_t *cmd); extern void kdb_unregister(kdbtab_t *cmd); + +/* Return true when KDB as locked for printing a message on this CPU. */ +static inline +bool kdb_printf_on_this_cpu(void) +{ + /* + * We can use raw_smp_processor_id() here because the task could + * not get migrated when KDB has locked for printing on this CPU. + */ + return unlikely(READ_ONCE(kdb_printf_cpu) == raw_smp_processor_id()); +} + #else /* ! CONFIG_KGDB_KDB */ static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) 
{ return 0; } static inline void kdb_init(int level) {} static inline int kdb_register(kdbtab_t *cmd) { return 0; } static inline void kdb_unregister(kdbtab_t *cmd) {} + +static inline bool kdb_printf_on_this_cpu(void) { return false; } + #endif /* CONFIG_KGDB_KDB */ enum { KDB_NOT_INITIALIZED, -- cgit v1.2.3 From 4349cf0df34f37d2470d246bc9be8d9836dfa49e Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:57 -0300 Subject: printk: nbcon: Export nbcon_write_context_set_buf This function will be used in the next patch to allow a driver to set both the message and message length of a nbcon_write_context. This is necessary because the function also initializes the ->unsafe_takeover struct member. By using this helper we ensure that the struct is initialized correctly. Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-4-866aac60a80e@suse.com Signed-off-by: Petr Mladek --- include/linux/console.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 9406342b27db..4585eb8e109e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -603,6 +603,8 @@ static inline bool console_is_registered(const struct console *con) extern void nbcon_cpu_emergency_enter(void); extern void nbcon_cpu_emergency_exit(void); extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt); +extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len); extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); @@ -655,6 +657,8 @@ static inline bool console_is_usable(struct console *con, short flags, bool use_ static inline void nbcon_cpu_emergency_enter(void) { } static inline void 
nbcon_cpu_emergency_exit(void) { } static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; } +static inline void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, + char *buf, unsigned int len) { } static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; } static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } -- cgit v1.2.3 From 62627bf0cadf6eae87d92fecf604c42160fe16ef Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Thu, 16 Oct 2025 11:47:58 -0300 Subject: kdb: Adapt kdb_msg_write to work with NBCON consoles Function kdb_msg_write was calling con->write for any found console, but it won't work on NBCON consoles. In this case we should acquire the ownership of the console using NBCON_PRIO_EMERGENCY, since printing kdb messages should only be interrupted by a panic. At this point, the console is required to use the atomic callback. The console is skipped if the write_atomic callback is not set or if the context could not be acquired. The validation of NBCON is done by the console_is_usable helper. The context is released right after write_atomic finishes. The oops_in_progress handling is only needed in the legacy consoles, so it was moved around the con->write callback. Suggested-by: Petr Mladek Reviewed-by: Petr Mladek Reviewed-by: John Ogness Signed-off-by: Marcos Paulo de Souza Link: https://patch.msgid.link/20251016-nbcon-kgdboc-v6-5-866aac60a80e@suse.com [pmladek@suse.com: Fixed compilation with !CONFIG_PRINTK.] 
Signed-off-by: Petr Mladek --- include/linux/console.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index 4585eb8e109e..d17f1f525bec 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -664,7 +664,7 @@ static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { } static inline bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt) { return false; } -static inline void nbcon_kdb_release(struct console *con) { } +static inline void nbcon_kdb_release(struct nbcon_write_context *wctxt) { } static inline bool console_is_usable(struct console *con, short flags, bool use_atomic) { return false; } #endif -- cgit v1.2.3 From 245f14f5fe283c782b16143280f283bee29dbb5f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 30 Sep 2025 12:30:55 +0800 Subject: interconnect: Optimize kbps_to_icc() macro The current expansion of kbps_to_icc() introduces unnecessary logic when compiled from a general expression. Rewriting it allows compilers to emit shorter and more efficient code across architectures. For example, with gcc -O2: arm64: old: tst x0, 7 add w1, w0, 7 cset w2, ne cmp w0, 0 csel w0, w1, w0, lt add w0, w2, w0, asr 3 new: add w1, w0, 14 adds w0, w0, 7 csel w0, w1, w0, mi asr w0, w0, 3 x86-64: old: xor eax, eax test dil, 7 lea edx, [rdi+7] setne al test edi, edi cmovns edx, edi sar edx, 3 add eax, edx new: lea eax, [rdi+14] add edi, 7 cmovns eax, edi sar eax, 3 In both cases the old form relies on extra test and compare instructions (tst, test, cmp) combined with conditional moves or sets, while the new form uses fewer instructions by folding the addition and flag update together (adds on arm64, add on x86). 
This reduces the instruction sequence, prevents multiple evaluations of x when it is an expression or a function call, and keeps the macro simpler. Signed-off-by: Kuan-Wei Chiu Link: https://lore.kernel.org/r/20250930043055.2200322-1-visitorckw@gmail.com Signed-off-by: Georgi Djakov --- include/linux/interconnect.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interconnect.h b/include/linux/interconnect.h index e4b8808823ad..4b12821528a6 100644 --- a/include/linux/interconnect.h +++ b/include/linux/interconnect.h @@ -16,7 +16,7 @@ #define MBps_to_icc(x) ((x) * 1000) #define GBps_to_icc(x) ((x) * 1000 * 1000) #define bps_to_icc(x) (1) -#define kbps_to_icc(x) ((x) / 8 + ((x) % 8 ? 1 : 0)) +#define kbps_to_icc(x) (((x) + 7) / 8) #define Mbps_to_icc(x) ((x) * 1000 / 8) #define Gbps_to_icc(x) ((x) * 1000 * 1000 / 8) -- cgit v1.2.3 From 70e0a80a1f3580ccf5bc1f34dbb433c67d9d8d00 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 Oct 2025 19:06:51 +0100 Subject: treewide: Remove in_irq() This old alias for in_hardirq() has been marked as deprecated since 2020; remove the stragglers. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251024180654.1691095-1-willy@infradead.org --- include/linux/lockdep.h | 2 +- include/linux/preempt.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 67964dc4db95..dd634103b014 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -616,7 +616,7 @@ do { \ #define lockdep_assert_in_softirq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && \ - (!in_softirq() || in_irq() || in_nmi())); \ + (!in_softirq() || in_hardirq() || in_nmi())); \ } while (0) extern void lockdep_assert_in_softirq_func(void); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 102202185d7a..d964f965c8ff 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -134,11 +134,9 @@ static __always_inline unsigned char interrupt_context_level(void) /* * The following macros are deprecated and should not be used in new code: - * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ -#define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -- cgit v1.2.3 From e30f8e61e2518a837837daa26cda3c8cc30f3226 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Oct 2025 20:43:40 -0400 Subject: tracing: Add a tracepoint verification check at build time If a tracepoint is defined via DECLARE_TRACE() or TRACE_EVENT() but never called (via the trace_() function), its metadata is still around in memory and not discarded. When created via TRACE_EVENT() the situation is worse because the TRACE_EVENT() creates metadata that can be around 5k per trace event. Having unused trace events causes several thousand wasted bytes.
Add a verifier that injects a string of the name of the tracepoint it calls that is added to the discarded section "__tracepoint_check". For every builtin tracepoint, its name (which is saved in the in-memory section "__tracepoint_strings") will have its name also in the "__tracepoint_check" section if it is used. Add a new program that is run on build called tracepoint-update. This is executed on the vmlinux.o before the __tracepoint_check section is discarded (the section is discarded before vmlinux is created). This program will create an array of each string in the __tracepoint_check section and then sort it. Then it will walk the strings in the __tracepoint_strings section and do a binary search to check if its name is in the __tracepoint_check section. If it is not, then it is unused and a warning is printed. Note, this currently only handles tracepoints that are builtin and not in modules. Enabling this currently with a given config produces: warning: tracepoint 'sched_move_numa' is unused. warning: tracepoint 'sched_stick_numa' is unused. warning: tracepoint 'sched_swap_numa' is unused. warning: tracepoint 'pelt_hw_tp' is unused. warning: tracepoint 'pelt_irq_tp' is unused. warning: tracepoint 'rcu_preempt_task' is unused. warning: tracepoint 'rcu_unlock_preempted_task' is unused. warning: tracepoint 'xdp_bulk_tx' is unused. warning: tracepoint 'xdp_redirect_map' is unused. warning: tracepoint 'xdp_redirect_map_err' is unused. warning: tracepoint 'vma_mas_szero' is unused. warning: tracepoint 'vma_store' is unused. warning: tracepoint 'hugepage_set_pmd' is unused. warning: tracepoint 'hugepage_set_pud' is unused. warning: tracepoint 'hugepage_update_pmd' is unused. warning: tracepoint 'hugepage_update_pud' is unused. warning: tracepoint 'block_rq_remap' is unused. warning: tracepoint 'xhci_dbc_handle_event' is unused. warning: tracepoint 'xhci_dbc_handle_transfer' is unused. warning: tracepoint 'xhci_dbc_gadget_ep_queue' is unused. 
warning: tracepoint 'xhci_dbc_alloc_request' is unused. warning: tracepoint 'xhci_dbc_free_request' is unused. warning: tracepoint 'xhci_dbc_queue_request' is unused. warning: tracepoint 'xhci_dbc_giveback_request' is unused. warning: tracepoint 'tcp_ao_wrong_maclen' is unused. warning: tracepoint 'tcp_ao_mismatch' is unused. warning: tracepoint 'tcp_ao_key_not_found' is unused. warning: tracepoint 'tcp_ao_rnext_request' is unused. warning: tracepoint 'tcp_ao_synack_no_key' is unused. warning: tracepoint 'tcp_ao_snd_sne_update' is unused. warning: tracepoint 'tcp_ao_rcv_sne_update' is unused. Some of the above is totally unused but others are not used due to their "trace_" functions being inside configs, in which case, the defined tracepoints should also be inside those same configs. Others are architecture specific but defined in generic code, where they should either be moved to the architecture or be surrounded by #ifdef for the architectures they are for. This tool could be updated to process modules in the future. I'd like to thank Mathieu Desnoyers for suggesting using strings instead of pointers, as using pointers in vmlinux.o required handling relocations and it required implementing almost a full feature linker to do so. To enable this check, run the build with: make UT=1 Note, when all the existing unused tracepoints are removed from the build, the "UT=1" will be removed and this will always be enabled when tracepoints are configured to warn on any new tracepoints. The reason this isn't always enabled now is because it will introduce a lot of warnings for the current unused tracepoints, and all bisects would end at this commit for those warnings. 
Link: https://lore.kernel.org/all/20250528114549.4d8a5e03@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Cc: Randy Dunlap Cc: Stephen Rothwell Link: https://lore.kernel.org/20251022004452.920728129@kernel.org Suggested-by: Mathieu Desnoyers # for using strings instead of pointers Signed-off-by: Steven Rostedt (Google) --- include/asm-generic/vmlinux.lds.h | 1 + include/linux/tracepoint.h | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a9a2e732a65..c510fb097a8c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -1048,6 +1048,7 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) *(.no_trim_symbol) \ /* ld.bfd warns about .gnu.version* even when not emitted */ \ *(.gnu.version*) \ + *(__tracepoint_check) \ #define DISCARDS \ /DISCARD/ : { \ diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 826ce3f8e1f8..1e53d3626c78 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -221,6 +221,15 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __do_trace_##name(args); \ } +/* + * When a tracepoint is used, it's name is added to the __tracepoint_check + * section. This section is only used at build time to make sure all + * defined tracepoints are used. It is discarded after the build. 
+ */ +# define TRACEPOINT_CHECK(name) \ + static const char __used __section("__tracepoint_check") __trace_check[] = \ + #name; + /* * Make sure the alignment of the structure in the __tracepoints section will * not add unwanted padding between the beginning of the section and the @@ -270,6 +279,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ if (cond) { \ guard(preempt_notrace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ @@ -289,6 +299,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DECLARE_TRACE_COMMON(name, PARAMS(proto), PARAMS(args), PARAMS(data_proto)) \ static inline void __do_trace_##name(proto) \ { \ + TRACEPOINT_CHECK(name) \ guard(rcu_tasks_trace)(); \ __DO_TRACE_CALL(name, TP_ARGS(args)); \ } \ -- cgit v1.2.3 From faf938153cad98d97f60ac835ead1db74961507e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Oct 2025 20:43:41 -0400 Subject: tracepoint: Do not warn for unused event that is exported There are a few generic events that may only be used by modules. They are defined and then set with EXPORT_TRACEPOINT*(). Mark events that are exported as being used, even though they still waste memory in the kernel proper. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Arnd Bergmann Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Nicolas Schier Cc: Nick Desaulniers Cc: Catalin Marinas Cc: Linus Torvalds Cc: Randy Dunlap Cc: Stephen Rothwell Link: https://lore.kernel.org/20251022004453.089254920@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/tracepoint.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 1e53d3626c78..8a56f3278b1b 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -227,8 +227,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * defined tracepoints are used. It is discarded after the build. */ # define TRACEPOINT_CHECK(name) \ - static const char __used __section("__tracepoint_check") __trace_check[] = \ - #name; + static const char __used __section("__tracepoint_check") \ + __trace_check_##name[] = #name; /* * Make sure the alignment of the structure in the __tracepoints section will @@ -382,10 +382,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) __DEFINE_TRACE_EXT(_name, NULL, PARAMS(_proto), PARAMS(_args)); #define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL_GPL(__tracepoint_##name); \ EXPORT_SYMBOL_GPL(__traceiter_##name); \ EXPORT_STATIC_CALL_GPL(tp_func_##name) #define EXPORT_TRACEPOINT_SYMBOL(name) \ + TRACEPOINT_CHECK(name) \ EXPORT_SYMBOL(__tracepoint_##name); \ EXPORT_SYMBOL(__traceiter_##name); \ EXPORT_STATIC_CALL(tp_func_##name) -- cgit v1.2.3 From 35d7c70870338aa6a367b9e4ed528914320b0be0 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 22 Oct 2025 05:39:46 +0000 Subject: neighbour: Annotate access to neigh_parms fields. NEIGH_VAR() is read locklessly in the fast path, and IPv6 ndisc uses NEIGH_VAR_SET() locklessly. 
The next patch will convert neightbl_dump_info() to RCU. Let's annotate accesses to neigh_param with READ_ONCE() and WRITE_ONCE(). Note that ndisc_ifinfo_sysctl_change() uses &NEIGH_VAR() and we cannot use '&' with READ_ONCE(), so NEIGH_VAR_PTR() is introduced. Note also that NEIGH_VAR_INIT() does not need WRITE_ONCE() as it is before parms is published. Also, the only user hippi_neigh_setup_dev() is no longer called since commit e3804cbebb67 ("net: remove COMPAT_NET_DEV_OPS"), which looks wrong, but probably no one uses HIPPI and RoadRunner. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251022054004.2514876-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 4a30bd458c5a..998ff9eccebb 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -92,15 +92,17 @@ struct neigh_parms { static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); - p->data[index] = val; + WRITE_ONCE(p->data[index], val); } -#define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) +#define __NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) +#define NEIGH_VAR(p, attr) READ_ONCE(__NEIGH_VAR(p, attr)) +#define NEIGH_VAR_PTR(p, attr) (&(__NEIGH_VAR(p, attr))) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. 
*/ -#define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) +#define NEIGH_VAR_INIT(p, attr, val) (__NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) @@ -378,6 +380,13 @@ struct net *neigh_parms_net(const struct neigh_parms *parms) unsigned long neigh_rand_reach_time(unsigned long base); +static inline void neigh_set_reach_time(struct neigh_parms *p) +{ + unsigned long base = NEIGH_VAR(p, BASE_REACHABLE_TIME); + + WRITE_ONCE(p->reachable_time, neigh_rand_reach_time(base)); +} + void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, -- cgit v1.2.3 From 3064d0fe02af23a3956d2b690461abb44da88cf4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 22 Oct 2025 05:39:49 +0000 Subject: neighbour: Convert rwlock of struct neigh_table to spinlock. Only neigh_for_each() and neigh_seq_start/stop() are on the reader side of neigh_table.lock. Let's convert rwlock to the plain spinlock. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251022054004.2514876-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 998ff9eccebb..2dfee6d4258a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -238,7 +238,7 @@ struct neigh_table { atomic_t gc_entries; struct list_head gc_list; struct list_head managed_list; - rwlock_t lock; + spinlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; -- cgit v1.2.3 From 330ce8ffc1848cbfa3e06c2c22750cfffa115579 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:30 +0100 Subject: net: phy: add phy_can_wakeup() Add phy_can_wakeup() to report whether the PHY driver has marked the PHY device as being wake-up capable as far as the driver model is concerned. Reviewed-by: Maxime Chevallier Reviewed-by: Florian Fainelli Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrQs-0000000BLzI-0w3U@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3c7634482356..3eeeaec52832 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1379,6 +1379,18 @@ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode linkmode_clear_bit(link_mode, phydev->advertising_eee); } +/** + * phy_can_wakeup() - indicate whether PHY has driver model wakeup capabilities + * @phydev: The phy_device struct + * + * Returns: true/false depending on the PHY driver's device_set_wakeup_capable() + * setting. 
+ */ +static inline bool phy_can_wakeup(struct phy_device *phydev) +{ + return device_can_wakeup(&phydev->mdio.dev); +} + void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); -- cgit v1.2.3 From b344bfacf1de2dd776a218ce8341b9c672745a01 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:35 +0100 Subject: net: phy: add phy_may_wakeup() Add phy_may_wakeup() which uses the driver model's device_may_wakeup() when the PHY driver has marked the device as wakeup capable in the driver model, otherwise use phy_drv_wol_enabled(). Replace the sites that used to call phy_drv_wol_enabled() with this as checking the driver model will be more efficient than checking the WoL state. Export phy_may_wakeup() so that phylink can use it. Reviewed-by: Maxime Chevallier Reviewed-by: Florian Fainelli Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrQx-0000000BLzO-1RLt@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 3eeeaec52832..17a2cdc9f1a0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1391,6 +1391,15 @@ static inline bool phy_can_wakeup(struct phy_device *phydev) return device_can_wakeup(&phydev->mdio.dev); } +/** + * phy_may_wakeup() - indicate whether PHY has wakeup enabled + * @phydev: The phy_device struct + * + * Returns: true/false depending on the PHY driver's device_set_wakeup_enabled() + * setting if using the driver model, otherwise the legacy determination. 
+ */ +bool phy_may_wakeup(struct phy_device *phydev); + void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); -- cgit v1.2.3 From b79fbd86c84918790c128e6899b420de4667018e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:40 +0100 Subject: net: phylink: add phylink managed MAC Wake-on-Lan support Add core phylink managed Wake-on-Lan support, which is enabled when the MAC driver fills in the new .mac_wol_set() method that this commit creates. When this feature is disabled, phylink acts as it has in the past, merely passing the ethtool WoL calls to phylib whenever a PHY exists. No other new functionality provided by this commit is enabled. When this feature is enabled, a more intelligent approach is used. Phylink will first pass WoL options to the PHY, read them back, and attempt to set any options that were not set at the PHY at the MAC. Since we have PHY drivers that report they support WoL, and accept WoL configuration even though they aren't wired up to be capable of waking the system, we need a way to differentiate between PHYs that think they support WoL and those which actually do. As PHY drivers do not make use of the driver model's wake-up infrastructure, but could, we use this to determine whether PHY drivers can participate. This gives a path forward where, as MAC drivers are converted to this, it encourages PHY drivers to also be converted. Phylink will also ignore the mac_wol argument to phylink_suspend() as it now knows the WoL state at the MAC. MAC drivers are expected to record/configure the Wake-on-Lan state in their .mac_wol_set() method, and deal appropriately with it in their suspend/resume methods. The driver model provides assistance to set the IRQ wake support which may assist driver authors in achieving the necessary configuration.
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrR2-0000000BLzU-1xYL@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 9af0411761d7..59cb58b29d1d 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -156,6 +156,8 @@ enum phylink_op_type { * @lpi_capabilities: MAC speeds which can support LPI signalling * @lpi_timer_default: Default EEE LPI timer setting. * @eee_enabled_default: If set, EEE will be enabled by phylink at creation time + * @wol_phy_legacy: Use Wake-on-Lan with PHY even if phy_can_wakeup() is false + * @wol_mac_support: Bitmask of MAC supported %WAKE_* options */ struct phylink_config { struct device *dev; @@ -173,6 +175,10 @@ struct phylink_config { unsigned long lpi_capabilities; u32 lpi_timer_default; bool eee_enabled_default; + + /* Wake-on-Lan support */ + bool wol_phy_legacy; + u32 wol_mac_support; }; void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); @@ -188,6 +194,7 @@ void phylink_limit_mac_speed(struct phylink_config *config, u32 max_speed); * @mac_link_up: allow the link to come up. * @mac_disable_tx_lpi: disable LPI. * @mac_enable_tx_lpi: enable and configure LPI. + * @mac_wol_set: configure Wake-on-Lan settings at the MAC. * * The individual methods are described more fully below. */ @@ -211,6 +218,9 @@ struct phylink_mac_ops { void (*mac_disable_tx_lpi)(struct phylink_config *config); int (*mac_enable_tx_lpi)(struct phylink_config *config, u32 timer, bool tx_clk_stop); + + int (*mac_wol_set)(struct phylink_config *config, u32 wolopts, + const u8 *sopass); }; #if 0 /* For kernel-doc purposes only. 
*/ @@ -440,6 +450,22 @@ void mac_disable_tx_lpi(struct phylink_config *config); */ int mac_enable_tx_lpi(struct phylink_config *config, u32 timer, bool tx_clk_stop); + +/** + * mac_wol_set() - configure the Wake-on-Lan parameters + * @config: a pointer to a &struct phylink_config. + * @wolopts: Bitmask of %WAKE_* flags for enabled Wake-On-Lan modes. + * @sopass: SecureOn(tm) password; meaningful only for %WAKE_MAGICSECURE + * + * Enable the specified Wake-on-Lan options at the MAC. Options that the + * PHY can handle will have been removed from @wolopts. + * + * The presence of this method enables phylink-managed WoL support. + * + * Returns: 0 on success. + */ +int (*mac_wol_set)(struct phylink_config *config, u32 wolopts, + const u8 *sopass); #endif struct phylink_pcs_ops; -- cgit v1.2.3 From dc1a2a9ce5b2c80e02115ff6fb29b726ad9d7777 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 23 Oct 2025 10:16:45 +0100 Subject: net: phylink: add phylink managed wake-on-lan PHY speed control Some drivers, e.g. stmmac, use the speed_up()/speed_down() APIs to gain additional power saving during Wake-on-LAN where the PHY is managing the state. Add support to phylink for this, which can be enabled by the MAC driver. Only change the PHY speed if the PHY is configured for wake-up, but without any wake-up on the MAC side, as MAC side means changing the configuration once the negotiation has completed. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vBrR7-0000000BLza-2PjK@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 59cb58b29d1d..38363e566ac3 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -157,6 +157,7 @@ enum phylink_op_type { * @lpi_timer_default: Default EEE LPI timer setting. 
* @eee_enabled_default: If set, EEE will be enabled by phylink at creation time * @wol_phy_legacy: Use Wake-on-Lan with PHY even if phy_can_wakeup() is false + * @wol_phy_speed_ctrl: Use phy speed control on suspend/resume * @wol_mac_support: Bitmask of MAC supported %WAKE_* options */ struct phylink_config { @@ -178,6 +179,7 @@ struct phylink_config { /* Wake-on-Lan support */ bool wol_phy_legacy; + bool wol_phy_speed_ctrl; u32 wol_mac_support; }; -- cgit v1.2.3 From eea31f21dce10814e34dc7ef7ed5136269c7bb59 Mon Sep 17 00:00:00 2001 From: Adithya Jayachandran Date: Wed, 15 Oct 2025 18:40:55 -0700 Subject: {rdma,net}/mlx5: Query vports mac address from device Before this patch during either switchdev or legacy mode enablement we cleared the mac address of vports between changes. This change allows us to preserve the vports mac address between eswitch mode changes. Vports hold information for VFs/SFs such as the permanent mac address. VF/SF mac can be set either by iproute vf interface or devlink function interface. For no obvious reason we reset it to 0 on switchdev/legacy mode changes; this patch fixes that, to align with other vport information that is never reset, e.g. GUID, mtu, promisc mode, etc. 
Signed-off-by: Adithya Jayachandran Signed-off-by: Saeed Mahameed Reviewed-by: Mark Bloch Acked-by: Leon Romanovsky # RDMA --- include/linux/mlx5/vport.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index c87b9507cfa1..f876bfc0669c 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -73,7 +73,8 @@ int mlx5_modify_nic_vport_mtu(struct mlx5_core_dev *mdev, u16 mtu); int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, u64 *system_image_guid); int mlx5_query_nic_vport_sd_group(struct mlx5_core_dev *mdev, u8 *sd_group); -int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, + u16 vport, bool other_vport, u64 *node_guid); int mlx5_modify_nic_vport_node_guid(struct mlx5_core_dev *mdev, u16 vport, u64 node_guid); int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, -- cgit v1.2.3 From a392cde88d19af917740d27e13115447d3b21a06 Mon Sep 17 00:00:00 2001 From: Ryder Lee Date: Tue, 23 Sep 2025 17:23:22 +0000 Subject: wifi: cfg80211/mac80211: validate radio frequency range for monitor mode In multi-radio devices, it is possible to have an MLD AP and a monitor interface active at the same time. In such cases, monitor mode may not be able to specify a fixed channel and could end up capturing frames from all radios, including those outside the intended frequency bands. This patch adds frequency validation for monitor mode. Received frames are now only processed if their frequency falls within the allowed ranges of the radios specified by the interface's radio_mask. This prevents monitor mode from capturing frames outside the supported radio. 
Signed-off-by: Ryder Lee Link: https://patch.msgid.link/700b8284e845d96654eb98431f8eeb5a81503862.1758647858.git.ryder.lee@mediatek.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 781624f5913a..3b6f48a783bb 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1015,6 +1015,7 @@ const struct cfg80211_chan_def * cfg80211_chandef_compatible(const struct cfg80211_chan_def *chandef1, const struct cfg80211_chan_def *chandef2); + /** * nl80211_chan_width_to_mhz - get the channel width in MHz * @chan_width: the channel width from &enum nl80211_chan_width @@ -6882,6 +6883,19 @@ static inline bool cfg80211_channel_is_psc(struct ieee80211_channel *chan) return ieee80211_frequency_to_channel(chan->center_freq) % 16 == 5; } +/** + * ieee80211_radio_freq_range_valid - Check if the radio supports the + * specified frequency range + * + * @radio: wiphy radio + * @freq: the frequency (in KHz) to be queried + * @width: the bandwidth (in KHz) to be queried + * + * Return: whether or not the given frequency range is valid for the given radio + */ +bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, + u32 freq, u32 width); + /** * cfg80211_radio_chandef_valid - Check if the radio supports the chandef * -- cgit v1.2.3 From 7cc986c04a9b07d91684f7e326fa5b960215bc97 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Fri, 24 Oct 2025 10:16:48 +0530 Subject: wifi: cfg80211: Add debugfs support for multi-radio wiphy In multi-radio wiphy architecture, where a single wiphy can have multiple radios tied to it, radio specific configuration parameters and global wiphy parameters are maintained for the entire physical device and common to all radios. But, each radio in a wiphy can have different values for each radio configuration parameter, like RTS threshold. 
With the current debugfs directory structure, the values of global wiphy configuration parameters can be viewed, but, values of individual radio configuration parameters cannot be viewed, as radio specific configuration parameters are not maintained, separately. To address this, in addition to maintaining global wiphy configuration parameters common to all radios, create separate debugfs directories for each radio in a wiphy to maintain parameters corresponding to that radio in this directory. In implementation, maintain a dentry structure in wiphy_radio_cfg, a structure containing radio configurations of a wiphy. This struct is maintained to denote per-radio configurations of a wiphy. Create separate directories representing each radio within phy#X directory in debugfs during wiphy registration. Sample directory structure with this change: ls /sys/kernel/debug/ieee80211/phy0/radio radio0/ radio1/ radio2/ Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20251024044649.483557-2-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3b6f48a783bb..53490eb04e87 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5684,9 +5684,13 @@ struct wiphy_iftype_akm_suites { * * @rts_threshold: RTS threshold (dot11RTSThreshold); * -1 (default) = RTS/CTS disabled + * @radio_debugfsdir: Pointer to debugfs directory containing the radio- + * specific parameters. 
+ * NULL (default) = Debugfs directory not created */ struct wiphy_radio_cfg { u32 rts_threshold; + struct dentry *radio_debugfsdir; }; /** -- cgit v1.2.3 From f864e4b721e386be132cc973eadefe5d52cdfd94 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Wed, 15 Oct 2025 20:26:07 +0100 Subject: clk: renesas: rzv2h: Add support for DSI clocks Add support for PLLDSI and its post-dividers in the RZ/V2H CPG driver and export helper APIs for use by the DSI driver. Introduce per-PLL-DSI state in the CPG private structure and provide a set of helper functions that find valid PLL parameter combinations for a requested frequency. The new helpers are rzv2h_get_pll_pars(), rzv2h_get_pll_div_pars(), rzv2h_get_pll_divs_pars() and rzv2h_get_pll_dtable_pars() and they are exported in the "RZV2H_CPG" namespace for use by other consumers (notably the DSI driver). These helpers perform iterative searches over PLL parameters (M, K, P, S) and optional post-dividers and return the best match (or an exact match when possible). Move PLL/CLK related limits and parameter types into the shared include (include/linux/clk/renesas.h) by adding struct rzv2h_pll_limits, struct rzv2h_pll_pars and struct rzv2h_pll_div_pars plus the RZV2H_CPG_PLL_DSI_LIMITS() helper macro to define DSI PLL limits. This change centralises the PLLDSI algorithms so the CPG and DSI drivers compute PLL parameters consistently and allows the DSI driver to accurately request rates and program its PLL. 
Co-developed-by: Fabrizio Castro Signed-off-by: Fabrizio Castro Signed-off-by: Lad Prabhakar Acked-by: Tomi Valkeinen Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251015192611.241920-4-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/linux/clk/renesas.h | 145 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) (limited to 'include') diff --git a/include/linux/clk/renesas.h b/include/linux/clk/renesas.h index 0ebbe2f0b45e..69d8159deee3 100644 --- a/include/linux/clk/renesas.h +++ b/include/linux/clk/renesas.h @@ -10,7 +10,9 @@ #ifndef __LINUX_CLK_RENESAS_H_ #define __LINUX_CLK_RENESAS_H_ +#include #include +#include struct device; struct device_node; @@ -32,4 +34,147 @@ void cpg_mssr_detach_dev(struct generic_pm_domain *unused, struct device *dev); #define cpg_mssr_attach_dev NULL #define cpg_mssr_detach_dev NULL #endif + +/** + * struct rzv2h_pll_limits - PLL parameter constraints + * + * This structure defines the minimum and maximum allowed values for + * various parameters used to configure a PLL. These limits ensure + * the PLL operates within valid and stable ranges. 
+ * + * @fout: Output frequency range (in MHz) + * @fout.min: Minimum allowed output frequency + * @fout.max: Maximum allowed output frequency + * + * @fvco: PLL oscillation frequency range (in MHz) + * @fvco.min: Minimum allowed VCO frequency + * @fvco.max: Maximum allowed VCO frequency + * + * @m: Main-divider range + * @m.min: Minimum main-divider value + * @m.max: Maximum main-divider value + * + * @p: Pre-divider range + * @p.min: Minimum pre-divider value + * @p.max: Maximum pre-divider value + * + * @s: Divider range + * @s.min: Minimum divider value + * @s.max: Maximum divider value + * + * @k: Delta-sigma modulator range (signed) + * @k.min: Minimum delta-sigma value + * @k.max: Maximum delta-sigma value + */ +struct rzv2h_pll_limits { + struct { + u32 min; + u32 max; + } fout; + + struct { + u32 min; + u32 max; + } fvco; + + struct { + u16 min; + u16 max; + } m; + + struct { + u8 min; + u8 max; + } p; + + struct { + u8 min; + u8 max; + } s; + + struct { + s16 min; + s16 max; + } k; +}; + +/** + * struct rzv2h_pll_pars - PLL configuration parameters + * + * This structure contains the configuration parameters for the + * Phase-Locked Loop (PLL), used to achieve a specific output frequency. + * + * @m: Main divider value + * @p: Pre-divider value + * @s: Output divider value + * @k: Delta-sigma modulation value + * @freq_millihz: Calculated PLL output frequency in millihertz + * @error_millihz: Frequency error from target in millihertz (signed) + */ +struct rzv2h_pll_pars { + u16 m; + u8 p; + u8 s; + s16 k; + u64 freq_millihz; + s64 error_millihz; +}; + +/** + * struct rzv2h_pll_div_pars - PLL parameters with post-divider + * + * This structure is used for PLLs that include an additional post-divider + * stage after the main PLL block. It contains both the PLL configuration + * parameters and the resulting frequency/error values after the divider. 
+ * + * @pll: Main PLL configuration parameters (see struct rzv2h_pll_pars) + * + * @div: Post-divider configuration and result + * @div.divider_value: Divider applied to the PLL output + * @div.freq_millihz: Output frequency after divider in millihertz + * @div.error_millihz: Frequency error from target in millihertz (signed) + */ +struct rzv2h_pll_div_pars { + struct rzv2h_pll_pars pll; + struct { + u8 divider_value; + u64 freq_millihz; + s64 error_millihz; + } div; +}; + +#define RZV2H_CPG_PLL_DSI_LIMITS(name) \ + static const struct rzv2h_pll_limits (name) = { \ + .fout = { .min = 25 * MEGA, .max = 375 * MEGA }, \ + .fvco = { .min = 1600 * MEGA, .max = 3200 * MEGA }, \ + .m = { .min = 64, .max = 533 }, \ + .p = { .min = 1, .max = 4 }, \ + .s = { .min = 0, .max = 6 }, \ + .k = { .min = -32768, .max = 32767 }, \ + } \ + +#ifdef CONFIG_CLK_RZV2H +bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, u64 freq_millihz); + +bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, u64 freq_millihz); +#else +static inline bool rzv2h_get_pll_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_pars *pars, + u64 freq_millihz) +{ + return false; +} + +static inline bool rzv2h_get_pll_divs_pars(const struct rzv2h_pll_limits *limits, + struct rzv2h_pll_div_pars *pars, + const u8 *table, u8 table_size, + u64 freq_millihz) +{ + return false; +} +#endif + #endif -- cgit v1.2.3 From 77a58ba7c64ccca20616aa03599766ccb0d1a330 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Tue, 21 Oct 2025 10:47:03 -0400 Subject: spi: spi-mem: Trace exec_op The spi subsystem has tracing, which is very convenient when debugging problems. Add tracing for spi-mem too so that accesses that skip the spi subsystem can still be seen. The format is roughly based on the existing spi tracing. 
We don't bother tracing the op's address because the tracing happens while the memory is locked, so there can be no confusion about the matching of start and stop. The conversion of cmd/addr/dummy to an array is directly analogous to the conversion in the latter half of spi_mem_exec_op. Signed-off-by: Sean Anderson Link: https://patch.msgid.link/20251021144702.1582397-1-sean.anderson@linux.dev Signed-off-by: Mark Brown --- include/trace/events/spi-mem.h | 106 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 include/trace/events/spi-mem.h (limited to 'include') diff --git a/include/trace/events/spi-mem.h b/include/trace/events/spi-mem.h new file mode 100644 index 000000000000..d13f0bcff5e7 --- /dev/null +++ b/include/trace/events/spi-mem.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM spi-mem + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR spi_mem + +#if !defined(_TRACE_SPI_MEM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SPI_MEM_H + +#include +#include + +#define decode_dtr(dtr) \ + __print_symbolic(dtr, \ + { 0, "S" }, \ + { 1, "D" }) + +TRACE_EVENT(spi_mem_start_op, + TP_PROTO(struct spi_mem *mem, const struct spi_mem_op *op), + TP_ARGS(mem, op), + + TP_STRUCT__entry( + __string(name, mem->name) + __dynamic_array(u8, op, 1 + op->addr.nbytes + op->dummy.nbytes) + __dynamic_array(u8, data, op->data.dir == SPI_MEM_DATA_OUT ? 
+ min(op->data.nbytes, 64) : 0) + __field(u32, data_len) + __field(u32, max_freq) + __field(u8, cmd_buswidth) + __field(bool, cmd_dtr) + __field(u8, addr_buswidth) + __field(bool, addr_dtr) + __field(u8, dummy_nbytes) + __field(u8, data_buswidth) + __field(bool, data_dtr) + ), + + TP_fast_assign( + int i; + + __assign_str(name); + __entry->max_freq = op->max_freq ?: mem->spi->max_speed_hz; + + __entry->cmd_buswidth = op->cmd.buswidth; + __entry->cmd_dtr = op->cmd.dtr; + *((u8 *)__get_dynamic_array(op)) = op->cmd.opcode; + + __entry->addr_buswidth = op->addr.buswidth; + __entry->addr_dtr = op->addr.dtr; + for (i = 0; i < op->addr.nbytes; i++) + ((u8 *)__get_dynamic_array(op))[i + 1] = + op->addr.val >> (8 * (op->addr.nbytes - i - 1)); + + memset(((u8 *)__get_dynamic_array(op)) + op->addr.nbytes + 1, + 0xff, op->dummy.nbytes); + + __entry->data_len = op->data.nbytes; + __entry->data_buswidth = op->data.buswidth; + __entry->data_dtr = op->data.dtr; + if (op->data.dir == SPI_MEM_DATA_OUT) + memcpy(__get_dynamic_array(data), op->data.buf.out, + __get_dynamic_array_len(data)); + ), + + TP_printk("%s %u%s-%u%s-%u%s @%u Hz op=[%*phD] len=%u tx=[%*phD]", + __get_str(name), + __entry->cmd_buswidth, decode_dtr(__entry->cmd_dtr), + __entry->addr_buswidth, decode_dtr(__entry->addr_dtr), + __entry->data_buswidth, decode_dtr(__entry->data_dtr), + __entry->max_freq, + __get_dynamic_array_len(op), __get_dynamic_array(op), + __entry->data_len, + __get_dynamic_array_len(data), __get_dynamic_array(data)) +); + +TRACE_EVENT(spi_mem_stop_op, + TP_PROTO(struct spi_mem *mem, const struct spi_mem_op *op), + TP_ARGS(mem, op), + + TP_STRUCT__entry( + __string(name, mem->name) + __dynamic_array(u8, data, op->data.dir == SPI_MEM_DATA_IN ? 
+ min(op->data.nbytes, 64) : 0) + __field(u32, data_len) + ), + + TP_fast_assign( + __assign_str(name); + __entry->data_len = op->data.nbytes; + if (op->data.dir == SPI_MEM_DATA_IN) + memcpy(__get_dynamic_array(data), op->data.buf.in, + __get_dynamic_array_len(data)); + ), + + TP_printk("%s len=%u rx=[%*phD]", + __get_str(name), + __entry->data_len, + __get_dynamic_array_len(data), __get_dynamic_array(data)) +); + + +#endif /* _TRACE_SPI_MEM_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 812df545e3e44051d7fd39c057e53ffb56868451 Mon Sep 17 00:00:00 2001 From: Zhengnan Chen Date: Sat, 18 Oct 2025 21:26:10 +0800 Subject: dt-bindings: mediatek: mt8189: Add bindings for MM & APU & INFRA IOMMU There are three iommu in total, namely MM_IOMMU, APU_IOMMU, INFRA_IOMMU, Add bindings for them. Signed-off-by: Zhengnan Chen Reviewed-by: Matthias Brugger Acked-by: Conor Dooley Reviewed-by: AngeloGioacchino Del Regno Acked-by: Krzysztof Kozlowski Signed-off-by: Joerg Roedel --- .../memory/mediatek,mt8189-memory-port.h | 283 +++++++++++++++++++++ 1 file changed, 283 insertions(+) create mode 100644 include/dt-bindings/memory/mediatek,mt8189-memory-port.h (limited to 'include') diff --git a/include/dt-bindings/memory/mediatek,mt8189-memory-port.h b/include/dt-bindings/memory/mediatek,mt8189-memory-port.h new file mode 100644 index 000000000000..849fead3d0f7 --- /dev/null +++ b/include/dt-bindings/memory/mediatek,mt8189-memory-port.h @@ -0,0 +1,283 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2025 MediaTek Inc. 
+ * Author: Zhengnan chen + */ +#ifndef _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_ +#define _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_ + +#include + +#define SMI_L0_ID (0) +#define SMI_L1_ID (1) +#define SMI_L2_ID (2) +#define SMI_L4_ID (3) +#define SMI_L7_ID (4) +#define SMI_L9_ID (5) +#define SMI_L11_ID (6) +#define SMI_L13_ID (7) +#define SMI_L14_ID (8) +#define SMI_L16_ID (9) +#define SMI_L17_ID (10) +#define SMI_L19_ID (11) +#define SMI_L20_ID (12) + +/* + * MM IOMMU supports 16GB dma address. We separate it to four ranges: + * 0 ~ 4G; 4G ~ 8G; 8G ~ 12G; 12G ~ 16G, we could adjust these masters + * locate in anyone region. BUT: + * a) Make sure all the ports inside a larb are in one range. + * b) The iova of any master can NOT cross the 4G/8G/12G boundary. + * + * This is the suggested mapping in this SoC: + * + * modules dma-address-region larbs-ports + * disp/mdp 0 ~ 4G larb0/1/2 + * vcodec 4G ~ 8G larb4/7 + * imgsys/cam/ipesys 8G ~ 12G the other larbs. + * N/A 12G ~ 16G + */ + +/* Larb0 -- disp */ +#define M4U_L0_P0_DISP_OVL0_4L_HDR MTK_M4U_ID(SMI_L0_ID, 0) +#define M4U_L0_P1_DISP_OVL0_4L_RDMA0 MTK_M4U_ID(SMI_L0_ID, 1) +#define M4U_L0_P2_DISP_OVL1_4L_RDMA1 MTK_M4U_ID(SMI_L0_ID, 2) +#define M4U_L0_P3_DISP_OVL0_4L_RDMA2 MTK_M4U_ID(SMI_L0_ID, 3) +#define M4U_L0_P4_DISP_OVL1_4L_RDMA3 MTK_M4U_ID(SMI_L0_ID, 4) +#define M4U_L0_P5_DISP_RDMA0 MTK_M4U_ID(SMI_L0_ID, 5) +#define M4U_L0_P6_DISP_WDMA0 MTK_M4U_ID(SMI_L0_ID, 6) +#define M4U_L0_P7_DISP_FAKE_ENG0 MTK_M4U_ID(SMI_L0_ID, 7) + +/* Larb1 -- disp */ +#define M4U_L1_P0_DISP_OVL1_4L_HDR MTK_M4U_ID(SMI_L1_ID, 0) +#define M4U_L1_P1_DISP_OVL1_4L_RDMA0 MTK_M4U_ID(SMI_L1_ID, 1) +#define M4U_L1_P2_DISP_OVL0_4L_RDMA1 MTK_M4U_ID(SMI_L1_ID, 2) +#define M4U_L1_P3_DISP_OVL1_4L_RDMA2 MTK_M4U_ID(SMI_L1_ID, 3) +#define M4U_L1_P4_DISP_OVL0_4L_RDMA3 MTK_M4U_ID(SMI_L1_ID, 4) +#define M4U_L1_P5_DISP_RDMA1 MTK_M4U_ID(SMI_L1_ID, 5) +#define M4U_L1_P6_DISP_WDMA1 MTK_M4U_ID(SMI_L1_ID, 6) +#define M4U_L1_P7_DISP_FAKE_ENG1 
MTK_M4U_ID(SMI_L1_ID, 7) + +/* Larb2 -- mmlsys(mdp) */ +#define M4U_L2_P0_MDP_RDMA0 MTK_M4U_ID(SMI_L2_ID, 0) +#define M4U_L2_P1_MDP_RDMA1 MTK_M4U_ID(SMI_L2_ID, 1) +#define M4U_L2_P2_MDP_WROT0 MTK_M4U_ID(SMI_L2_ID, 2) +#define M4U_L2_P3_MDP_WROT1 MTK_M4U_ID(SMI_L2_ID, 3) +#define M4U_L2_P4_MDP_DUMMY0 MTK_M4U_ID(SMI_L2_ID, 4) +#define M4U_L2_P5_MDP_DUMMY1 MTK_M4U_ID(SMI_L2_ID, 5) +#define M4U_L2_P6_MDP_RDMA2 MTK_M4U_ID(SMI_L2_ID, 6) +#define M4U_L2_P7_MDP_RDMA3 MTK_M4U_ID(SMI_L2_ID, 7) +#define M4U_L2_P8_MDP_WROT2 MTK_M4U_ID(SMI_L2_ID, 8) +#define M4U_L2_P9_MDP_WROT3 MTK_M4U_ID(SMI_L2_ID, 9) +#define M4U_L2_P10_DISP_FAKE0 MTK_M4U_ID(SMI_L2_ID, 10) + +/* Larb3: null */ + +/* Larb4 -- vdec */ +#define M4U_L4_P0_HW_VDEC_MC_EXT MTK_M4U_ID(SMI_L4_ID, 0) +#define M4U_L4_P1_HW_VDEC_UFO_EXT MTK_M4U_ID(SMI_L4_ID, 1) +#define M4U_L4_P2_HW_VDEC_PP_EXT MTK_M4U_ID(SMI_L4_ID, 2) +#define M4U_L4_P3_HW_VDEC_PRED_RD_EXT MTK_M4U_ID(SMI_L4_ID, 3) +#define M4U_L4_P4_HW_VDEC_PRED_WR_EXT MTK_M4U_ID(SMI_L4_ID, 4) +#define M4U_L4_P5_HW_VDEC_PPWRAP_EXT MTK_M4U_ID(SMI_L4_ID, 5) +#define M4U_L4_P6_HW_VDEC_TILE_EXT MTK_M4U_ID(SMI_L4_ID, 6) +#define M4U_L4_P7_HW_VDEC_VLD_EXT MTK_M4U_ID(SMI_L4_ID, 7) +#define M4U_L4_P8_HW_VDEC_VLD2_EXT MTK_M4U_ID(SMI_L4_ID, 8) +#define M4U_L4_P9_HW_VDEC_AVC_MV_EXT MTK_M4U_ID(SMI_L4_ID, 9) +#define M4U_L4_P10_HW_VDEC_RG_CTRL_DMA_EXT MTK_M4U_ID(SMI_L4_ID, 10) +#define M4U_L4_P11_HW_VDEC_UFO_ENC_EXT MTK_M4U_ID(SMI_L4_ID, 11) + +/* Larb5: null */ + +/* Larb6: null */ + +/* Larb7 -- venc */ +#define M4U_L7_P0_VENC_RCPU MTK_M4U_ID(SMI_L7_ID, 0) +#define M4U_L7_P1_VENC_REC MTK_M4U_ID(SMI_L7_ID, 1) +#define M4U_L7_P2_VENC_BSDMA MTK_M4U_ID(SMI_L7_ID, 2) +#define M4U_L7_P3_VENC_SV_COMV MTK_M4U_ID(SMI_L7_ID, 3) +#define M4U_L7_P4_VENC_RD_COMV MTK_M4U_ID(SMI_L7_ID, 4) +#define M4U_L7_P5_JPGENC_Y_RDMA MTK_M4U_ID(SMI_L7_ID, 5) +#define M4U_L7_P6_JPGENC_C_RDMA MTK_M4U_ID(SMI_L7_ID, 6) +#define M4U_L7_P7_JPGENC_Q_RDMA MTK_M4U_ID(SMI_L7_ID, 7) +#define M4U_L7_P8_VENC_SUB_W_LUMA 
MTK_M4U_ID(SMI_L7_ID, 8) +#define M4U_L7_P9_JPGENC_BSDMA MTK_M4U_ID(SMI_L7_ID, 9) +#define M4U_L7_P10_VENC_CUR_LUMA MTK_M4U_ID(SMI_L7_ID, 10) +#define M4U_L7_P11_VENC_CUR_CHROMA MTK_M4U_ID(SMI_L7_ID, 11) +#define M4U_L7_P12_VENC_REF_LUMA MTK_M4U_ID(SMI_L7_ID, 12) +#define M4U_L7_P13_VENC_REF_CHROMA MTK_M4U_ID(SMI_L7_ID, 13) +#define M4U_L7_P14_VENC_SUB_R_LUMA MTK_M4U_ID(SMI_L7_ID, 14) +#define M4U_L7_P15_JPGDEC_WDMA MTK_M4U_ID(SMI_L7_ID, 15) +#define M4U_L7_P16_JPGDEC_BSDMA MTK_M4U_ID(SMI_L7_ID, 16) +#define M4U_L7_P17_JPGDEC_HUFF_OFFSET MTK_M4U_ID(SMI_L7_ID, 17) + +/* Larb8: null */ + +/* Larb9 --imgsys */ +#define M4U_L9_P0_IMGI_D1 MTK_M4U_ID(SMI_L9_ID, 0) +#define M4U_L9_P1_IMGBI_D1 MTK_M4U_ID(SMI_L9_ID, 1) +#define M4U_L9_P2_DMGI_D1 MTK_M4U_ID(SMI_L9_ID, 2) +#define M4U_L9_P3_DEPI_D1 MTK_M4U_ID(SMI_L9_ID, 3) +#define M4U_L9_P4_LCE_D1 MTK_M4U_ID(SMI_L9_ID, 4) +#define M4U_L9_P5_SMTI_D1 MTK_M4U_ID(SMI_L9_ID, 5) +#define M4U_L9_P6_SMTO_D2 MTK_M4U_ID(SMI_L9_ID, 6) +#define M4U_L9_P7_SMTO_D1 MTK_M4U_ID(SMI_L9_ID, 7) +#define M4U_L9_P8_CRZO_D1 MTK_M4U_ID(SMI_L9_ID, 8) +#define M4U_L9_P9_IMG3O_D1 MTK_M4U_ID(SMI_L9_ID, 9) +#define M4U_L9_P10_VIPI_D1 MTK_M4U_ID(SMI_L9_ID, 10) +#define M4U_L9_P11_SMTI_D5 MTK_M4U_ID(SMI_L9_ID, 11) +#define M4U_L9_P12_TIMGO_D1 MTK_M4U_ID(SMI_L9_ID, 12) +#define M4U_L9_P13_UFBC_W0 MTK_M4U_ID(SMI_L9_ID, 13) +#define M4U_L9_P14_UFBC_R0 MTK_M4U_ID(SMI_L9_ID, 14) +#define M4U_L9_P15_WPE_RDMA1 MTK_M4U_ID(SMI_L9_ID, 15) +#define M4U_L9_P16_WPE_RDMA0 MTK_M4U_ID(SMI_L9_ID, 16) +#define M4U_L9_P17_WPE_WDMA MTK_M4U_ID(SMI_L9_ID, 17) +#define M4U_L9_P18_MFB_RDMA0 MTK_M4U_ID(SMI_L9_ID, 18) +#define M4U_L9_P19_MFB_RDMA1 MTK_M4U_ID(SMI_L9_ID, 19) +#define M4U_L9_P20_MFB_RDMA2 MTK_M4U_ID(SMI_L9_ID, 20) +#define M4U_L9_P21_MFB_RDMA3 MTK_M4U_ID(SMI_L9_ID, 21) +#define M4U_L9_P22_MFB_RDMA4 MTK_M4U_ID(SMI_L9_ID, 22) +#define M4U_L9_P23_MFB_RDMA5 MTK_M4U_ID(SMI_L9_ID, 23) +#define M4U_L9_P24_MFB_WDMA0 MTK_M4U_ID(SMI_L9_ID, 24) +#define M4U_L9_P25_MFB_WDMA1 
MTK_M4U_ID(SMI_L9_ID, 25) +#define M4U_L9_P26_RESERVE6 MTK_M4U_ID(SMI_L9_ID, 26) +#define M4U_L9_P27_RESERVE7 MTK_M4U_ID(SMI_L9_ID, 27) +#define M4U_L9_P28_RESERVE8 MTK_M4U_ID(SMI_L9_ID, 28) + +/* Larb10: null */ + +/* Larb11 -- imgsys */ +#define M4U_L11_P0_IMGI_D1 MTK_M4U_ID(SMI_L11_ID, 0) +#define M4U_L11_P1_IMGBI_D1 MTK_M4U_ID(SMI_L11_ID, 1) +#define M4U_L11_P2_DMGI_D1 MTK_M4U_ID(SMI_L11_ID, 2) +#define M4U_L11_P3_DEPI_D1 MTK_M4U_ID(SMI_L11_ID, 3) +#define M4U_L11_P4_LCE_D1 MTK_M4U_ID(SMI_L11_ID, 4) +#define M4U_L11_P5_SMTI_D1 MTK_M4U_ID(SMI_L11_ID, 5) +#define M4U_L11_P6_SMTO_D2 MTK_M4U_ID(SMI_L11_ID, 6) +#define M4U_L11_P7_SMTO_D1 MTK_M4U_ID(SMI_L11_ID, 7) +#define M4U_L11_P8_CRZO_D1 MTK_M4U_ID(SMI_L11_ID, 8) +#define M4U_L11_P9_IMG3O_D1 MTK_M4U_ID(SMI_L11_ID, 9) +#define M4U_L11_P10_VIPI_D1 MTK_M4U_ID(SMI_L11_ID, 10) +#define M4U_L11_P11_SMTI_D5 MTK_M4U_ID(SMI_L11_ID, 11) +#define M4U_L11_P12_TIMGO_D1 MTK_M4U_ID(SMI_L11_ID, 12) +#define M4U_L11_P13_UFBC_W0 MTK_M4U_ID(SMI_L11_ID, 13) +#define M4U_L11_P14_UFBC_R0 MTK_M4U_ID(SMI_L11_ID, 14) +#define M4U_L11_P15_WPE_RDMA1 MTK_M4U_ID(SMI_L11_ID, 15) +#define M4U_L11_P16_WPE_RDMA0 MTK_M4U_ID(SMI_L11_ID, 16) +#define M4U_L11_P17_WPE_WDMA MTK_M4U_ID(SMI_L11_ID, 17) +#define M4U_L11_P18_MFB_RDMA0 MTK_M4U_ID(SMI_L11_ID, 18) +#define M4U_L11_P19_MFB_RDMA1 MTK_M4U_ID(SMI_L11_ID, 19) +#define M4U_L11_P20_MFB_RDMA2 MTK_M4U_ID(SMI_L11_ID, 20) +#define M4U_L11_P21_MFB_RDMA3 MTK_M4U_ID(SMI_L11_ID, 21) +#define M4U_L11_P22_MFB_RDMA4 MTK_M4U_ID(SMI_L11_ID, 22) +#define M4U_L11_P23_MFB_RDMA5 MTK_M4U_ID(SMI_L11_ID, 23) +#define M4U_L11_P24_MFB_WDMA0 MTK_M4U_ID(SMI_L11_ID, 24) +#define M4U_L11_P25_MFB_WDMA1 MTK_M4U_ID(SMI_L11_ID, 25) +#define M4U_L11_P26_RESERVE6 MTK_M4U_ID(SMI_L11_ID, 26) +#define M4U_L11_P27_RESERVE7 MTK_M4U_ID(SMI_L11_ID, 27) +#define M4U_L11_P28_RESERVE8 MTK_M4U_ID(SMI_L11_ID, 28) + +/* Larb12: null */ + +/* Larb13 -- cam */ +#define M4U_L13_P0_MRAWI MTK_M4U_ID(SMI_L13_ID, 0) +#define M4U_L13_P1_MRAWO_0 
MTK_M4U_ID(SMI_L13_ID, 1) +#define M4U_L13_P2_MRAWO_1 MTK_M4U_ID(SMI_L13_ID, 2) +#define M4U_L13_P3_CAMSV_1 MTK_M4U_ID(SMI_L13_ID, 3) +#define M4U_L13_P4_CAMSV_2 MTK_M4U_ID(SMI_L13_ID, 4) +#define M4U_L13_P5_CAMSV_3 MTK_M4U_ID(SMI_L13_ID, 5) +#define M4U_L13_P6_CAMSV_4 MTK_M4U_ID(SMI_L13_ID, 6) +#define M4U_L13_P7_CAMSV_5 MTK_M4U_ID(SMI_L13_ID, 7) +#define M4U_L13_P8_CAMSV_6 MTK_M4U_ID(SMI_L13_ID, 8) +#define M4U_L13_P9_CCUI MTK_M4U_ID(SMI_L13_ID, 9) +#define M4U_L13_P10_CCUO MTK_M4U_ID(SMI_L13_ID, 10) +#define M4U_L13_P11_FAKE MTK_M4U_ID(SMI_L13_ID, 11) +#define M4U_L13_P12_PDAI_0 MTK_M4U_ID(SMI_L13_ID, 12) +#define M4U_L13_P13_PDAI_1 MTK_M4U_ID(SMI_L13_ID, 13) +#define M4U_L13_P14_PDAO MTK_M4U_ID(SMI_L13_ID, 14) + +/* Larb14 -- cam */ +#define M4U_L14_P0_RESERVE MTK_M4U_ID(SMI_L14_ID, 0) +#define M4U_L14_P1_RESERVE MTK_M4U_ID(SMI_L14_ID, 1) +#define M4U_L14_P2_RESERVE MTK_M4U_ID(SMI_L14_ID, 2) +#define M4U_L14_P3_CAMSV_0 MTK_M4U_ID(SMI_L14_ID, 3) +#define M4U_L14_P4_CCUI MTK_M4U_ID(SMI_L14_ID, 4) +#define M4U_L14_P5_CCUO MTK_M4U_ID(SMI_L14_ID, 5) +#define M4U_L14_P6_CAMSV_7 MTK_M4U_ID(SMI_L14_ID, 6) +#define M4U_L14_P7_CAMSV_8 MTK_M4U_ID(SMI_L14_ID, 7) +#define M4U_L14_P8_CAMSV_9 MTK_M4U_ID(SMI_L14_ID, 8) +#define M4U_L14_P9_CAMSV_10 MTK_M4U_ID(SMI_L14_ID, 9) + +/* Larb15: null */ + +/* Larb16 -- cam */ +#define M4U_L16_P0_IMGO_R1_A MTK_M4U_ID(SMI_L16_ID, 0) +#define M4U_L16_P1_RRZO_R1_A MTK_M4U_ID(SMI_L16_ID, 1) +#define M4U_L16_P2_CQI_R1_A MTK_M4U_ID(SMI_L16_ID, 2) +#define M4U_L16_P3_BPCI_R1_A MTK_M4U_ID(SMI_L16_ID, 3) +#define M4U_L16_P4_YUVO_R1_A MTK_M4U_ID(SMI_L16_ID, 4) +#define M4U_L16_P5_UFDI_R2_A MTK_M4U_ID(SMI_L16_ID, 5) +#define M4U_L16_P6_RAWI_R2_A MTK_M4U_ID(SMI_L16_ID, 6) +#define M4U_L16_P7_RAWI_R3_A MTK_M4U_ID(SMI_L16_ID, 7) +#define M4U_L16_P8_AAO_R1_A MTK_M4U_ID(SMI_L16_ID, 8) +#define M4U_L16_P9_AFO_R1_A MTK_M4U_ID(SMI_L16_ID, 9) +#define M4U_L16_P10_FLKO_R1_A MTK_M4U_ID(SMI_L16_ID, 10) +#define M4U_L16_P11_LCESO_R1_A MTK_M4U_ID(SMI_L16_ID, 
11) +#define M4U_L16_P12_CRZO_R1_A MTK_M4U_ID(SMI_L16_ID, 12) +#define M4U_L16_P13_LTMSO_R1_A MTK_M4U_ID(SMI_L16_ID, 13) +#define M4U_L16_P14_RSSO_R1_A MTK_M4U_ID(SMI_L16_ID, 14) +#define M4U_L16_P15_AAHO_R1_A MTK_M4U_ID(SMI_L16_ID, 15) +#define M4U_L16_P16_LSCI_R1_A MTK_M4U_ID(SMI_L16_ID, 16) + +/* Larb17 -- cam */ +#define M4U_L17_P0_IMGO_R1_B MTK_M4U_ID(SMI_L17_ID, 0) +#define M4U_L17_P1_RRZO_R1_B MTK_M4U_ID(SMI_L17_ID, 1) +#define M4U_L17_P2_CQI_R1_B MTK_M4U_ID(SMI_L17_ID, 2) +#define M4U_L17_P3_BPCI_R1_B MTK_M4U_ID(SMI_L17_ID, 3) +#define M4U_L17_P4_YUVO_R1_B MTK_M4U_ID(SMI_L17_ID, 4) +#define M4U_L17_P5_UFDI_R2_B MTK_M4U_ID(SMI_L17_ID, 5) +#define M4U_L17_P6_RAWI_R2_B MTK_M4U_ID(SMI_L17_ID, 6) +#define M4U_L17_P7_RAWI_R3_B MTK_M4U_ID(SMI_L17_ID, 7) +#define M4U_L17_P8_AAO_R1_B MTK_M4U_ID(SMI_L17_ID, 8) +#define M4U_L17_P9_AFO_R1_B MTK_M4U_ID(SMI_L17_ID, 9) +#define M4U_L17_P10_FLKO_R1_B MTK_M4U_ID(SMI_L17_ID, 10) +#define M4U_L17_P11_LCESO_R1_B MTK_M4U_ID(SMI_L17_ID, 11) +#define M4U_L17_P12_CRZO_R1_B MTK_M4U_ID(SMI_L17_ID, 12) +#define M4U_L17_P13_LTMSO_R1_B MTK_M4U_ID(SMI_L17_ID, 13) +#define M4U_L17_P14_RSSO_R1_B MTK_M4U_ID(SMI_L17_ID, 14) +#define M4U_L17_P15_AAHO_R1_B MTK_M4U_ID(SMI_L17_ID, 15) +#define M4U_L17_P16_LSCI_R1_B MTK_M4U_ID(SMI_L17_ID, 16) + +/* Larb19 -- ipesys */ +#define M4U_L19_P0_DVS_RDMA MTK_M4U_ID(SMI_L19_ID, 0) +#define M4U_L19_P1_DVS_WDMA MTK_M4U_ID(SMI_L19_ID, 1) +#define M4U_L19_P2_DVP_RDMA MTK_M4U_ID(SMI_L19_ID, 2) +#define M4U_L19_P3_DVP_WDMA MTK_M4U_ID(SMI_L19_ID, 3) + +/* Larb20 -- ipesys */ +#define M4U_L20_P0_FDVT_RDA_0 MTK_M4U_ID(SMI_L20_ID, 0) +#define M4U_L20_P1_FDVT_RDB_0 MTK_M4U_ID(SMI_L20_ID, 1) +#define M4U_L20_P2_FDVT_WRA_0 MTK_M4U_ID(SMI_L20_ID, 2) +#define M4U_L20_P3_FDVT_WRB_0 MTK_M4U_ID(SMI_L20_ID, 3) +#define M4U_L20_P4_RSC_RDMA MTK_M4U_ID(SMI_L20_ID, 4) +#define M4U_L20_P5_RSC_WDMA MTK_M4U_ID(SMI_L20_ID, 5) + +/* fake larb21 for gce */ +#define M4U_L21_GCE_DM MTK_M4U_ID(21, 0) +#define M4U_L21_GCE_MM 
MTK_M4U_ID(21, 1) + +/* fake larb & port for svp and dual svp and wfd */ +#define M4U_PORT_SVP_HEAP MTK_M4U_ID(22, 0) +#define M4U_PORT_DUAL_SVP_HEAP MTK_M4U_ID(22, 1) +#define M4U_PORT_WFD_HEAP MTK_M4U_ID(22, 2) + +/* fake larb0 for apu */ +#define M4U_L0_APU_DATA MTK_M4U_ID(0, 0) +#define M4U_L0_APU_CODE MTK_M4U_ID(0, 1) +#define M4U_L0_APU_SECURE MTK_M4U_ID(0, 2) +#define M4U_L0_APU_VLM MTK_M4U_ID(0, 3) + +/* infra/peri */ +#define IFR_IOMMU_PORT_PCIE_0 MTK_IFAIOMMU_PERI_ID(0, 26) + +#endif -- cgit v1.2.3 From fd714986e4e46effa6697b13d32918fc59608ccb Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 22 Oct 2025 19:21:09 -0700 Subject: iommu: Pass in old domain to attach_dev callback functions The IOMMU core attaches each device to a default domain on probe(). Then, every new "attach" operation has a fundamental meaning of two-fold: - detach from its currently attached (old) domain - attach to a given new domain Modern IOMMU drivers following this pattern usually want to clean up the things related to the old domain, so they call iommu_get_domain_for_dev() to fetch the old domain. Pass in the old domain pointer from the core to drivers, aligning with the set_dev_pasid op that does so already. Ensure all low-level attach functions in the core can forward the correct old domain pointer. Thus, rework those functions as well. Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..801b2bd9e8d4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -751,7 +751,8 @@ struct iommu_ops { * @free: Release the domain after use. 
*/ struct iommu_domain_ops { - int (*attach_dev)(struct iommu_domain *domain, struct device *dev); + int (*attach_dev)(struct iommu_domain *domain, struct device *dev, + struct iommu_domain *old); int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev, ioasid_t pasid, struct iommu_domain *old); -- cgit v1.2.3 From 1afc05996299b4546e8be9b13c89f78e19912c7d Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:12 +0100 Subject: ASoC: cs35l56: Read silicon ID during initialization and save it Read the silicon ID from the amp during one-time cs35l56_hw_init() and store it in struct cs35l56_base, instead of reading it from registers every time it is needed. Note that marking it non-volatile without a default in regmap isn't a suitable alternative because this causes regcache_sync() to always write the cached value out to the registers. This could trigger a bus fault interrupt inside the amp, which we want to avoid. Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index ab044ce2aa8b..ec9b1072d6be 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -309,6 +309,7 @@ struct cs35l56_base { struct cs35l56_spi_payload *spi_payload_buf; const struct cs35l56_fw_reg *fw_reg; const struct cirrus_amp_cal_controls *calibration_controls; + u64 silicon_uid; }; static inline bool cs35l56_is_otp_register(unsigned int reg) -- cgit v1.2.3 From cdd27fa3298ad2f39788804f7d09ab31af2b416c Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:13 +0100 Subject: ASoC: cs-amp-lib: Add helpers for factory calibration Add helper functions for performing factory calibration. 
cs_amp_read_cal_coeffs() reads the results of a calibration into a struct cirrus_amp_cal_data. The calTime member is also filled in with the current time (which is defined to be in Windows format). cs_amp_write_ambient_temp() writes a given temperature value to the firmware control for ambient temperature. The cs_amp_cal_target_u64() has been moved into the header file so that it can be used by the calling code and by KUnit tests. cs_amp_create_debugfs() creates a debugfs directory to contain debugfs files related to calibration. This is placed in a directory in debugfs root, named "cirrus_logic". The purpose of this is to make it easier for tooling to find the files it needs by keeping control of the layout under this directory. By contrast the ASoC debugfs can vary between kernel releases and doesn't have a strictly stable naming convention. HDA does not have a debugfs directory at all and enabling the general ALSA debugfs (which is normally disabled) has other side-effects. Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 43a87a39110c..5b094f8e8a6f 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -47,9 +47,21 @@ struct cirrus_amp_cal_controls { int cs_amp_write_cal_coeffs(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, const struct cirrus_amp_cal_data *data); +int cs_amp_read_cal_coeffs(struct cs_dsp *dsp, + const struct cirrus_amp_cal_controls *controls, + struct cirrus_amp_cal_data *data); +int cs_amp_write_ambient_temp(struct cs_dsp *dsp, + const struct cirrus_amp_cal_controls *controls, + u32 temp); int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index, struct cirrus_amp_cal_data 
*out_data); int cs_amp_get_vendor_spkid(struct device *dev); +struct dentry *cs_amp_create_debugfs(struct device *dev); + +static inline u64 cs_amp_cal_target_u64(const struct cirrus_amp_cal_data *data) +{ + return ((u64)data->calTarget[1] << 32) | data->calTarget[0]; +} struct cs_amp_test_hooks { efi_status_t (*get_efi_variable)(efi_char16_t *name, -- cgit v1.2.3 From f7097161e94cd39df7a8848ad0de5f394124ed69 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:14 +0100 Subject: ASoC: cs35l56: Add common code for factory calibration Add core code to support factory calibration. This can be used by both the ASoC and HDA drivers. This code consists of implementations of debugfs handlers for three debugfs files used to start factory calibration and read the results. This is not a full implementation of debugfs files. There are some requirements to synchronize with the rest of the amp driver, and the way this is done is significantly different between ASoC and HDA. Therefore cs35l56-shared.c provides the main part of the file handlers, but the files themselves are defined in the ASoC and HDA drivers with suitable handling before calling into this shared code. The cal_data file allows the calibration to be read and also for a previous calibration to be written (for systems where the storage is not something directly accessible to drivers, such as on filesystems). Code outside the kernel should treat the content of cal_data as an opaque blob, so the struct definition is not exported as a user API. 
Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-4-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index ec9b1072d6be..349b896ee737 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -9,6 +9,7 @@ #ifndef __CS35L56_H #define __CS35L56_H +#include #include #include #include @@ -62,6 +63,8 @@ #define CS35L56_IRQ1_MASK_8 0x000E0AC #define CS35L56_IRQ1_MASK_18 0x000E0D4 #define CS35L56_IRQ1_MASK_20 0x000E0DC +#define CS35L56_MIXER_NGATE_CH1_CFG 0x0010004 +#define CS35L56_MIXER_NGATE_CH2_CFG 0x0010008 #define CS35L56_DSP_MBOX_1_RAW 0x0011000 #define CS35L56_DSP_VIRTUAL1_MBOX_1 0x0011020 #define CS35L56_DSP_VIRTUAL1_MBOX_2 0x0011024 @@ -177,6 +180,9 @@ /* IRQ1_EINT_8 */ #define CS35L56_TEMP_ERR_EINT1_MASK 0x80000000 +/* MIXER_NGATE_CHn_CFG */ +#define CS35L56_AUX_NGATE_CHn_EN 0x00000001 + /* Mixer input sources */ #define CS35L56_INPUT_SRC_NONE 0x00 #define CS35L56_INPUT_SRC_ASP1RX1 0x08 @@ -243,6 +249,7 @@ #define CS35L56_MBOX_CMD_AUDIO_PLAY 0x0B000001 #define CS35L56_MBOX_CMD_AUDIO_PAUSE 0x0B000002 #define CS35L56_MBOX_CMD_AUDIO_REINIT 0x0B000003 +#define CS35L56_MBOX_CMD_AUDIO_CALIBRATION 0x0B000006 #define CS35L56_MBOX_CMD_HIBERNATE_NOW 0x02000001 #define CS35L56_MBOX_CMD_WAKEUP 0x02000002 #define CS35L56_MBOX_CMD_PREVENT_AUTO_HIBERNATE 0x02000003 @@ -264,6 +271,9 @@ #define CS35L56_RESET_PULSE_MIN_US 1100 #define CS35L56_WAKE_HOLD_TIME_US 1000 +#define CS35L56_CALIBRATION_POLL_US (100 * USEC_PER_MSEC) +#define CS35L56_CALIBRATION_TIMEOUT_US (5 * USEC_PER_SEC) + #define CS35L56_SDW1_PLAYBACK_PORT 1 #define CS35L56_SDW1_CAPTURE_PORT 3 @@ -291,9 +301,16 @@ struct cs35l56_fw_reg { unsigned int posture_number; }; +struct cs35l56_cal_debugfs_fops { + const struct debugfs_short_fops calibrate; + const 
struct debugfs_short_fops cal_temperature; + const struct debugfs_short_fops cal_data; +}; + struct cs35l56_base { struct device *dev; struct regmap *regmap; + struct cs_dsp *dsp; int irq; struct mutex irq_lock; u8 type; @@ -309,6 +326,7 @@ struct cs35l56_base { struct cs35l56_spi_payload *spi_payload_buf; const struct cs35l56_fw_reg *fw_reg; const struct cirrus_amp_cal_controls *calibration_controls; + struct dentry *debugfs; u64 silicon_uid; }; @@ -359,6 +377,21 @@ int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base); int cs35l56_runtime_resume_common(struct cs35l56_base *cs35l56_base, bool is_soundwire); void cs35l56_init_cs_dsp(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp); int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base); +ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base, + const char __user *from, size_t count, + loff_t *ppos); +ssize_t cs35l56_cal_ambient_debugfs_write(struct cs35l56_base *cs35l56_base, + const char __user *from, size_t count, + loff_t *ppos); +ssize_t cs35l56_cal_data_debugfs_read(struct cs35l56_base *cs35l56_base, + char __user *to, size_t count, + loff_t *ppos); +ssize_t cs35l56_cal_data_debugfs_write(struct cs35l56_base *cs35l56_base, + const char __user *from, size_t count, + loff_t *ppos); +void cs35l56_create_cal_debugfs(struct cs35l56_base *cs35l56_base, + const struct cs35l56_cal_debugfs_fops *fops); +void cs35l56_remove_cal_debugfs(struct cs35l56_base *cs35l56_base); int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base, bool *fw_missing, unsigned int *fw_version); void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp); -- cgit v1.2.3 From cf6290eebe3cc4eb677d11aa061d10cb1df12ab9 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:17 +0100 Subject: ASoC: cs-amp-lib-test: Add cases for factory calibration helpers Add test cases for the cs_amp_read_cal_coeffs() and cs_amp_write_ambient_temp() functions. 
In both cases the test is simply to confirm that the correct data value(s) get passed back to the caller. Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-7-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 5b094f8e8a6f..efa744133a35 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -72,8 +72,11 @@ struct cs_amp_test_hooks { int (*write_cal_coeff)(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, const char *ctl_name, u32 val); -}; + int (*read_cal_coeff)(struct cs_dsp *dsp, + const struct cirrus_amp_cal_controls *controls, + const char *ctl_name, u32 *val); +}; extern const struct cs_amp_test_hooks * const cs_amp_test_hooks; #endif /* CS_AMP_LIB_H */ -- cgit v1.2.3 From 959400caf51eb31f95d1ab754a285b5546ebd3e4 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:18 +0100 Subject: ASoC: cs-amp-lib: Return attributes from cs_amp_get_efi_variable() Add a pointer argument to cs_amp_get_efi_variable() to optionally return the EFI variable attributes. Originally this function internally consumed the attributes from efi.get_variable(). The calling code did not use the attributes so this was a small simplification. However, when writing to a pre-existing variable we would want to pass the existing attributes to efi.set_variable(). This patch deals with the change to return the attribute in preparation for adding code to update the variable. 
Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-8-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index efa744133a35..2e5616a5e1f7 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -66,6 +66,7 @@ static inline u64 cs_amp_cal_target_u64(const struct cirrus_amp_cal_data *data) struct cs_amp_test_hooks { efi_status_t (*get_efi_variable)(efi_char16_t *name, efi_guid_t *guid, + u32 *returned_attr, unsigned long *size, void *buf); -- cgit v1.2.3 From 2b62e66626f05e277c8fdeb50d4c1e0cbab2fe0e Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:19 +0100 Subject: ASoC: cs-amp-lib: Add function to write calibration to UEFI Add cs_amp_set_efi_calibration_data() to write an amp calibration blob to UEFI calibration variable. The UEFI variable will be updated or created as necessary. - If a Vendor-specific variable exists it will be updated, else if the Cirrus variable exists it will be update else the Cirrus variable will be created. Some collateral changes are required: - cs_amp_convert_efi_status() now specifically handles EFI_WRITE_PROTECTED error. - cs_amp_get_cal_efi_buffer() can optionally return the name, guid and attr of the variable it found. - cs_amp_get_cal_efi_buffer() will update the 'size' field of the returned data blob if it is zero. 
The BIOS could have pre-allocated the UEFI variable as zero-filled Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-9-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 2e5616a5e1f7..240bc53a9307 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -55,6 +55,8 @@ int cs_amp_write_ambient_temp(struct cs_dsp *dsp, u32 temp); int cs_amp_get_efi_calibration_data(struct device *dev, u64 target_uid, int amp_index, struct cirrus_amp_cal_data *out_data); +int cs_amp_set_efi_calibration_data(struct device *dev, int amp_index, int num_amps, + const struct cirrus_amp_cal_data *in_data); int cs_amp_get_vendor_spkid(struct device *dev); struct dentry *cs_amp_create_debugfs(struct device *dev); -- cgit v1.2.3 From ef24466ee1912997c2bd526194006bbca424c24f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:20 +0100 Subject: ASoC: cs35l56: Add calibration command to store into UEFI Add a new command 'store_uefi' to the calibrate debugfs file. Writing this command will call cs_amp_set_efi_calibration_data() to save the new data into a UEFI variable. This is intended to be used after a successful factory calibration. On systems without UEFI the write to the debugfs file will return an error. 
Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-10-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 349b896ee737..82559be0f249 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -321,6 +321,7 @@ struct cs35l56_base { bool can_hibernate; bool cal_data_valid; s8 cal_index; + u8 num_amps; struct cirrus_amp_cal_data cal_data; struct gpio_desc *reset_gpio; struct cs35l56_spi_payload *spi_payload_buf; -- cgit v1.2.3 From 4795375d8aa072e9aacb0b278e6203c6ca41816a Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 21 Oct 2025 11:50:22 +0100 Subject: ASoC: cs-amp-lib-test: Add test cases for cs_amp_set_efi_calibration_data() Add a set of test cases for cs_amp_set_efi_calibration_data(). Broadly there are two type of behavior being tested: How the EFI is updated: - Create a new EFI - Overwrite part of existing content - Overwrite part of zero-filled preallocated content - Grow the file to append new content And how the location within the content is chosen: - Overwrite a specific array entry - Overwrite an entry with the same calTarget (silicon ID) - Overwrite a free entry - Append after existing data Plus some cases for error conditions. 
Signed-off-by: Richard Fitzgerald Reviewed-by: Takashi Iwai Link: https://patch.msgid.link/20251021105022.1013685-12-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs-amp-lib.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/sound/cs-amp-lib.h b/include/sound/cs-amp-lib.h index 240bc53a9307..61e00017c9aa 100644 --- a/include/sound/cs-amp-lib.h +++ b/include/sound/cs-amp-lib.h @@ -71,6 +71,11 @@ struct cs_amp_test_hooks { u32 *returned_attr, unsigned long *size, void *buf); + efi_status_t (*set_efi_variable)(efi_char16_t *name, + efi_guid_t *guid, + u32 attr, + unsigned long size, + void *buf); int (*write_cal_coeff)(struct cs_dsp *dsp, const struct cirrus_amp_cal_controls *controls, -- cgit v1.2.3 From 483768846d66c04354898f00bcdaad58a3763be2 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 15 Oct 2025 11:27:28 -0400 Subject: PCI: endpoint: Rename 'epf_bar::aligned_size' to 'epf_bar:mem_size' Rename the member 'epf_bar::aligned_size' to 'epf_bar::mem_size' to better reflect its purpose. 'aligned_size' was misleading, as it actually represents the backing memory size allocated for the BAR rather than the aligned size. 
Signed-off-by: Frank Li Signed-off-by: Manivannan Sadhasivam Reviewed-by: Niklas Cassel Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-1-9230298b1910@nxp.com --- include/linux/pci-epf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 2e85504ba2ba..4022dd080e20 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -115,8 +115,8 @@ struct pci_epf_driver { * @phys_addr: physical address that should be mapped to the BAR * @addr: virtual address corresponding to the @phys_addr * @size: the size of the address space present in BAR - * @aligned_size: the size actually allocated to accommodate the iATU alignment - * requirement + * @mem_size: the size actually allocated to accommodate the iATU alignment + * requirement * @barno: BAR number * @flags: flags that are set for the BAR */ @@ -124,7 +124,7 @@ struct pci_epf_bar { dma_addr_t phys_addr; void *addr; size_t size; - size_t aligned_size; + size_t mem_size; enum pci_barno barno; int flags; }; -- cgit v1.2.3 From 83be4bee57f0374ff751aaff3fef4af0af66ec81 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Fri, 17 Oct 2025 13:26:28 +0000 Subject: ACPI: PRM: Add acpi_prm_handler_available() Add a helper function to check if a PRM handler/module is present. This can be used during init time by code that depends on a particular handler. If the handler is not present, then the code does not need to be loaded. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Reviewed-by: "Mario Limonciello (AMD)" Acked-by: "Rafael J. 
Wysocki (Intel)" Link: https://patch.msgid.link/all/20251017-wip-atl-prm-v2-1-7ab1df4a5fbc@amd.com --- include/linux/prmt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/prmt.h b/include/linux/prmt.h index c53ab287e932..8cdc987de963 100644 --- a/include/linux/prmt.h +++ b/include/linux/prmt.h @@ -4,9 +4,11 @@ #ifdef CONFIG_ACPI_PRMT void init_prmt(void); +bool acpi_prm_handler_available(const guid_t *handler_guid); int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer); #else static inline void init_prmt(void) { } +static inline bool acpi_prm_handler_available(const guid_t *handler_guid) { return false; } static inline int acpi_call_prm_handler(guid_t handler_guid, void *param_buffer) { return -EOPNOTSUPP; -- cgit v1.2.3 From 0bfc6758f213a701bd662982de86f0032b51f18c Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 15 Oct 2025 11:27:30 -0400 Subject: PCI: endpoint: Add pci_epf_assign_bar_space() API Add pci_epf_assign_bar_space() API to allow setting any MMIO address as the BAR memory space, such as an MSI message base address. This API also conforms to the BAR base address and size alignment restrictions enforced by the PCI spec r6.0, sec 7.5.1.2.1. 
Signed-off-by: Frank Li [mani: removed unused epc var, reworded kdoc, comments and description] Signed-off-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20251015-vntb_msi_doorbell-v6-3-9230298b1910@nxp.com --- include/linux/pci-epf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h index 4022dd080e20..48f68c4dcfa5 100644 --- a/include/linux/pci-epf.h +++ b/include/linux/pci-epf.h @@ -242,6 +242,12 @@ void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar, void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar, enum pci_epc_interface_type type); +int pci_epf_assign_bar_space(struct pci_epf *epf, size_t size, + enum pci_barno bar, + const struct pci_epc_features *epc_features, + enum pci_epc_interface_type type, + dma_addr_t bar_addr); + int pci_epf_align_inbound_addr(struct pci_epf *epf, enum pci_barno bar, u64 addr, dma_addr_t *base, size_t *off); int pci_epf_bind(struct pci_epf *epf); -- cgit v1.2.3 From 013a3a66f25af3fb614f45df43983657514944c4 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:54:55 +0100 Subject: regmap: sdw-mbq: Don't assume the regmap device is the SoundWire slave Currently, the code assumes that the device that registered the MBQ register map is the actual SoundWire slave device. This works fine for all current users, however future SDCA devices will likely be implemented with the SoundWire slave as a parent device and separate child drivers with regmaps for each audio Function. Update the regmap_init_sdw_mbq_cfg macro to allow these two to be specified separately. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/regmap.h | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4e1ac1fbcec4..70daec535976 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -676,7 +676,7 @@ struct regmap *__regmap_init_sdw(struct sdw_slave *sdw, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); -struct regmap *__regmap_init_sdw_mbq(struct sdw_slave *sdw, +struct regmap *__regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, @@ -738,7 +738,7 @@ struct regmap *__devm_regmap_init_sdw(struct sdw_slave *sdw, const struct regmap_config *config, struct lock_class_key *lock_key, const char *lock_name); -struct regmap *__devm_regmap_init_sdw_mbq(struct sdw_slave *sdw, +struct regmap *__devm_regmap_init_sdw_mbq(struct device *dev, struct sdw_slave *sdw, const struct regmap_config *config, const struct regmap_sdw_mbq_cfg *mbq_config, struct lock_class_key *lock_key, @@ -970,7 +970,7 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ - sdw, config, NULL) + &sdw->dev, sdw, config, NULL) /** * regmap_init_sdw_mbq_cfg() - Initialise MBQ SDW register map with config @@ -983,9 +983,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); * to a struct regmap. The regmap will be automatically freed by the * device management code. 
*/ -#define regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ +#define regmap_init_sdw_mbq_cfg(dev, sdw, config, mbq_config) \ __regmap_lockdep_wrapper(__regmap_init_sdw_mbq, #config, \ - sdw, config, mbq_config) + dev, sdw, config, mbq_config) /** * regmap_init_spi_avmm() - Initialize register map for Intel SPI Slave @@ -1198,12 +1198,13 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); */ #define devm_regmap_init_sdw_mbq(sdw, config) \ __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, #config, \ - sdw, config, NULL) + &sdw->dev, sdw, config, NULL) /** * devm_regmap_init_sdw_mbq_cfg() - Initialise managed MBQ SDW register map with config * - * @sdw: Device that will be interacted with + * @dev: Device that will be interacted with + * @sdw: SoundWire Device that will be interacted with * @config: Configuration for register map * @mbq_config: Properties for the MBQ registers * @@ -1211,9 +1212,9 @@ bool regmap_ac97_default_volatile(struct device *dev, unsigned int reg); * to a struct regmap. The regmap will be automatically freed by the * device management code. */ -#define devm_regmap_init_sdw_mbq_cfg(sdw, config, mbq_config) \ - __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, \ - #config, sdw, config, mbq_config) +#define devm_regmap_init_sdw_mbq_cfg(dev, sdw, config, mbq_config) \ + __regmap_lockdep_wrapper(__devm_regmap_init_sdw_mbq, \ + #config, dev, sdw, config, mbq_config) /** * devm_regmap_init_slimbus() - Initialise managed register map -- cgit v1.2.3 From 7159816707dc7040fe3ac4fa3d7ac3d173bd772a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:54:57 +0100 Subject: ASoC: SDCA: Pass SoundWire slave to HID The SDCA HID code can't assume that the struct device it is passed is the SoundWire slave device. HID is represented by a Function in SDCA and will thus likely be implemented by a child driver. Update the code to explicitly pass in the SoundWire slave device. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-5-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 2 +- include/sound/sdca_hid.h | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index ea68856e4c8c..51e12fcfc53c 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -1332,7 +1332,7 @@ static inline u32 sdca_range_search(struct sdca_control_range *range, return 0; } -int sdca_parse_function(struct device *dev, +int sdca_parse_function(struct device *dev, struct sdw_slave *sdw, struct sdca_function_desc *desc, struct sdca_function_data *function); diff --git a/include/sound/sdca_hid.h b/include/sound/sdca_hid.h index 8ab3e498884e..3a155835e035 100644 --- a/include/sound/sdca_hid.h +++ b/include/sound/sdca_hid.h @@ -12,10 +12,14 @@ #include #if IS_ENABLED(CONFIG_SND_SOC_SDCA_HID) -int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity); + +int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw, + struct sdca_entity *entity); #else -static inline int sdca_add_hid_device(struct device *dev, struct sdca_entity *entity) + +static inline int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw, + struct sdca_entity *entity) { return 0; } -- cgit v1.2.3 From 390c05f47d0749b24db65586482308c5fd680fe5 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:54:58 +0100 Subject: ASoC: SDCA: Pass device register map from IRQ alloc to handlers Store a copy of the device register map in the structure for the IRQ handlers. This will allow the individual IRQ handlers access to the device level register map if required. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-6-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_interrupts.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h index bbbc3ab27eba..d652c6e94ddc 100644 --- a/include/sound/sdca_interrupts.h +++ b/include/sound/sdca_interrupts.h @@ -23,6 +23,7 @@ struct sdca_function_data; /** * struct sdca_interrupt - contains information about a single SDCA interrupt * @name: The name of the interrupt. + * @device_regmap: Pointer to the IRQ regmap. * @component: Pointer to the ASoC component owns the interrupt. * @function: Pointer to the Function that the interrupt is associated with. * @entity: Pointer to the Entity that the interrupt is associated with. @@ -35,6 +36,7 @@ struct sdca_function_data; struct sdca_interrupt { const char *name; + struct regmap *device_regmap; struct snd_soc_component *component; struct sdca_function_data *function; struct sdca_entity *entity; -- cgit v1.2.3 From 56bbda23d4bece7ce998666118a068e4f71d59fb Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:54:59 +0100 Subject: ASoC: SDCA: Update externally_requested flag to cover all requests Currently there is a flag to indicate if an IRQ has been requested by something outside the SDCA core, such that the core can skip requesting that IRQ. However, it is simpler and more useful to always store the allocated IRQ number. This will allow the core to see if the IRQ has been requested, to perform additional operations on the IRQ, and request IRQs in multiple phases. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-7-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_interrupts.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h index d652c6e94ddc..e4bf123936bb 100644 --- a/include/sound/sdca_interrupts.h +++ b/include/sound/sdca_interrupts.h @@ -29,9 +29,8 @@ struct sdca_function_data; * @entity: Pointer to the Entity that the interrupt is associated with. * @control: Pointer to the Control that the interrupt is associated with. * @priv: Pointer to private data for use by the handler. - * @externally_requested: Internal flag used to check if a client driver has - * already requested the interrupt, for custom handling, allowing the core to - * skip handling this interrupt. + * @irq: IRQ number allocated to this interrupt, also used internally to track + * the IRQ being assigned. */ struct sdca_interrupt { const char *name; @@ -44,7 +43,7 @@ struct sdca_interrupt { void *priv; - bool externally_requested; + int irq; }; /** -- cgit v1.2.3 From dfe7c3401ed3d3bd8e61be8d6d452896513eb52e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:01 +0100 Subject: ASoC: SDCA: Rely less on the ASoC component in IRQ handling In the future some IRQs (mostly the UMPs used during File DownLoad) will need to run after the device has enumerated on the bus but before the soundcard is actually constructed. As such refactor more of the IRQ handling to use raw device and regmap pointers, rather than accessing things through the component. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-9-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_interrupts.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h index e4bf123936bb..3983f515349a 100644 --- a/include/sound/sdca_interrupts.h +++ b/include/sound/sdca_interrupts.h @@ -23,7 +23,9 @@ struct sdca_function_data; /** * struct sdca_interrupt - contains information about a single SDCA interrupt * @name: The name of the interrupt. + * @dev: Pointer to the Function device. * @device_regmap: Pointer to the IRQ regmap. + * @function_regmap: Pointer to the SDCA Function regmap. * @component: Pointer to the ASoC component owns the interrupt. * @function: Pointer to the Function that the interrupt is associated with. * @entity: Pointer to the Entity that the interrupt is associated with. 
@@ -35,7 +37,9 @@ struct sdca_function_data; struct sdca_interrupt { const char *name; + struct device *dev; struct regmap *device_regmap; + struct regmap *function_regmap; struct snd_soc_component *component; struct sdca_function_data *function; struct sdca_entity *entity; @@ -65,7 +69,8 @@ struct sdca_interrupt_info { int sdca_irq_request(struct device *dev, struct sdca_interrupt_info *interrupt_info, int sdca_irq, const char *name, irq_handler_t handler, void *data); -int sdca_irq_data_populate(struct snd_soc_component *component, +int sdca_irq_data_populate(struct device *dev, struct regmap *function_regmap, + struct snd_soc_component *component, struct sdca_function_data *function, struct sdca_entity *entity, struct sdca_control *control, -- cgit v1.2.3 From c7b6c6b60594fd1efe35c61bc6a2176b25263ccc Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:02 +0100 Subject: ASoC: SDCA: Force some SDCA Controls to be volatile Whilst SDCA does specify an Access Mode for each Control, there is not a 1-to-1 mapping between that and ASoC's internal representation. Some registers require being treated as volatile from the host's perspective even if their Access Mode is Read-Write. Add an explicit list of SDCA controls that should be forced volatile. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-10-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 51e12fcfc53c..ab9af84082c9 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -771,6 +771,7 @@ struct sdca_control { u8 layers; bool deferrable; + bool is_volatile; bool has_default; bool has_fixed; }; -- cgit v1.2.3 From 0a5e9769d088bd1d8faf01207210911b9341b62c Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:03 +0100 Subject: ASoC: SDCA: Parse XU Entity properties Parse the DisCo properties for XU Entities. Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-11-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index ab9af84082c9..f2ce13162151 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -1090,6 +1090,27 @@ struct sdca_entity_hide { struct hid_descriptor hid_desc; }; +/** + * enum sdca_xu_reset_mechanism - SDCA FDL Resets + */ +enum sdca_xu_reset_mechanism { + SDCA_XU_RESET_FUNCTION = 0x0, + SDCA_XU_RESET_DEVICE = 0x1, + SDCA_XU_RESET_BUS = 0x2, +}; + +/** + * struct sdca_entity_xu - information specific to XU Entities + * @max_delay: the maximum time in microseconds allowed for the Device + * to change the ownership from Device to Host + * @reset_mechanism: indicates the type of reset that can be requested + * at the end of an FDL. 
+ */ +struct sdca_entity_xu { + unsigned int max_delay; + enum sdca_xu_reset_mechanism reset_mechanism; +}; + /** * struct sdca_entity - information for one SDCA Entity * @label: String such as "OT 12". @@ -1106,6 +1127,7 @@ struct sdca_entity_hide { * @pde: Power Domain Entity specific Entity properties. * @ge: Group Entity specific Entity properties. * @hide: HIDE Entity specific Entity properties. + * @xu: XU Entity specific Entity properties. */ struct sdca_entity { const char *label; @@ -1123,6 +1145,7 @@ struct sdca_entity { struct sdca_entity_pde pde; struct sdca_entity_ge ge; struct sdca_entity_hide hide; + struct sdca_entity_xu xu; }; }; -- cgit v1.2.3 From 7b6be935e7eff06025e18cea4c6620194450abe2 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:04 +0100 Subject: ASoC: SDCA: Parse Function Reset max delay Parse the DisCo property to get the timeout for a Function Reset. Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-12-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index f2ce13162151..2e988a30481c 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -1323,6 +1323,8 @@ enum sdca_cluster_range { * @num_clusters: Number of Channel Clusters reported in this Function. * @busy_max_delay: Maximum Function busy delay in microseconds, before an * error should be reported. + * @reset_max_delay: Maximum Function reset delay in microseconds, before an + * error should be reported. 
*/ struct sdca_function_data { struct sdca_function_desc *desc; @@ -1335,6 +1337,7 @@ struct sdca_function_data { int num_clusters; unsigned int busy_max_delay; + unsigned int reset_max_delay; }; static inline u32 sdca_range(struct sdca_control_range *range, -- cgit v1.2.3 From daab108504be73182c16a72b9cfe47ac3b1928ca Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:05 +0100 Subject: ASoC: SDCA: Add UMP buffer helper functions Add helper functions for handling Universal Message Passing (UMP) buffers on SDCA devices. These are generic mechanisms to pass blocks of binary data between the host and the device, in both directions. They are used for things like passing HID descriptors and the File Download process. Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-13-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 26 +++++++++++++++++++++++++ include/sound/sdca_ump.h | 45 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 include/sound/sdca_ump.h (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 2e988a30481c..6dd44a7a8a35 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -133,6 +133,32 @@ struct sdca_init_write { #define SDCA_CTL_TYPE_S(ent, sel) SDCA_CTL_TYPE(SDCA_ENTITY_TYPE_##ent, \ SDCA_CTL_##ent##_##sel) +/** + * enum sdca_messageoffset_range - Column definitions UMP MessageOffset + */ +enum sdca_messageoffset_range { + SDCA_MESSAGEOFFSET_BUFFER_START_ADDRESS = 0, + SDCA_MESSAGEOFFSET_BUFFER_LENGTH = 1, + SDCA_MESSAGEOFFSET_UMP_MODE = 2, + SDCA_MESSAGEOFFSET_NCOLS = 3, +}; + +/** + * enum sdca_ump_mode - SDCA UMP Mode + */ +enum sdca_ump_mode { + SDCA_UMP_MODE_DIRECT = 0x00, + SDCA_UMP_MODE_INDIRECT = 0x01, +}; + +/** + * enum sdca_ump_owner - SDCA UMP Owner + */ +enum 
sdca_ump_owner { + SDCA_UMP_OWNER_HOST = 0x00, + SDCA_UMP_OWNER_DEVICE = 0x01, +}; + /** * enum sdca_it_controls - SDCA Controls for Input Terminal * diff --git a/include/sound/sdca_ump.h b/include/sound/sdca_ump.h new file mode 100644 index 000000000000..b2363199d19a --- /dev/null +++ b/include/sound/sdca_ump.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The MIPI SDCA specification is available for public downloads at + * https://www.mipi.org/mipi-sdca-v1-0-download + * + * Copyright (C) 2025 Cirrus Logic, Inc. and + * Cirrus Logic International Semiconductor Ltd. + */ + +#ifndef __SDCA_UMP_H__ +#define __SDCA_UMP_H__ + +struct regmap; +struct sdca_control; +struct sdca_entity; +struct sdca_function_data; +struct snd_soc_component; + +int sdca_ump_get_owner_host(struct device *dev, + struct regmap *function_regmap, + struct sdca_function_data *function, + struct sdca_entity *entity, + struct sdca_control *control); +int sdca_ump_set_owner_device(struct device *dev, + struct regmap *function_regmap, + struct sdca_function_data *function, + struct sdca_entity *entity, + struct sdca_control *control); +int sdca_ump_read_message(struct device *dev, + struct regmap *device_regmap, + struct regmap *function_regmap, + struct sdca_function_data *function, + struct sdca_entity *entity, + unsigned int offset_sel, unsigned int length_sel, + void **msg); +int sdca_ump_write_message(struct device *dev, + struct regmap *device_regmap, + struct regmap *function_regmap, + struct sdca_function_data *function, + struct sdca_entity *entity, + unsigned int offset_sel, unsigned int msg_offset, + unsigned int length_sel, + void *msg, int msg_len); + +#endif // __SDCA_UMP_H__ -- cgit v1.2.3 From c4d096c3ca425562192a3626c30e82651d0f2c1c Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Mon, 20 Oct 2025 16:55:06 +0100 Subject: ASoC: SDCA: Add SDCA FDL data parsing Add parsing of ACPI DisCo information specific to FDL (File DownLoad). 
DisCo contains a list of File Sets which can be requested by the device and within each of those a list of individual files to be downloaded to the device. Optionally the contents of the files may also be present in a special ACPI table, called SWFT (SoundWire File Table). Reviewed-by: Bard Liao Signed-off-by: Maciej Strozek Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-14-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca.h | 5 +++++ include/sound/sdca_function.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/include/sound/sdca.h b/include/sound/sdca.h index 9c6a351c9d47..d38cdbfeb35f 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -12,6 +12,7 @@ #include #include +struct acpi_table_swft; struct sdw_slave; #define SDCA_MAX_FUNCTION_COUNT 8 @@ -37,11 +38,13 @@ struct sdca_function_desc { * @num_functions: Total number of supported SDCA functions. Invalid/unsupported * functions will be skipped. * @function: Array of function descriptors. + * @swft: Pointer to the SWFT table, if available. 
*/ struct sdca_device_data { u32 interface_revision; int num_functions; struct sdca_function_desc function[SDCA_MAX_FUNCTION_COUNT]; + struct acpi_table_swft *swft; }; enum sdca_quirk { @@ -52,12 +55,14 @@ enum sdca_quirk { #if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SND_SOC_SDCA) void sdca_lookup_functions(struct sdw_slave *slave); +void sdca_lookup_swft(struct sdw_slave *slave); void sdca_lookup_interface_revision(struct sdw_slave *slave); bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk); #else static inline void sdca_lookup_functions(struct sdw_slave *slave) {} +static inline void sdca_lookup_swft(struct sdw_slave *slave) {} static inline void sdca_lookup_interface_revision(struct sdw_slave *slave) {} static inline bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk) { diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 6dd44a7a8a35..f557206cec83 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -13,6 +13,7 @@ #include #include +struct acpi_table_swft; struct device; struct sdca_entity; struct sdca_function_desc; @@ -1338,6 +1339,42 @@ enum sdca_cluster_range { SDCA_CLUSTER_NCOLS = 2, }; +/** + * struct sdca_fdl_file - information about a file from a fileset used in FDL + * @vendor_id: Vendor ID of the file. + * @file_id: File ID of the file. + * @fdl_offset: Offset information for FDL. + */ +struct sdca_fdl_file { + u16 vendor_id; + u32 file_id; + u32 fdl_offset; +}; + +/** + * struct sdca_fdl_set - information about a set of files used in FDL + * @files: Array of files in this FDL set. + * @num_files: Number of files in this FDL set. + * @id: ID of the FDL set. + */ +struct sdca_fdl_set { + struct sdca_fdl_file *files; + int num_files; + u32 id; +}; + +/** + * struct sdca_fdl_data - information about a function's FDL data + * @swft: Pointer to the SoundWire File Table. + * @sets: Array of FDL sets used by this function. 
+ * @num_sets: Number of FDL sets used by this function. + */ +struct sdca_fdl_data { + struct acpi_table_swft *swft; + struct sdca_fdl_set *sets; + int num_sets; +}; + /** * struct sdca_function_data - top-level information for one SDCA function * @desc: Pointer to short descriptor from initial parsing. @@ -1351,6 +1388,7 @@ enum sdca_cluster_range { * error should be reported. * @reset_max_delay: Maximum Function reset delay in microseconds, before an * error should be reported. + * @fdl_data: FDL data for this Function, if available. */ struct sdca_function_data { struct sdca_function_desc *desc; @@ -1364,6 +1402,8 @@ struct sdca_function_data { unsigned int busy_max_delay; unsigned int reset_max_delay; + + struct sdca_fdl_data fdl_data; }; static inline u32 sdca_range(struct sdca_control_range *range, -- cgit v1.2.3 From 71f7990a34cdb11f82d3cbbcddaca77a55635466 Mon Sep 17 00:00:00 2001 From: Maciej Strozek Date: Mon, 20 Oct 2025 16:55:07 +0100 Subject: ASoC: SDCA: Add FDL library for XU entities Some instances of the XU Entity have a need for Files to be downloaded from the Host. In these XUs, there is one instance of a Host to Device (Consumer) UMP, identified by the FDL_CurrentOwner Control. FDL Library introduced here implements the FDL flow triggered by FDL_CurrentOwner irq, which sends a file from SoundWire File Table (SWFT) or from the firmware directory in specific cases, to the Device FDL UMP. Currently only Direct method of FDL is implemented. 
Reviewed-by: Bard Liao Signed-off-by: Maciej Strozek Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-15-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_fdl.h | 58 +++++++++++++++++++++++++++++++++++++++++++ include/sound/sdca_function.h | 24 ++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 include/sound/sdca_fdl.h (limited to 'include') diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h new file mode 100644 index 000000000000..8b025aff4a0c --- /dev/null +++ b/include/sound/sdca_fdl.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * The MIPI SDCA specification is available for public downloads at + * https://www.mipi.org/mipi-sdca-v1-0-download + * + * Copyright (C) 2025 Cirrus Logic, Inc. and + * Cirrus Logic International Semiconductor Ltd. + */ + +#ifndef __SDCA_FDL_H__ +#define __SDCA_FDL_H__ + +struct device; +struct regmap; +struct sdca_fdl_set; +struct sdca_function_data; +struct sdca_interrupt; + +/** + * struct fdl_state - FDL state structure to keep data between interrupts + * @set: Pointer to the FDL set currently being downloaded. + * @file_index: Index of the current file being processed. 
+ */ +struct fdl_state { + struct sdca_fdl_set *set; + int file_index; +}; + +#define SDCA_CTL_XU_FDLH_COMPLETE 0 +#define SDCA_CTL_XU_FDLH_MORE_FILES SDCA_CTL_XU_FDLH_SET_IN_PROGRESS +#define SDCA_CTL_XU_FDLH_FILE_AVAILABLE (SDCA_CTL_XU_FDLH_TRANSFERRED_FILE | \ + SDCA_CTL_XU_FDLH_SET_IN_PROGRESS) +#define SDCA_CTL_XU_FDLH_MASK (SDCA_CTL_XU_FDLH_TRANSFERRED_CHUNK | \ + SDCA_CTL_XU_FDLH_TRANSFERRED_FILE | \ + SDCA_CTL_XU_FDLH_SET_IN_PROGRESS | \ + SDCA_CTL_XU_FDLH_RESET_ACK | \ + SDCA_CTL_XU_FDLH_REQ_ABORT) + +#define SDCA_CTL_XU_FDLD_COMPLETE 0 +#define SDCA_CTL_XU_FDLD_FILE_OK (SDCA_CTL_XU_FDLH_TRANSFERRED_FILE | \ + SDCA_CTL_XU_FDLH_SET_IN_PROGRESS | \ + SDCA_CTL_XU_FDLD_ACK_TRANSFER | \ + SDCA_CTL_XU_FDLD_NEEDS_SET) +#define SDCA_CTL_XU_FDLD_MORE_FILES_OK (SDCA_CTL_XU_FDLH_SET_IN_PROGRESS | \ + SDCA_CTL_XU_FDLD_ACK_TRANSFER | \ + SDCA_CTL_XU_FDLD_NEEDS_SET) +#define SDCA_CTL_XU_FDLD_MASK (SDCA_CTL_XU_FDLD_REQ_RESET | \ + SDCA_CTL_XU_FDLD_REQ_ABORT | \ + SDCA_CTL_XU_FDLD_ACK_TRANSFER | \ + SDCA_CTL_XU_FDLD_NEEDS_SET) + +int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt); +int sdca_fdl_process(struct sdca_interrupt *interrupt); + +int sdca_reset_function(struct device *dev, struct sdca_function_data *function, + struct regmap *regmap); + +#endif // __SDCA_FDL_H__ diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index f557206cec83..99cb978f7099 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -285,6 +285,27 @@ enum sdca_xu_controls { SDCA_CTL_XU_FDL_STATUS = 0x14, SDCA_CTL_XU_FDL_SET_INDEX = 0x15, SDCA_CTL_XU_FDL_HOST_REQUEST = 0x16, + + /* FDL Status Host->Device bit definitions */ + SDCA_CTL_XU_FDLH_TRANSFERRED_CHUNK = BIT(0), + SDCA_CTL_XU_FDLH_TRANSFERRED_FILE = BIT(1), + SDCA_CTL_XU_FDLH_SET_IN_PROGRESS = BIT(2), + SDCA_CTL_XU_FDLH_RESET_ACK = BIT(4), + SDCA_CTL_XU_FDLH_REQ_ABORT = BIT(5), + /* FDL Status Device->Host bit definitions */ + SDCA_CTL_XU_FDLD_REQ_RESET = BIT(4), + 
SDCA_CTL_XU_FDLD_REQ_ABORT = BIT(5), + SDCA_CTL_XU_FDLD_ACK_TRANSFER = BIT(6), + SDCA_CTL_XU_FDLD_NEEDS_SET = BIT(7), +}; + +/** + * enum sdca_fdl_set_index_range - Column definitions UMP SetIndex + */ +enum sdca_fdl_set_index_range { + SDCA_FDL_SET_INDEX_SET_NUMBER = 0, + SDCA_FDL_SET_INDEX_FILE_SET_ID = 1, + SDCA_FDL_SET_INDEX_NCOLS = 2, }; /** @@ -569,6 +590,9 @@ enum sdca_entity0_controls { SDCA_CTL_ENTITY_0_FUNCTION_NEEDS_INITIALIZATION = BIT(5), SDCA_CTL_ENTITY_0_FUNCTION_HAS_BEEN_RESET = BIT(6), SDCA_CTL_ENTITY_0_FUNCTION_BUSY = BIT(7), + + /* Function Action Bits */ + SDCA_CTL_ENTITY_0_RESET_FUNCTION_NOW = BIT(0), }; #define SDCA_CTL_MIC_BIAS_NAME "Mic Bias" -- cgit v1.2.3 From 0723affa1bee50c3bd7ca00e00dee07fcef224b8 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:09 +0100 Subject: ASoC: SDCA: Add completion for FDL start and stop Add some completions and a helper function to allow other parts of the system to wait for FDL to complete. The sdca_fdl_sync() function will wait until it completes a full timeout without a new FDL request happening; this ensures that even parts requiring multiple rounds of FDL should be fully downloaded before the driver boot continues. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-17-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_fdl.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h index 8b025aff4a0c..4ea000d6acef 100644 --- a/include/sound/sdca_fdl.h +++ b/include/sound/sdca_fdl.h @@ -10,18 +10,26 @@ #ifndef __SDCA_FDL_H__ #define __SDCA_FDL_H__ +#include + struct device; struct regmap; struct sdca_fdl_set; struct sdca_function_data; struct sdca_interrupt; +struct sdca_interrupt_info; /** * struct fdl_state - FDL state structure to keep data between interrupts + * @begin: Completion indicating the start of an FDL download cycle. + * @done: Completion indicating the end of an FDL download cycle. * @set: Pointer to the FDL set currently being downloaded. * @file_index: Index of the current file being processed. */ struct fdl_state { + struct completion begin; + struct completion done; + struct sdca_fdl_set *set; int file_index; }; @@ -51,6 +59,8 @@ struct fdl_state { int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt); int sdca_fdl_process(struct sdca_interrupt *interrupt); +int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function, + struct sdca_interrupt_info *info); int sdca_reset_function(struct device *dev, struct sdca_function_data *function, struct regmap *regmap); -- cgit v1.2.3 From e92e25f777483b7cc3e170214cc84337d7a415cf Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:10 +0100 Subject: ASoC: SDCA: Add UMP timeout handling for FDL Several of the UMP transactions in the FDL process should timeout if the device does not respond within a certain time, add handling into the UMP helpers and the FDL code to handle this. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-18-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_fdl.h | 7 +++++++ include/sound/sdca_ump.h | 5 +++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h index 4ea000d6acef..f4ba809cb203 100644 --- a/include/sound/sdca_fdl.h +++ b/include/sound/sdca_fdl.h @@ -11,6 +11,7 @@ #define __SDCA_FDL_H__ #include +#include struct device; struct regmap; @@ -23,13 +24,19 @@ struct sdca_interrupt_info; * struct fdl_state - FDL state structure to keep data between interrupts * @begin: Completion indicating the start of an FDL download cycle. * @done: Completion indicating the end of an FDL download cycle. + * @timeout: Delayed work used for timing out UMP transactions. + * @lock: Mutex to protect between the timeout work and IRQ handlers. + * @interrupt: Pointer to the interrupt struct to which this FDL is attached. * @set: Pointer to the FDL set currently being downloaded. * @file_index: Index of the current file being processed. 
*/ struct fdl_state { struct completion begin; struct completion done; + struct delayed_work timeout; + struct mutex lock; + struct sdca_interrupt *interrupt; struct sdca_fdl_set *set; int file_index; }; diff --git a/include/sound/sdca_ump.h b/include/sound/sdca_ump.h index b2363199d19a..f54f9d48c64c 100644 --- a/include/sound/sdca_ump.h +++ b/include/sound/sdca_ump.h @@ -15,6 +15,7 @@ struct sdca_control; struct sdca_entity; struct sdca_function_data; struct snd_soc_component; +struct delayed_work; int sdca_ump_get_owner_host(struct device *dev, struct regmap *function_regmap, @@ -42,4 +43,8 @@ int sdca_ump_write_message(struct device *dev, unsigned int length_sel, void *msg, int msg_len); +void sdca_ump_cancel_timeout(struct delayed_work *work); +void sdca_ump_schedule_timeout(struct delayed_work *work, + unsigned int timeout_us); + #endif // __SDCA_UMP_H__ -- cgit v1.2.3 From 12aa3160c10a3179c73c4f99a2d5aec0fd907d0c Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:11 +0100 Subject: ASoC: SDCA: Add early IRQ handling Some IRQs (FDL) require processing before the primary soundcard is brought up, as the downloaded files could be firmware required for operation of the audio functions of the device. Add a new helper function which registers the required IRQs. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-19-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_interrupts.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_interrupts.h b/include/sound/sdca_interrupts.h index 3983f515349a..8f13417d129a 100644 --- a/include/sound/sdca_interrupts.h +++ b/include/sound/sdca_interrupts.h @@ -75,6 +75,9 @@ int sdca_irq_data_populate(struct device *dev, struct regmap *function_regmap, struct sdca_entity *entity, struct sdca_control *control, struct sdca_interrupt *interrupt); +int sdca_irq_populate_early(struct device *dev, struct regmap *function_regmap, + struct sdca_function_data *function, + struct sdca_interrupt_info *info); int sdca_irq_populate(struct sdca_function_data *function, struct snd_soc_component *component, struct sdca_interrupt_info *info); -- cgit v1.2.3 From ef042df96d0e1089764f39ede61bc8f140a4be00 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 20 Oct 2025 16:55:12 +0100 Subject: ASoC: SDCA: Add HID button IRQ Now full support for the UMP buffers is available, it is possible to read the SDCA HID descriptors from the device and pass them to user-space. Add a helper function to process HID events from an SDCA device. 
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Reviewed-by: Pierre-Louis Bossart Link: https://patch.msgid.link/20251020155512.353774-20-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_hid.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/sdca_hid.h b/include/sound/sdca_hid.h index 3a155835e035..18bebbe428c9 100644 --- a/include/sound/sdca_hid.h +++ b/include/sound/sdca_hid.h @@ -8,13 +8,17 @@ #ifndef __SDCA_HID_H__ #define __SDCA_HID_H__ -#include -#include +struct device; +struct sdw_slave; + +struct sdca_entity; +struct sdca_interrupt; #if IS_ENABLED(CONFIG_SND_SOC_SDCA_HID) int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw, struct sdca_entity *entity); +int sdca_hid_process_report(struct sdca_interrupt *interrupt); #else @@ -24,6 +28,11 @@ static inline int sdca_add_hid_device(struct device *dev, struct sdw_slave *sdw, return 0; } +static inline int sdca_hid_process_report(struct sdca_interrupt *interrupt) +{ + return 0; +} + #endif #endif /* __SDCA_HID_H__ */ -- cgit v1.2.3 From 8d748955279cfe1996e51ac51a4f746468614a10 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Mon, 27 Oct 2025 08:18:14 +0800 Subject: asm-generic: percpu: Add assembly guard Currently, asm/percpu.h is directly or indirectly included by some assembly files on x86. Some of them (e.g., checksum_32.S) are also used on um. But x86 and um provide different versions of asm/percpu.h -- um uses asm-generic/percpu.h directly. When SMP is enabled, asm-generic/percpu.h will introduce C code that cannot be assembled. Since asm-generic/percpu.h currently is not designed for use in assembly, and these assembly files do not actually need asm/percpu.h on um, let's add the assembly guard in asm-generic/percpu.h to fix this issue. 
Cc: Arnd Bergmann Cc: linux-arch@vger.kernel.org Signed-off-by: Tiwei Bie Acked-by: Arnd Bergmann Link: https://patch.msgid.link/20251027001815.1666872-8-tiwei.bie@linux.dev Signed-off-by: Johannes Berg --- include/asm-generic/percpu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index 02aeca21479a..6628670bcb90 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -2,6 +2,8 @@ #ifndef _ASM_GENERIC_PERCPU_H_ #define _ASM_GENERIC_PERCPU_H_ +#ifndef __ASSEMBLER__ + #include #include #include @@ -557,4 +559,5 @@ do { \ this_cpu_generic_cmpxchg(pcp, oval, nval) #endif +#endif /* __ASSEMBLER__ */ #endif /* _ASM_GENERIC_PERCPU_H_ */ -- cgit v1.2.3 From 87b0031f7f73dac2ebb874fc8f331a66ee3b5cbd Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:18 +0100 Subject: irqdomain: Add firmware info reporting interface Add an irqdomain callback to report firmware-provided information that is otherwise not available in a generic way. This is reported using a new data structure (struct irq_fwspec_info). This callback is optional and the only information that can be reported currently is the affinity of an interrupt. However, the containing structure is designed to be extensible, allowing other potentially relevant information to be reported in the future. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251020122944.3074811-2-maz@kernel.org --- include/linux/irqdomain.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 4a86e6b915dd..9d6a5e99394f 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -44,6 +44,23 @@ struct irq_fwspec { u32 param[IRQ_DOMAIN_IRQ_SPEC_PARAMS]; }; +/** + * struct irq_fwspec_info - firmware provided IRQ information structure + * + * @flags: Information validity flags + * @affinity: Affinity mask for this interrupt + * + * This structure reports firmware-specific information about an + * interrupt. The only significant information is the affinity of a + * per-CPU interrupt, but this is designed to be extended as required. + */ +struct irq_fwspec_info { + unsigned long flags; + const struct cpumask *affinity; +}; + +#define IRQ_FWSPEC_INFO_AFFINITY_VALID BIT(0) + /* Conversion function from of_phandle_args fields to fwspec */ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, unsigned int count, struct irq_fwspec *fwspec); @@ -69,6 +86,9 @@ void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, * @translate: Given @fwspec, decode the hardware irq number (@out_hwirq) and * linux irq type value (@out_type). This is a generalised @xlate * (over struct irq_fwspec) and is preferred if provided. + * @get_fwspec_info: + * Given @fwspec, report additional firmware-provided information in + * @info. Optional. * @debug_show: For domains to show specific data for an interrupt in debugfs. 
* * Functions below are provided by the driver and called whenever a new mapping @@ -96,6 +116,7 @@ struct irq_domain_ops { void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data); int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec, unsigned long *out_hwirq, unsigned int *out_type); + int (*get_fwspec_info)(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS void (*debug_show)(struct seq_file *m, struct irq_domain *d, @@ -602,6 +623,8 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, unsigned int irq_bas int irq_domain_disconnect_hierarchy(struct irq_domain *domain, unsigned int virq); +int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info); + static inline bool irq_domain_is_hierarchy(struct irq_domain *domain) { return domain->flags & IRQ_DOMAIN_FLAG_HIERARCHY; @@ -685,6 +708,10 @@ static inline bool irq_domain_is_msi_device(struct irq_domain *domain) return false; } +static inline int irq_populate_fwspec_info(struct irq_fwspec *fwspec, struct irq_fwspec_info *info) +{ + return -EINVAL; +} #endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ #ifdef CONFIG_GENERIC_MSI_IRQ -- cgit v1.2.3 From 5324fe21ba9b77b299c02191645a97777cdd73ac Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:19 +0100 Subject: ACPI: irq: Add interrupt affinity reporting interface Plug the irq_populate_fwspec_info() helper into the ACPI layer to offer an interrupt affinity reporting function. This is currently only supported for the CONFIG_ACPI_GENERIC_GSI configurations, but could later be extended to legacy architectures if necessary. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Acked-by: Rafael J. 
Wysocki (Intel) Link: https://patch.msgid.link/20251020122944.3074811-3-maz@kernel.org --- include/linux/acpi.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..607db773b672 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1509,12 +1509,19 @@ static inline int acpi_parse_spcr(bool enable_earlycon, bool enable_console) #if IS_ENABLED(CONFIG_ACPI_GENERIC_GSI) int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res); +const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index); #else static inline int acpi_irq_get(acpi_handle handle, unsigned int index, struct resource *res) { return -EINVAL; } +static inline const struct cpumask *acpi_irq_get_affinity(acpi_handle handle, + unsigned int index) +{ + return NULL; +} #endif #ifdef CONFIG_ACPI_LPIT -- cgit v1.2.3 From 5404f5c06dd41fd4445a01dec77a629e254a62e8 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:20 +0100 Subject: of/irq: Add interrupt affinity reporting interface Plug the irq_populate_fwspec_info() helper into the OF layer to offer an interrupt affinity reporting function. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251020122944.3074811-4-maz@kernel.org --- include/linux/of_irq.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1db8543dfc8a..1c2bc0281807 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -43,6 +43,8 @@ extern int of_irq_parse_one(struct device_node *device, int index, struct of_phandle_args *out_irq); extern int of_irq_count(struct device_node *dev); extern int of_irq_get(struct device_node *dev, int index); +extern const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); @@ -76,6 +78,11 @@ static inline int of_irq_get_byname(struct device_node *dev, const char *name) { return 0; } +static inline const struct cpumask *of_irq_get_affinity(struct device_node *dev, + int index) +{ + return NULL; +} static inline int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs) { -- cgit v1.2.3 From 0d5daa938c94b8b9183e9b257a88dc0929d59409 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:21 +0100 Subject: platform: Add firmware-agnostic irq and affinity retrieval interface Expand platform_get_irq_optional() to also return an affinity if available, renaming it to platform_get_irq_affinity() in the process. platform_get_irq_optional() is preserved with its current semantics by calling into the new helper with a NULL affinity pointer. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251020122944.3074811-5-maz@kernel.org --- include/linux/platform_device.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..ad66333ce85c 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -102,6 +102,8 @@ devm_platform_ioremap_resource_byname(struct platform_device *pdev, extern int platform_get_irq(struct platform_device *, unsigned int); extern int platform_get_irq_optional(struct platform_device *, unsigned int); +extern int platform_get_irq_affinity(struct platform_device *, unsigned int, + const struct cpumask **); extern int platform_irq_count(struct platform_device *); extern int devm_platform_get_irqs_affinity(struct platform_device *dev, struct irq_affinity *affd, -- cgit v1.2.3 From 5ff78c8de9d83ad6fc0553bf8f2edc816385837d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:28 +0100 Subject: genirq: Kill handle_percpu_devid_fasteoi_nmi() There is no in-tree user of this flow handler anymore, so simply remove it. 
Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-12-maz@kernel.org --- include/linux/irq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index c67e76fbcc07..b728c18f6ded 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -655,7 +655,6 @@ extern void handle_bad_irq(struct irq_desc *desc); extern void handle_nested_irq(unsigned int irq); extern void handle_fasteoi_nmi(struct irq_desc *desc); -extern void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc); extern int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg); extern int irq_chip_pm_get(struct irq_data *data); -- cgit v1.2.3 From 5c2b2cc472e015e79c4f0170893a1e0883bd3bb4 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:29 +0100 Subject: genirq: Merge irqaction::{dev_id,percpu_dev_id} When irqaction::percpu_dev_id was introduced, it was hoped that it could be part of an anonymous union with dev_id, as the two fields are mutually exclusive. However, toolchains used at the time were often showing terrible support for anonymous unions, breaking the build on a number of architectures. It was therefore decided to keep the two fields separate and address this down the line. 14 years later, the compiler dark age is over, and there is universal support for anonymous unions. Get a whole pointer back that can immediately be spent on something else. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-13-maz@kernel.org --- include/linux/interrupt.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 51b6484c0493..0ec1a71ab4e8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -121,8 +121,10 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); */ struct irqaction { irq_handler_t handler; - void *dev_id; - void __percpu *percpu_dev_id; + union { + void *dev_id; + void __percpu *percpu_dev_id; + }; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; -- cgit v1.2.3 From 258e7d28a3dcd389239f9688058140c1a418b549 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:31 +0100 Subject: genirq: Add affinity to percpu_devid interrupt requests Add an affinity field to both the irqaction structure and the interrupt request primitives. Nothing is making use of it yet, and the only value used is NULL, which is used as a shorthand for cpu_possible_mask. This will shortly get used with actual affinities.
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-15-maz@kernel.org --- include/linux/interrupt.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0ec1a71ab4e8..52147d5f432b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -125,6 +125,7 @@ struct irqaction { void *dev_id; void __percpu *percpu_dev_id; }; + const struct cpumask *affinity; struct irqaction *next; irq_handler_t thread_fn; struct task_struct *thread; @@ -181,7 +182,7 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check __request_percpu_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *devname, - void __percpu *percpu_dev_id); + const cpumask_t *affinity, void __percpu *percpu_dev_id); extern int __must_check request_nmi(unsigned int irq, irq_handler_t handler, unsigned long flags, @@ -192,7 +193,7 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id) { return __request_percpu_irq(irq, handler, 0, - devname, percpu_dev_id); + devname, NULL, percpu_dev_id); } extern int __must_check -- cgit v1.2.3 From b9c6aa9efc71dae656f9f913d1250ea08cd6e10f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:32 +0100 Subject: genirq: Update request_percpu_nmi() to take an affinity Continue spreading the notion of affinity to the per CPU interrupt request code by updating the call sites that use request_percpu_nmi() (all two of them) to take an affinity pointer. This pointer is firmly NULL for now. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-16-maz@kernel.org --- include/linux/interrupt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 52147d5f432b..81506ab759b8 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -197,8 +197,8 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, } extern int __must_check -request_percpu_nmi(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *dev); +request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, + const struct cpumask *affinity, void __percpu *dev_id); extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); -- cgit v1.2.3 From c734af3b2b95f0ac6ed87c50e7602a6beeaf534f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:34 +0100 Subject: genirq: Add request_percpu_irq_affinity() helper While it would be nice to simply make request_percpu_irq() take an affinity mask, the churn is likely to be on the irritating side given that most drivers do not give a damn about affinities. So take the more innocuous path to provide a helper that parallels request_percpu_irq(), with an affinity as a bonus argument. Yes, request_percpu_irq_affinity() is a bit of a mouthful. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-18-maz@kernel.org --- include/linux/interrupt.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 81506ab759b8..fa62ab556ee3 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -196,6 +196,15 @@ request_percpu_irq(unsigned int irq, irq_handler_t handler, devname, NULL, percpu_dev_id); } +static inline int __must_check +request_percpu_irq_affinity(unsigned int irq, irq_handler_t handler, + const char *devname, const cpumask_t *affinity, + void __percpu *percpu_dev_id) +{ + return __request_percpu_irq(irq, handler, 0, + devname, affinity, percpu_dev_id); +} + extern int __must_check request_percpu_nmi(unsigned int irq, irq_handler_t handler, const char *name, const struct cpumask *affinity, void __percpu *dev_id); -- cgit v1.2.3 From 54b350fa8e965dc59622698e2a18d6bf73944bf4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 20 Oct 2025 13:29:35 +0100 Subject: perf: arm_pmu: Request specific affinities for per CPU NMIs/interrupts Let the PMU driver request both NMIs and normal interrupts with an affinity mask matching the PMU affinity. 
Signed-off-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-19-maz@kernel.org --- include/linux/perf/arm_pmu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fc..6690bd77aa4e 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -190,8 +190,8 @@ bool arm_pmu_irq_is_nmi(void); struct arm_pmu *armpmu_alloc(void); void armpmu_free(struct arm_pmu *pmu); int armpmu_register(struct arm_pmu *pmu); -int armpmu_request_irq(int irq, int cpu); -void armpmu_free_irq(int irq, int cpu); +int armpmu_request_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); +void armpmu_free_irq(struct arm_pmu * __percpu *armpmu, int irq, int cpu); #define ARMV8_PMU_PDEV_NAME "armv8-pmu" -- cgit v1.2.3 From c620438ef2ac80b09269a9ae3c0b4fe5add19bfe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:40 +0100 Subject: irqchip: Kill irq-partition-percpu This code is now completely unused, and nobody will ever miss it. Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-24-maz@kernel.org --- include/linux/irqchip/irq-partition-percpu.h | 53 ---------------------------- 1 file changed, 53 deletions(-) delete mode 100644 include/linux/irqchip/irq-partition-percpu.h (limited to 'include') diff --git a/include/linux/irqchip/irq-partition-percpu.h b/include/linux/irqchip/irq-partition-percpu.h deleted file mode 100644 index b35ee22c278f..000000000000 --- a/include/linux/irqchip/irq-partition-percpu.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 ARM Limited, All Rights Reserved. 
- * Author: Marc Zyngier - */ - -#ifndef __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H -#define __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H - -#include -#include -#include - -struct partition_affinity { - cpumask_t mask; - void *partition_id; -}; - -struct partition_desc; - -#ifdef CONFIG_PARTITION_PERCPU -int partition_translate_id(struct partition_desc *desc, void *partition_id); -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops); -struct irq_domain *partition_get_domain(struct partition_desc *dsc); -#else -static inline int partition_translate_id(struct partition_desc *desc, - void *partition_id) -{ - return -EINVAL; -} - -static inline -struct partition_desc *partition_create_desc(struct fwnode_handle *fwnode, - struct partition_affinity *parts, - int nr_parts, - int chained_irq, - const struct irq_domain_ops *ops) -{ - return NULL; -} - -static inline -struct irq_domain *partition_get_domain(struct partition_desc *dsc) -{ - return NULL; -} -#endif - -#endif /* __LINUX_IRQCHIP_IRQ_PARTITION_PERCPU_H */ -- cgit v1.2.3 From ee2d50a9f524ae829d1a8ec296d7a0170e7b8ade Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:41 +0100 Subject: genirq: Kill irq_{g,s}et_percpu_devid_partition() These two helpers do not have any user anymore, and can be removed, together with the affinity field kept in the irqdesc structure. 
Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-25-maz@kernel.org --- include/linux/irq.h | 4 ---- include/linux/irqdesc.h | 1 - 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/irq.h b/include/linux/irq.h index b728c18f6ded..4a9f1d7b08c3 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -718,10 +718,6 @@ static inline void irq_set_chip_and_handler(unsigned int irq, } extern int irq_set_percpu_devid(unsigned int irq); -extern int irq_set_percpu_devid_partition(unsigned int irq, - const struct cpumask *affinity); -extern int irq_get_percpu_devid_partition(unsigned int irq, - struct cpumask *affinity); extern void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index fd091c35d572..37e0b5b5600a 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -82,7 +82,6 @@ struct irq_desc { int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; - const struct cpumask *percpu_affinity; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; struct irq_affinity_notify *affinity_notify; -- cgit v1.2.3 From ebac4649fcadc6047030810326875c6e612c7b2f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:42 +0100 Subject: irqdomain: Kill of_node_to_fwnode() helper There are no in-tree users of this helper since b13b41cc3dc18 ("misc: ti_fpc202: Switch to of_fwnode_handle()"), and it is replaced with of_fwnode_handle(). Get rid of it.
Suggested-by: Jonathan Cameron Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-26-maz@kernel.org --- include/linux/irqdomain.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 9d6a5e99394f..5907baf6099d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -730,12 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig } #endif -/* Deprecated functions. Will be removed in the merge window */ -static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node) -{ - return node ? &node->fwnode : NULL; -} - static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, const struct irq_domain_ops *ops, void *host_data) -- cgit v1.2.3 From fa9d2777387346645a40ab37cfb0c37b3ef40cc9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 20 Oct 2025 13:29:43 +0100 Subject: perf: arm_pmu: Kill last use of per-CPU cpu_armpmu pointer Having removed the use of the cpu_armpmu per-CPU variable from the interrupt handling, the only user left is the BRBE scheduler hook. It is easy to drop the use of this variable by following the pointer to the generic PMU structure, and get the arm_pmu structure from there. Perform the conversion and kill cpu_armpmu altogether. 
Suggested-by: Will Deacon Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Will Deacon Link: https://patch.msgid.link/20251020122944.3074811-27-maz@kernel.org --- include/linux/perf/arm_pmu.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 6690bd77aa4e..bab26a7d79f4 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -132,8 +132,6 @@ struct arm_pmu { #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) -DECLARE_PER_CPU(struct arm_pmu *, cpu_armpmu); - u64 armpmu_event_update(struct perf_event *event); int armpmu_event_set_period(struct perf_event *event); -- cgit v1.2.3 From 531b87d865eb9e625c2e46ec8f06a65a6157ee45 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:45 +0000 Subject: bpf: widen dynptr size/offset to 64 bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynptr currently caps size and offset at 24 bits, which isn’t sufficient for file-backed use cases; even 32 bits can be limiting. Refactor dynptr helpers/kfuncs to use 64-bit size and offset, ensuring consistency across the APIs. This change does not affect internals of xdp, skb or other dynptrs, which continue to behave as before. Also it does not break binary compatibility. The widening enables large-file access support via dynptr, implemented in the next patches. 
Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-3-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 20 ++++++++++---------- include/uapi/linux/bpf.h | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e53cda0aabb6..907c69295293 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1387,19 +1387,19 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_SKB_META, }; -int bpf_dynptr_check_size(u32 size); -u32 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); -const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len); -void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u32 len); +int bpf_dynptr_check_size(u64 size); +u64 __bpf_dynptr_size(const struct bpf_dynptr_kern *ptr); +const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u64 len); +void *__bpf_dynptr_data_rw(const struct bpf_dynptr_kern *ptr, u64 len); bool __bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr); -int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, - void *src, u32 len, u64 flags); -void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset, - void *buffer__opt, u32 buffer__szk); +int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u64 offset, + void *src, u64 len, u64 flags); +void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u64 offset, + void *buffer__opt, u64 buffer__szk); -static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static inline int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u64 offset, u64 len) { - u32 size = __bpf_dynptr_size(ptr); + u64 size = __bpf_dynptr_size(ptr); if (len > size || offset > size - len) return -E2BIG; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6829936d33f5..77edd0253989 100644 --- 
a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5618,7 +5618,7 @@ union bpf_attr { * Return * *sk* if casting is valid, or **NULL** otherwise. * - * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * long bpf_dynptr_from_mem(void *data, u64 size, u64 flags, struct bpf_dynptr *ptr) * Description * Get a dynptr to local memory *data*. * @@ -5661,7 +5661,7 @@ union bpf_attr { * Return * Nothing. Always succeeds. * - * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * long bpf_dynptr_read(void *dst, u64 len, const struct bpf_dynptr *src, u64 offset, u64 flags) * Description * Read *len* bytes from *src* into *dst*, starting from *offset* * into *src*. @@ -5671,7 +5671,7 @@ union bpf_attr { * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if * *flags* is not 0. * - * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u64 offset, void *src, u64 len, u64 flags) * Description * Write *len* bytes from *src* into *dst*, starting from *offset* * into *dst*. @@ -5692,7 +5692,7 @@ union bpf_attr { * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). * - * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u64 offset, u64 len) * Description * Get a pointer to the underlying dynptr data. * -- cgit v1.2.3 From 76e4fed847124690f7344a43d01dbcd7b2925353 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:46 +0000 Subject: lib: move freader into buildid.h Move struct freader and prototypes of the functions operating on it into the buildid.h. This allows reusing freader outside buildid, e.g. for file dynptr support added later. 
Signed-off-by: Mykyta Yatsenko Link: https://lore.kernel.org/r/20251026203853.135105-4-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/buildid.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 014a88c41073..831c1b4b626c 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -18,4 +18,29 @@ void init_vmlinux_build_id(void); static inline void init_vmlinux_build_id(void) { } #endif +struct freader { + void *buf; + u32 buf_sz; + int err; + union { + struct { + struct file *file; + struct folio *folio; + void *addr; + loff_t folio_off; + bool may_fault; + }; + struct { + const char *data; + u64 data_sz; + }; + }; +}; + +void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct file *file, bool may_fault); +void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz); +const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz); +void freader_cleanup(struct freader *r); + #endif -- cgit v1.2.3 From 8d8771dc03e48300e80b43744dd3c320ccaf746a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:49 +0000 Subject: bpf: add plumbing for file-backed dynptr Add the necessary verifier plumbing for the new file-backed dynptr type. Introduce two kfuncs for its lifecycle management: * bpf_dynptr_from_file() for initialization * bpf_dynptr_file_discard() for destruction Currently there is no mechanism for a kfunc to release a dynptr; this patch adds one: * Dynptr release function sets meta->release_regno * Call unmark_stack_slots_dynptr() if meta->release_regno is set and dynptr ref_obj_id is set as well.
Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-7-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 907c69295293..14f800773997 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -792,12 +792,15 @@ enum bpf_type_flag { /* DYNPTR points to skb_metadata_end()-skb_metadata_len() */ DYNPTR_TYPE_SKB_META = BIT(19 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to file */ + DYNPTR_TYPE_FILE = BIT(20 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; #define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ - | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META) + | DYNPTR_TYPE_XDP | DYNPTR_TYPE_SKB_META | DYNPTR_TYPE_FILE) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1385,6 +1388,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_XDP, /* Points to skb_metadata_end()-skb_metadata_len() */ BPF_DYNPTR_TYPE_SKB_META, + /* Underlying data is a file */ + BPF_DYNPTR_TYPE_FILE, }; int bpf_dynptr_check_size(u64 size); -- cgit v1.2.3 From 2c52e8943a437af6093d8b0f0920f1764f0e5f64 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Sun, 26 Oct 2025 20:38:52 +0000 Subject: bpf: dispatch to sleepable file dynptr File dynptr reads may sleep when the requested folios are not in the page cache. To avoid sleeping in non-sleepable contexts while still supporting valid sleepable use, given that dynptrs are non-sleepable by default, enable sleeping only when bpf_dynptr_from_file() is invoked from a sleepable context. 
This change: * Introduces a sleepable constructor: bpf_dynptr_from_file_sleepable() * Override non-sleepable constructor with sleepable if it's always called in sleepable context Signed-off-by: Mykyta Yatsenko Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251026203853.135105-10-mykyta.yatsenko5@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 14f800773997..a47d67db3be5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -670,6 +670,9 @@ static inline bool bpf_map_has_internal_structs(struct bpf_map *map) void bpf_map_free_internal_structs(struct bpf_map *map, void *obj); +int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags, + struct bpf_dynptr *ptr__uninit); + extern const struct bpf_map_ops bpf_map_offload_ops; /* bpf_type_flag contains a set of flags that are applicable to the values of -- cgit v1.2.3 From 457129aa3610f46bfa6f97725de731345d4aaef0 Mon Sep 17 00:00:00 2001 From: Jingyi Wang Date: Wed, 22 Oct 2025 21:57:36 -0700 Subject: dt-bindings: arm: qcom,ids: Add SoC ID for SM8850 Add the ID for the Qualcomm SM8850 SoC which represents the Kaanapali platform. 
Signed-off-by: Jingyi Wang Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20251022-knp-socid-v2-1-d147eadd09ee@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/arm/qcom,ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h index cb8ce53146f0..19598ed4679e 100644 --- a/include/dt-bindings/arm/qcom,ids.h +++ b/include/dt-bindings/arm/qcom,ids.h @@ -286,6 +286,7 @@ #define QCOM_ID_IPQ5424 651 #define QCOM_ID_QCM6690 657 #define QCOM_ID_QCS6690 658 +#define QCOM_ID_SM8850 660 #define QCOM_ID_IPQ5404 671 #define QCOM_ID_QCS9100 667 #define QCOM_ID_QCS8300 674 -- cgit v1.2.3 From 82cb5be6ad64198a3a028aeb49dcc7f6224d558a Mon Sep 17 00:00:00 2001 From: Wilfred Mallawa Date: Wed, 22 Oct 2025 10:19:36 +1000 Subject: net/tls: support setting the maximum payload size During a handshake, an endpoint may specify a maximum record size limit. Currently, the kernel defaults to TLS_MAX_PAYLOAD_SIZE (16KB) for the maximum record size. Meaning that, the outgoing records from the kernel can exceed a lower size negotiated during the handshake. In such a case, the TLS endpoint must send a fatal "record_overflow" alert [1], and thus the record is discarded. Upcoming Western Digital NVMe-TCP hardware controllers implement TLS support. For these devices, supporting TLS record size negotiation is necessary because the maximum TLS record size supported by the controller is less than the default 16KB currently used by the kernel. Currently, there is no way to inform the kernel of such a limit. This patch adds support to a new setsockopt() option `TLS_TX_MAX_PAYLOAD_LEN` that allows for setting the maximum plaintext fragment size. Once set, outgoing records are no larger than the size specified. This option can be used to specify the record size limit. 
[1] https://www.rfc-editor.org/rfc/rfc8449 Signed-off-by: Wilfred Mallawa Reviewed-by: Sabrina Dubroca Link: https://patch.msgid.link/20251022001937.20155-1-wilfred.opensource@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tls.h | 3 +++ include/uapi/linux/tls.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/tls.h b/include/net/tls.h index 857340338b69..f2af113728aa 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -53,6 +53,8 @@ struct tls_rec; /* Maximum data size carried in a TLS record */ #define TLS_MAX_PAYLOAD_SIZE ((size_t)1 << 14) +/* Minimum record size limit as per RFC8449 */ +#define TLS_MIN_RECORD_SIZE_LIM ((size_t)1 << 6) #define TLS_HEADER_SIZE 5 #define TLS_NONCE_OFFSET TLS_HEADER_SIZE @@ -226,6 +228,7 @@ struct tls_context { u8 rx_conf:3; u8 zerocopy_sendfile:1; u8 rx_no_pad:1; + u16 tx_max_payload_len; int (*push_pending_record)(struct sock *sk, int flags); void (*sk_write_space)(struct sock *sk); diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index b66a800389cc..b8b9c42f848c 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -41,6 +41,7 @@ #define TLS_RX 2 /* Set receive parameters */ #define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ #define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */ +#define TLS_TX_MAX_PAYLOAD_LEN 5 /* Maximum plaintext size */ /* Supported versions */ #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) @@ -194,6 +195,7 @@ enum { TLS_INFO_RXCONF, TLS_INFO_ZC_RO_TX, TLS_INFO_RX_NO_PAD, + TLS_INFO_TX_MAX_PAYLOAD_LEN, __TLS_INFO_MAX, }; #define TLS_INFO_MAX (__TLS_INFO_MAX - 1) -- cgit v1.2.3 From 151b98d10ef7c3174465e409b99d8762e7e8de60 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Oct 2025 23:16:53 +0000 Subject: net: Add sk_clone(). sctp_accept() will use sk_clone_lock(), but it will be called with the parent socket locked, and sctp_migrate() acquires the child lock later. 
Let's add a no-lock version of sk_clone_lock(). Note that lockdep complains if we simply use bh_lock_sock_nested(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Xin Long Link: https://patch.msgid.link/20251023231751.4168390-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 01ce231603db..c7e58b8e8a90 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1822,7 +1822,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, void sk_free(struct sock *sk); void sk_net_refcnt_upgrade(struct sock *sk); void sk_destruct(struct sock *sk); -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, bool lock); + +static inline struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +{ + return sk_clone(sk, priority, true); +} struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); -- cgit v1.2.3 From c49ed521f1772ca9203d22a1e5950f337fd5f930 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Oct 2025 23:16:55 +0000 Subject: sctp: Remove sctp_pf.create_accept_sk(). sctp_v[46]_create_accept_sk() are no longer used. Let's remove sctp_pf.create_accept_sk().
Signed-off-by: Kuniyuki Iwashima Acked-by: Xin Long Link: https://patch.msgid.link/20251023231751.4168390-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sctp/structs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 2ae390219efd..3dd304e411d0 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -497,9 +497,6 @@ struct sctp_pf { int (*bind_verify) (struct sctp_sock *, union sctp_addr *); int (*send_verify) (struct sctp_sock *, union sctp_addr *); int (*supported_addrs)(const struct sctp_sock *, __be16 *); - struct sock *(*create_accept_sk) (struct sock *sk, - struct sctp_association *asoc, - bool kern); int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); -- cgit v1.2.3 From 71068e2e1b6bd78f5599e5bc89e125a75149884b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 23 Oct 2025 23:16:57 +0000 Subject: sctp: Remove sctp_copy_sock() and sctp_copy_descendant(). Now, sctp_accept() and sctp_do_peeloff() use sk_clone(), and we no longer need sctp_copy_sock() and sctp_copy_descendant(). Let's remove them. 
Signed-off-by: Kuniyuki Iwashima Acked-by: Xin Long Link: https://patch.msgid.link/20251023231751.4168390-9-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_sock.h | 8 -------- include/net/sctp/sctp.h | 3 +-- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index b6ec08072533..ac1c75975908 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -355,14 +355,6 @@ static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) #define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk) -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} - int inet_sk_rebuild_header(struct sock *sk); /** diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index e96d1bd087f6..bb4b80c12541 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -94,8 +94,7 @@ void sctp_data_ready(struct sock *sk); __poll_t sctp_poll(struct file *file, struct socket *sock, poll_table *wait); void sctp_sock_rfree(struct sk_buff *skb); -void sctp_copy_sock(struct sock *newsk, struct sock *sk, - struct sctp_association *asoc); + extern struct percpu_counter sctp_sockets_allocated; int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *); struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int *); -- cgit v1.2.3 From 6f147c8328e045de3a35155ca7c883d88da9e916 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 24 Oct 2025 10:51:45 +0800 Subject: net/sched: Remove unused typedef psched_tdiff_t Since commit 051d44209842 ("net/sched: Retire CBQ qdisc") this is not used anymore. 
Signed-off-by: Yue Haibing Link: https://patch.msgid.link/20251024025145.4069583-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- include/net/pkt_sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index c660ac871083..4678db45832a 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -43,7 +43,6 @@ struct qdisc_walker { */ typedef u64 psched_time_t; -typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 -- cgit v1.2.3 From feeaf1346f80ffb181b6f9b739628103aa73b067 Mon Sep 17 00:00:00 2001 From: Xu Kuohai Date: Sat, 18 Oct 2025 11:57:36 +0800 Subject: bpf: Add overwrite mode for BPF ring buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the BPF ring buffer is full, a new event cannot be recorded until one or more old events are consumed to make enough space for it. In cases such as fault diagnostics, where recent events are more useful than older ones, this mechanism may lead to critical events being lost. So add overwrite mode for BPF ring buffer to address it. In this mode, the new event overwrites the oldest event when the buffer is full. The basic idea is as follows: 1. producer_pos tracks the next position to record new event. When there is enough free space, producer_pos is simply advanced by producer to make space for the new event. 2. To avoid waiting for consumer when the buffer is full, a new variable, overwrite_pos, is introduced for producer. It points to the oldest event committed in the buffer. It is advanced by producer to discard one or more oldest events to make space for the new event when the buffer is full. 3. pending_pos tracks the oldest event to be committed. pending_pos is never passed by producer_pos, so multiple producers never write to the same position at the same time. The following example diagrams show how it works in a 4096-byte ring buffer. 1. 
At first, {producer,overwrite,pending,consumer}_pos are all set to 0. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | +-----------------------------------------------------------------------+ ^ | | producer_pos = 0 overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 2. Now reserve a 512-byte event A. There is enough free space, so A is allocated at offset 0. And producer_pos is advanced to 512, the end of A. Since A is not submitted, the BUSY bit is set. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | A | | | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ | | | | | producer_pos = 512 | overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 3. Reserve event B, size 1024. B is allocated at offset 512 with BUSY bit set, and producer_pos is advanced to the end of B. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | A | B | | | [BUSY] | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ | | | | | producer_pos = 1536 | overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 4. Reserve event C, size 2048. C is allocated at offset 1536, and producer_pos is advanced to 3584. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | A | B | C | | | [BUSY] | [BUSY] | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ | | | | | producer_pos = 3584 | overwrite_pos = 0 pending_pos = 0 consumer_pos = 0 5. Submit event A. The BUSY bit of A is cleared. B becomes the oldest event to be committed, so pending_pos is advanced to 512, the start of B. 
0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | A | B | C | | | | [BUSY] | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | | | pending_pos = 512 producer_pos = 3584 | overwrite_pos = 0 consumer_pos = 0 6. Submit event B. The BUSY bit of B is cleared, and pending_pos is advanced to the start of C, which is now the oldest event to be committed. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | A | B | C | | | | | [BUSY] | | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | | | pending_pos = 1536 producer_pos = 3584 | overwrite_pos = 0 consumer_pos = 0 7. Reserve event D, size 1536 (3 * 512). There are 2048 bytes not being written between producer_pos (currently 3584) and pending_pos, so D is allocated at offset 3584, and producer_pos is advanced by 1536 (from 3584 to 5120). Since event D will overwrite all bytes of event A and the first 512 bytes of event B, overwrite_pos is advanced to the start of event C, the oldest event that is not overwritten. 0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | D End | | C | D Begin| | [BUSY] | | [BUSY] | [BUSY] | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | pending_pos = 1536 | | overwrite_pos = 1536 | | | producer_pos=5120 | consumer_pos = 0 8. Reserve event E, size 1024. Although there are 512 bytes not being written between producer_pos and pending_pos, E cannot be reserved, as it would overwrite the first 512 bytes of event C, which is still being written. 9. Submit event C and D. pending_pos is advanced to the end of D. 
0 512 1024 1536 2048 2560 3072 3584 4096 +-----------------------------------------------------------------------+ | | | | | | D End | | C | D Begin| | | | | | +-----------------------------------------------------------------------+ ^ ^ ^ | | | | | overwrite_pos = 1536 | | | producer_pos=5120 | pending_pos=5120 | consumer_pos = 0 The performance data for overwrite mode will be provided in a follow-up patch that adds overwrite-mode benchmarks. A sample of performance data for non-overwrite mode, collected on an x86_64 CPU and an arm64 CPU, before and after this patch, is shown below. As we can see, no obvious performance regression occurs. - x86_64 (AMD EPYC 9654) Before: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.623 ± 0.027M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 15.812 ± 0.014M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 7.871 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 6.703 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 2.896 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 2.054 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 1.864 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 1.580 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 1.484 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 1.369 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 1.316 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 1.272 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 1.239 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 1.226 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 1.213 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 1.193 ± 0.001M/s (drops 0.000 ± 0.000M/s) After: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.845 ± 0.036M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 15.889 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 8.155 ± 0.002M/s 
(drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 6.708 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 2.918 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 2.065 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 1.870 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 1.582 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 1.482 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 1.372 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 1.323 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 1.264 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 1.236 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 1.209 ± 0.002M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 1.189 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 1.165 ± 0.002M/s (drops 0.000 ± 0.000M/s) - arm64 (HiSilicon Kunpeng 920) Before: Ringbuf, multi-producer contention ================================== rb-libbpf nr_prod 1 11.310 ± 0.623M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 9.947 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 6.634 ± 0.011M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 4.502 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 3.888 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 3.372 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 3.189 ± 0.010M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 2.998 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 3.086 ± 0.018M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 2.845 ± 0.004M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 2.815 ± 0.008M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 2.771 ± 0.009M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 2.814 ± 0.011M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 2.752 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 2.695 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 2.710 ± 0.006M/s (drops 0.000 ± 0.000M/s) After: Ringbuf, multi-producer contention 
================================== rb-libbpf nr_prod 1 11.283 ± 0.550M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 2 9.993 ± 0.003M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 3 6.898 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 4 5.257 ± 0.001M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 8 3.830 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 12 3.528 ± 0.013M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 16 3.265 ± 0.018M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 20 2.990 ± 0.007M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 24 2.929 ± 0.014M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 28 2.898 ± 0.010M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 32 2.818 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 36 2.789 ± 0.012M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 40 2.770 ± 0.006M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 44 2.651 ± 0.007M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 48 2.669 ± 0.005M/s (drops 0.000 ± 0.000M/s) rb-libbpf nr_prod 52 2.695 ± 0.009M/s (drops 0.000 ± 0.000M/s) Signed-off-by: Xu Kuohai Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20251018035738.4039621-2-xukuohai@huaweicloud.com --- include/uapi/linux/bpf.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 77edd0253989..1d73f165394d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1430,6 +1430,9 @@ enum { /* Do not translate kernel bpf_arena pointers to user pointers */ BPF_F_NO_USER_CONV = (1U << 18), + +/* Enable BPF ringbuf overwrite mode */ + BPF_F_RB_OVERWRITE = (1U << 19), }; /* Flags for BPF_PROG_QUERY. 
*/ @@ -6231,6 +6234,7 @@ enum { BPF_RB_RING_SIZE = 1, BPF_RB_CONS_POS = 2, BPF_RB_PROD_POS = 3, + BPF_RB_OVERWRITE_POS = 4, }; /* BPF ring buffer constants */ -- cgit v1.2.3 From b94d45b6bbb42571ec225d3be0e7457c8765a5b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Oct 2025 09:56:38 +0100 Subject: seqlock: Allow KASAN to fail optimizing Some KASAN builds are failing to properly optimize this code -- luckily we don't care about code quality for KASAN builds, so just exclude it. Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Closes: https://lore.kernel.org/oe-kbuild-all/202510251641.idrNXhv5-lkp@intel.com/ --- include/linux/seqlock.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index b7bcc4111e90..a8a8661839b6 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1234,11 +1234,14 @@ static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) extern void __scoped_seqlock_invalid_target(void); -#if defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000 +#if (defined(CONFIG_CC_IS_GCC) && CONFIG_GCC_VERSION < 90000) || defined(CONFIG_KASAN) /* * For some reason some GCC-8 architectures (nios2, alpha) have trouble * determining that the ss_done state is impossible in __scoped_seqlock_next() * below. + * + * Similarly KASAN is known to confuse compilers enough to break this. But we + * don't care about code quality for KASAN builds anyway. */ static inline void __scoped_seqlock_bug(void) { } #else -- cgit v1.2.3 From 90a18c512884adb49ddc2fb30e94594169aae808 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Thu, 23 Oct 2025 15:26:50 +0200 Subject: pinctrl: pinconf-generic: Handle string values for generic properties Allow a generic pinconf property to specify its argument as one of the strings in a match list.
Convert the matching string to an integer value using the index in the list, then keep using this value in the generic pinconf code. Signed-off-by: Antonio Borneo Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index d9245ecec71d..f82add5d3302 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -181,21 +181,28 @@ static inline unsigned long pinconf_to_config_packed(enum pin_config_param param return PIN_CONF_PACKED(param, argument); } -#define PCONFDUMP(a, b, c, d) { \ - .param = a, .display = b, .format = c, .has_arg = d \ +#define PCONFDUMP_WITH_VALUES(a, b, c, d, e, f) { \ + .param = a, .display = b, .format = c, .has_arg = d, \ + .values = e, .num_values = f \ } +#define PCONFDUMP(a, b, c, d) PCONFDUMP_WITH_VALUES(a, b, c, d, NULL, 0) + struct pin_config_item { const enum pin_config_param param; const char * const display; const char * const format; bool has_arg; + const char * const *values; + size_t num_values; }; struct pinconf_generic_params { const char * const property; enum pin_config_param param; u32 default_value; + const char * const *values; + size_t num_values; }; int pinconf_generic_dt_subnode_to_map(struct pinctrl_dev *pctldev, -- cgit v1.2.3 From 55c7f5ef904fc2dcc7ef5945c5efb0cd60b46d32 Mon Sep 17 00:00:00 2001 From: Antonio Borneo Date: Thu, 23 Oct 2025 15:26:51 +0200 Subject: pinctrl: pinconf-generic: Add properties 'skew-delay-{in,out}put-ps' Add the properties 'skew-delay-input-ps' and 'skew-delay-output-ps' to the generic parameters used for parsing DT files. This allows to specify the independent skew delay value for the two directions. This enables drivers that use the generic pin configuration to get the value passed through these new properties. 
Signed-off-by: Antonio Borneo Signed-off-by: Linus Walleij --- include/linux/pinctrl/pinconf-generic.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index f82add5d3302..1be4032071c2 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -112,6 +112,12 @@ struct pinctrl_map; * or latch delay (on outputs) this parameter (in a custom format) * specifies the clock skew or latch delay. It typically controls how * many double inverters are put in front of the line. + * @PIN_CONFIG_SKEW_DELAY_INPUT_PS: if the pin has independent values for the + * programmable skew rate (on inputs) and latch delay (on outputs), then + * this parameter specifies the clock skew only. The argument is in ps. + * @PIN_CONFIG_SKEW_DELAY_OUTPUT_PS: if the pin has independent values for the + * programmable skew rate (on inputs) and latch delay (on outputs), then + * this parameter specifies the latch delay only. The argument is in ps. * @PIN_CONFIG_SLEEP_HARDWARE_STATE: indicate this is sleep related state. * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to * this parameter (on a custom format) tells the driver which alternative @@ -147,6 +153,8 @@ enum pin_config_param { PIN_CONFIG_PERSIST_STATE, PIN_CONFIG_POWER_SOURCE, PIN_CONFIG_SKEW_DELAY, + PIN_CONFIG_SKEW_DELAY_INPUT_PS, + PIN_CONFIG_SKEW_DELAY_OUTPUT_PS, PIN_CONFIG_SLEEP_HARDWARE_STATE, PIN_CONFIG_SLEW_RATE, PIN_CONFIG_END = 0x7F, -- cgit v1.2.3 From 7718f2a8b87af7363d60819ac0ac0da8b2f8ff00 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 23 Oct 2025 12:16:57 +0300 Subject: net/mlx5: Add software system image GUID infrastructure Replace direct hardware system image GUID usage with a new software system image GUID function that supports variable-length identifiers. Key changes: - Add mlx5_query_nic_sw_system_image_guid() function with length parameter.
- Update all callsites to use the new function and buffer/length approach. - Modify mapping contexts to use byte arrays instead of u64 keys. - Update devcom matching to support variable-length keys. - Change mlx5_same_hw_devs() to use buffer comparison instead of u64. This refactoring prepares the infrastructure for balance ID support, which requires extending the system image GUID with additional data. The change maintains backward compatibility while enabling future enhancements. Signed-off-by: Mark Bloch Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761211020-925651-3-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/mlx5/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5405ca1038f9..dcf262aa9ea6 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1379,4 +1379,7 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) { return devlink_net(priv_to_devlink(dev)); } + +#define MLX5_SW_IMAGE_GUID_MAX_BYTES 8 + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From 20d78ead947783b039b02ca4b8c551b4d1894759 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 23 Oct 2025 12:17:00 +0300 Subject: net/mlx5: Add balance ID support for LAG multiplane groups Implement balance ID support for multiplane LAG configurations. This feature enables per-multiplane group load balancing by extending the software system image GUID with a balance ID component. Key implementations: - Enable lag_per_mp_group capability when supported by hardware. - Append load_balance_id to software system image GUID when conditions are met. - Increase MLX5_SW_IMAGE_GUID_MAX_BYTES from 8 to 9 to accommodate the extra byte. 
The balance ID is appended to the system image GUID only when both load_balance_id and lag_per_mp_group capabilities are available, ensuring backward compatibility while enabling enhanced LAG functionality. This enhancement allows for more granular load balancing control in complex multi-plane LAG deployments, improving network performance and flexibility. Signed-off-by: Mark Bloch Reviewed-by: Moshe Shemesh Reviewed-by: Shay Drori Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1761211020-925651-6-git-send-email-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- include/linux/mlx5/driver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index dcf262aa9ea6..046396269ccf 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1380,6 +1380,6 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) return devlink_net(priv_to_devlink(dev)); } -#define MLX5_SW_IMAGE_GUID_MAX_BYTES 8 +#define MLX5_SW_IMAGE_GUID_MAX_BYTES 9 #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From baeb66fbd4201d1c4325074e78b1f557dff89b5b Mon Sep 17 00:00:00 2001 From: Jimmy Hu Date: Thu, 23 Oct 2025 05:49:45 +0000 Subject: usb: gadget: udc: fix use-after-free in usb_gadget_state_work A race condition during gadget teardown can lead to a use-after-free in usb_gadget_state_work(), as reported by KASAN: BUG: KASAN: invalid-access in sysfs_notify+0x2c/0xd0 Workqueue: events usb_gadget_state_work The fundamental race occurs because a concurrent event (e.g., an interrupt) can call usb_gadget_set_state() and schedule gadget->work at any time during the cleanup process in usb_del_gadget(). Commit 399a45e5237c ("usb: gadget: core: flush gadget workqueue after device removal") attempted to fix this by moving flush_work() to after device_del(). 
However, this does not fully solve the race, as a new work item can still be scheduled *after* flush_work() completes but before the gadget's memory is freed, leading to the same use-after-free. This patch fixes the race condition robustly by introducing a 'teardown' flag and a 'state_lock' spinlock to the usb_gadget struct. The flag is set during cleanup in usb_del_gadget() *before* calling flush_work() to prevent any new work from being scheduled once cleanup has commenced. The scheduling site, usb_gadget_set_state(), now checks this flag under the lock before queueing the work, thus safely closing the race window. Fixes: 5702f75375aa9 ("usb: gadget: udc-core: move sysfs_notify() to a workqueue") Cc: stable Signed-off-by: Jimmy Hu Link: https://patch.msgid.link/20251023054945.233861-1-hhhuuu@google.com Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/gadget.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 3aaf19e77558..8285b19a25e0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -376,6 +376,9 @@ struct usb_gadget_ops { * can handle. The UDC must support this and all slower speeds and lower * number of lanes. * @state: the state we are now (attached, suspended, configured, etc) + * @state_lock: Spinlock protecting the `state` and `teardown` members. + * @teardown: True if the device is undergoing teardown, used to prevent + * new work from being scheduled during cleanup. * @name: Identifies the controller hardware type. Used in diagnostics * and sometimes configuration. * @dev: Driver model state for this abstract device. 
@@ -451,6 +454,8 @@ struct usb_gadget { enum usb_ssp_rate max_ssp_rate; enum usb_device_state state; + spinlock_t state_lock; + bool teardown; const char *name; struct device dev; unsigned isoch_delay; -- cgit v1.2.3 From d1e6d2773898c7a1c19e12619d303920d32a9cd0 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 10 Oct 2025 17:38:13 +0200 Subject: rcu: Add a small-width RCU watching counter debug option A later commit will reduce the size of the RCU watching counter to free up some bits for another purpose. Paul suggested adding a config option to test the extreme case where the counter is reduced to its minimum usable width for rcutorture to poke at, so do that. Make it only configurable under RCU_EXPERT. While at it, add a comment to explain the layout of context_tracking->state. Link: http://lore.kernel.org/r/4c2cb573-168f-4806-b1d9-164e8276e66a@paulmck-laptop Suggested-by: Paul E. McKenney Signed-off-by: Valentin Schneider Reviewed-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Signed-off-by: Frederic Weisbecker --- include/linux/context_tracking_state.h | 44 ++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 7b8433d5a8ef..0b81248aa03e 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -18,12 +18,6 @@ enum ctx_state { CT_STATE_MAX = 4, }; -/* Odd value for watching, else even. 
*/ -#define CT_RCU_WATCHING CT_STATE_MAX - -#define CT_STATE_MASK (CT_STATE_MAX - 1) -#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK) - struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING_USER /* @@ -44,9 +38,45 @@ struct context_tracking { #endif }; +/* + * We cram two different things within the same atomic variable: + * + * CT_RCU_WATCHING_START CT_STATE_START + * | | + * v v + * MSB [ RCU watching counter ][ context_state ] LSB + * ^ ^ + * | | + * CT_RCU_WATCHING_END CT_STATE_END + * + * Bits are used from the LSB upwards, so unused bits (if any) will always be in + * upper bits of the variable. + */ #ifdef CONFIG_CONTEXT_TRACKING +#define CT_SIZE (sizeof(((struct context_tracking *)0)->state) * BITS_PER_BYTE) + +#define CT_STATE_WIDTH bits_per(CT_STATE_MAX - 1) +#define CT_STATE_START 0 +#define CT_STATE_END (CT_STATE_START + CT_STATE_WIDTH - 1) + +#define CT_RCU_WATCHING_MAX_WIDTH (CT_SIZE - CT_STATE_WIDTH) +#define CT_RCU_WATCHING_WIDTH (IS_ENABLED(CONFIG_RCU_DYNTICKS_TORTURE) ? 
2 : CT_RCU_WATCHING_MAX_WIDTH) +#define CT_RCU_WATCHING_START (CT_STATE_END + 1) +#define CT_RCU_WATCHING_END (CT_RCU_WATCHING_START + CT_RCU_WATCHING_WIDTH - 1) +#define CT_RCU_WATCHING BIT(CT_RCU_WATCHING_START) + +#define CT_STATE_MASK GENMASK(CT_STATE_END, CT_STATE_START) +#define CT_RCU_WATCHING_MASK GENMASK(CT_RCU_WATCHING_END, CT_RCU_WATCHING_START) + +#define CT_UNUSED_WIDTH (CT_RCU_WATCHING_MAX_WIDTH - CT_RCU_WATCHING_WIDTH) + +static_assert(CT_STATE_WIDTH + + CT_RCU_WATCHING_WIDTH + + CT_UNUSED_WIDTH == + CT_SIZE); + DECLARE_PER_CPU(struct context_tracking, context_tracking); -#endif +#endif /* CONFIG_CONTEXT_TRACKING */ #ifdef CONFIG_CONTEXT_TRACKING_USER static __always_inline int __ct_state(void) -- cgit v1.2.3 From bcce8c74f1ce1e2731ac0261287897e3768767d8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 24 Oct 2025 15:46:21 -0700 Subject: PCI: Enable host bridge emulation for PCI_DOMAINS_GENERIC platforms The ability to emulate a host bridge is useful not only for hardware PCI controllers like CONFIG_VMD, or virtual PCI controllers like CONFIG_PCI_HYPERV, but also for test and development scenarios like CONFIG_SAMPLES_DEVSEC [1]. One stumbling block for defining CONFIG_SAMPLES_DEVSEC, a sample implementation of a platform TSM for PCI Device Security, is the need to accommodate PCI_DOMAINS_GENERIC architectures alongside x86 [2]. In support of supplementing the existing CONFIG_PCI_BRIDGE_EMUL infrastructure for host bridges: * Introduce pci_bus_find_emul_domain_nr() as a common way to find a free PCI domain number whether that is to reuse the existing dynamic allocation code in the !ACPI case, or to assign an unused domain above the last ACPI segment. * Convert pci-hyperv to the new allocator so that the PCI core can unconditionally assume that bridge->domain_nr != PCI_DOMAIN_NR_NOT_SET is the dynamically allocated case. A follow on patch can also convert vmd to the new scheme. 
Currently vmd is limited to CONFIG_PCI_DOMAINS_GENERIC=n (x86) so, unlike pci-hyperv, it does not immediately conflict with this new pci_bus_find_emul_domain_nr() mechanism. Link: http://lore.kernel.org/174107249038.1288555.12362100502109498455.stgit@dwillia2-xfh.jf.intel.com [1] Reported-by: Suzuki K Poulose Closes: http://lore.kernel.org/20250311144601.145736-3-suzuki.poulose@arm.com [2] Signed-off-by: Dan Williams Signed-off-by: Bjorn Helgaas Tested-by: Suzuki K Poulose Tested-by: Michael Kelley Reviewed-by: Michael Kelley Cc: Lorenzo Pieralisi Cc: Manivannan Sadhasivam Cc: Rob Herring Cc: K. Y. Srinivasan Cc: Haiyang Zhang Cc: Wei Liu Cc: Dexuan Cui Link: https://patch.msgid.link/20251024224622.1470555-2-dan.j.williams@intel.com --- include/linux/pci.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..1ef1535802b0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1956,10 +1956,17 @@ DEFINE_GUARD(pci_dev, struct pci_dev *, pci_dev_lock(_T), pci_dev_unlock(_T)) */ #ifdef CONFIG_PCI_DOMAINS extern int pci_domains_supported; +int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max); +void pci_bus_release_emul_domain_nr(int domain_nr); #else enum { pci_domains_supported = 0 }; static inline int pci_domain_nr(struct pci_bus *bus) { return 0; } static inline int pci_proc_domain(struct pci_bus *bus) { return 0; } +static inline int pci_bus_find_emul_domain_nr(u32 hint, u32 min, u32 max) +{ + return 0; +} +static inline void pci_bus_release_emul_domain_nr(int domain_nr) { } #endif /* CONFIG_PCI_DOMAINS */ /* -- cgit v1.2.3 From a544d9a66bdf20eb25cc40f99e1d09c825b71b26 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:16 -0400 Subject: tracing: Have syscall trace events read user space string As of commit 654ced4a1377 ("tracing: Introduce tracepoint_is_faultable()") system call trace events allow faulting in user space memory. 
Have some of the system call trace events take advantage of this. Use the trace_user_fault_read() logic to read the user space buffer from user space and instead of just saving the pointer to the buffer in the system call event, also save the string that is passed in. The syscall event has its nb_args shortened from an int to a short (where even u8 is plenty big enough) and the freed two bytes are used for "user_mask". The new "user_mask" field is used to store the index of the "args" field array that has the address to read from user space. This value is set to 0 if the system call event does not need to read user space for a field. This mask can be used to know if the event may fault or not. Only one bit set in user_mask is supported at this time. This allows the output to look like this: sys_access(filename: 0x7f8c55368470 "/etc/ld.so.preload", mode: 4) sys_execve(filename: 0x564ebcf5a6b8 "/usr/bin/emacs", argv: 0x7fff357c0300, envp: 0x564ebc4a4820) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.261867956@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/trace/syscall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 8e193f3a33b3..85f21ca15a41 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -16,6 +16,7 @@ * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes + * @user_mask: mask of @args that will read user space * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) * @enter_fields: list of fields for syscall_enter trace event @@ -25,7 +26,8 @@ struct
syscall_metadata { const char *name; int syscall_nr; - int nb_args; + short nb_args; + short user_mask; const char **types; const char **args; struct list_head enter_fields; -- cgit v1.2.3 From b4f7624cfc9422209b844793521c60edb289fb69 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:19 -0400 Subject: tracing: Have system call events record user array data For system call events that have a length field, add a "user_arg_size" parameter to the system call meta data that denotes the index of the args array that holds the size of arg that the user_mask field has a bit set for. The "user_mask" has a bit set that denotes the arg that points to an array in the user space address space and if a system call event has the user_mask field set and the user_arg_size set, it will then record the content of that address into the trace event, up to the size defined by SYSCALL_FAULT_BUF_SZ - 1. This allows the output to look like: sys_write(fd: 0xa, buf: 0x5646978d13c0 (01:00:05:00:00:00:00:00:01:87:55:89:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00), count: 0x20) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.763528474@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/trace/syscall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 85f21ca15a41..9413c139da66 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -16,6 +16,7 @@ * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes + * @user_arg_size: holds @arg that has size of the user space to read * @user_mask: mask of @args that will read 
user space * @types: list of types as strings * @args: list of args as strings (args[i] matches types[i]) @@ -26,7 +27,8 @@ struct syscall_metadata { const char *name; int syscall_nr; - short nb_args; + u8 nb_args; + s8 user_arg_size; short user_mask; const char **types; const char **args; -- cgit v1.2.3 From 011ea0501daaba36c06910fd383cf7428ea45844 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:20 -0400 Subject: tracing: Display some syscall arrays as strings Some of the system calls that read a fixed length of memory from the user space address are not arrays but strings. Take a bit away from the nb_args field in the syscall meta data to use as a flag to denote that the system call's user_arg_size is being used as a string. The nb_args should never be more than 6, so 7 bits is plenty to hold that number. When set, the user_arg_is_str flag will display the data array from the user space address as a string and not an array. This will allow the output to look like this: sys_sethostname(name: 0x5584310eb2a0 "debian", len: 6) Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231147.930550359@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/trace/syscall.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 9413c139da66..0dd7f2b33431 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -16,6 +16,7 @@ * @name: name of the syscall * @syscall_nr: number of the syscall * @nb_args: number of parameters it takes + * @user_arg_is_str: set if the arg for @user_arg_size is a string * @user_arg_size: holds @arg that has size of the user space to read *
@user_mask: mask of @args that will read user space * @types: list of types as strings @@ -27,7 +28,8 @@ struct syscall_metadata { const char *name; int syscall_nr; - u8 nb_args; + u8 nb_args:7; + u8 user_arg_is_str:1; s8 user_arg_size; short user_mask; const char **types; -- cgit v1.2.3 From 32e0f607ac6a2bb5d144540897535fd01be77586 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Oct 2025 19:11:24 -0400 Subject: tracing: Add trace_seq_pop() and seq_buf_pop() In order to allow an interface to remove an added character from the trace_seq and seq_buf descriptors, add helper functions trace_seq_pop() and seq_buf_pop(). Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Peter Zijlstra Cc: Namhyung Kim Cc: Takaya Saeki Cc: Tom Zanussi Cc: Thomas Gleixner Cc: Ian Rogers Cc: Douglas Raillard Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Adrian Hunter Cc: Ingo Molnar Link: https://lore.kernel.org/20251028231148.594898736@kernel.org Signed-off-by: Steven Rostedt (Google) --- include/linux/seq_buf.h | 17 +++++++++++++++++ include/linux/trace_seq.h | 13 +++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 52791e070506..9f2839e73f8a 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -149,6 +149,23 @@ static inline void seq_buf_commit(struct seq_buf *s, int num) } } +/** + * seq_buf_pop - pop off the last written character + * @s: the seq_buf handle + * + * Removes the last written character to the seq_buf @s. + * + * Returns the last character or -1 if it is empty. 
+ */ +static inline int seq_buf_pop(struct seq_buf *s) +{ + if (!s->len) + return -1; + + s->len--; + return (unsigned int)s->buffer[s->len]; +} + extern __printf(2, 3) int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); extern __printf(2, 0) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 557780fe1c77..4a0b8c172d27 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -80,6 +80,19 @@ static inline bool trace_seq_has_overflowed(struct trace_seq *s) return s->full || seq_buf_has_overflowed(&s->seq); } +/** + * trace_seq_pop - pop off the last written character + * @s: trace sequence descriptor + * + * Removes the last written character to the trace_seq @s. + * + * Returns the last character or -1 if it is empty. + */ +static inline int trace_seq_pop(struct trace_seq *s) +{ + return seq_buf_pop(&s->seq); +} + /* * Currently only defined when tracing is enabled. */ -- cgit v1.2.3 From c72568c21b97dbc48d02b769f4eec6667ad13d5a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Oct 2025 09:12:40 +0000 Subject: net: rps: softnet_data reorg to make enqueue_to_backlog() fast enqueue_to_backlog() is showing up in kernel profiles on hosts with many cores, when RFS/RPS is used. The following softnet_data fields need to be updated: - input_queue_tail - input_pkt_queue (next, prev, qlen, lock) - backlog.state (if input_pkt_queue was empty) Unfortunately they are currently using two cache lines: /* --- cacheline 3 boundary (192 bytes) --- */ call_single_data_t csd __attribute__((__aligned__(64))); /* 0xc0 0x20 */ struct softnet_data * rps_ipi_next; /* 0xe0 0x8 */ unsigned int cpu; /* 0xe8 0x4 */ unsigned int input_queue_tail; /* 0xec 0x4 */ struct sk_buff_head input_pkt_queue; /* 0xf0 0x18 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ struct napi_struct backlog __attribute__((__aligned__(8))); /* 0x108 0x1f0 */ Add one ____cacheline_aligned_in_smp to make sure they now are using a single cache line. 
Also, because napi_struct has written fields, make @state its first field. We want to make sure that cpus adding packets to sd->input_pkt_queue are not slowing down cpus processing their backlog because of false sharing. After this patch new layout is: /* --- cacheline 5 boundary (320 bytes) --- */ long int pad[3] __attribute__((__aligned__(64))); /* 0x140 0x18 */ unsigned int input_queue_tail; /* 0x158 0x4 */ /* XXX 4 bytes hole, try to pack */ struct sk_buff_head input_pkt_queue; /* 0x160 0x18 */ struct napi_struct backlog __attribute__((__aligned__(8))); /* 0x178 0x1f0 */ Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251024091240.3292546-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7f5aad5cc9a1..9c1e5042c5e7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -377,6 +377,8 @@ struct napi_config { * Structure for NAPI scheduling similar to tasklet but with weighting */ struct napi_struct { + /* This field should be first or softnet_data.backlog needs tweaks. */ + unsigned long state; /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means * whoever atomically sets that bit can add this napi_struct @@ -385,7 +387,6 @@ struct napi_struct { */ struct list_head poll_list; - unsigned long state; int weight; u32 defer_hard_irqs_count; int (*poll)(struct napi_struct *, int); @@ -3529,9 +3530,17 @@ struct softnet_data { call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; + + /* We force a cacheline alignment from here, to hold together + * input_queue_tail, input_pkt_queue and backlog.state. + * We add holes so that backlog.state is the last field + * of this cache line. 
+ */ + long pad[3] ____cacheline_aligned_in_smp; unsigned int input_queue_tail; #endif struct sk_buff_head input_pkt_queue; + struct napi_struct backlog; struct numa_drop_counters drop_counters; -- cgit v1.2.3 From 8443c3160858b860bfc2db6a8397c72c9f6b513e Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Fri, 24 Oct 2025 11:02:56 -0700 Subject: net: netmem: remove NET_IOV_MAX from net_iov_type enum Remove the NET_IOV_MAX workaround from the net_iov_type enum. This entry was previously added to force the enum size to unsigned long to satisfy the NET_IOV_ASSERT_OFFSET static assertions. After commit f3d85c9ee510 ("netmem: introduce struct netmem_desc mirroring struct page") this approach became unnecessary by placing the net_iov_type after the netmem_desc. Placing the net_iov_type after netmem_desc results in the net_iov_type size having no effect on the position or layout of the fields that mirror the struct page. The layout before this patch: struct net_iov { union { struct netmem_desc desc; /* 0 48 */ struct { long unsigned int _flags; /* 0 8 */ long unsigned int pp_magic; /* 8 8 */ struct page_pool * pp; /* 16 8 */ long unsigned int _pp_mapping_pad; /* 24 8 */ long unsigned int dma_addr; /* 32 8 */ atomic_long_t pp_ref_count; /* 40 8 */ }; /* 0 48 */ }; /* 0 48 */ struct net_iov_area * owner; /* 48 8 */ enum net_iov_type type; /* 56 8 */ /* size: 64, cachelines: 1, members: 3 */ }; The layout after this patch: struct net_iov { union { struct netmem_desc desc; /* 0 48 */ struct { long unsigned int _flags; /* 0 8 */ long unsigned int pp_magic; /* 8 8 */ struct page_pool * pp; /* 16 8 */ long unsigned int _pp_mapping_pad; /* 24 8 */ long unsigned int dma_addr; /* 32 8 */ atomic_long_t pp_ref_count; /* 40 8 */ }; /* 0 48 */ }; /* 0 48 */ struct net_iov_area * owner; /* 48 8 */ enum net_iov_type type; /* 56 4 */ /* size: 64, cachelines: 1, members: 3 */ /* padding: 4 */ }; Signed-off-by: Bobby Eshleman Reviewed-by: Mina Almasry Link: 
https://patch.msgid.link/20251024-b4-devmem-remove-niov-max-v1-1-ba72c68bc869@meta.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/netmem.h b/include/net/netmem.h index 651e2c62d1dd..9e10f4ac50c3 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -68,10 +68,6 @@ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); enum net_iov_type { NET_IOV_DMABUF, NET_IOV_IOURING, - - /* Force size to unsigned long to make the NET_IOV_ASSERTS below pass. - */ - NET_IOV_MAX = ULONG_MAX }; /* A memory descriptor representing abstract networking I/O vectors, -- cgit v1.2.3 From 294bfe0343da3b59db040c3a4dac05b4c91ce013 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 Oct 2025 09:40:59 +0200 Subject: sctp: Constify struct sctp_sched_ops 'struct sctp_sched_ops' is not modified in these drivers. Constifying this structure moves some data to a read-only section, so increases overall security, especially when the structure holds some function pointers. 
On a x86_64, with allmodconfig, as an example: Before: ====== text data bss dec hex filename 8019 568 0 8587 218b net/sctp/stream_sched_fc.o After: ===== text data bss dec hex filename 8275 312 0 8587 218b net/sctp/stream_sched_fc.o Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/dce03527eb7b7cc8a3c26d5cdac12bafe3350135.1761377890.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/net/sctp/stream_sched.h | 4 ++-- include/net/sctp/structs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 8034bf5febbe..77806ef1cb70 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -52,10 +52,10 @@ void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch); void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch); int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp); -struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); +const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream); void sctp_sched_ops_register(enum sctp_sched_type sched, - struct sctp_sched_ops *sched_ops); + const struct sctp_sched_ops *sched_ops); void sctp_sched_ops_prio_init(void); void sctp_sched_ops_rr_init(void); void sctp_sched_ops_fc_init(void); diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 3dd304e411d0..5900196d65fd 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1073,7 +1073,7 @@ struct sctp_outq { struct list_head out_chunk_list; /* Stream scheduler being used */ - struct sctp_sched_ops *sched; + const struct sctp_sched_ops *sched; unsigned int out_qlen; /* Total length of queued data chunks. 
*/ -- cgit v1.2.3 From f74ee32963f1b74865fe679e2475450434fea51c Mon Sep 17 00:00:00 2001 From: Qinxin Xia Date: Tue, 28 Oct 2025 20:09:00 +0800 Subject: tools/dma: move dma_map_benchmark from selftests to tools/dma dma_map_benchmark is a standalone developer tool rather than an automated selftest. It has no pass/fail criteria, expects manual invocation, and is built as a normal userspace binary. Move it to tools/dma/ and add a minimal Makefile. Suggested-by: Marek Szyprowski Suggested-by: Barry Song Signed-off-by: Qinxin Xia Acked-by: Barry Song Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251028120900.2265511-3-xiaqinxin@huawei.com --- include/linux/map_benchmark.h | 32 -------------------------------- include/uapi/linux/map_benchmark.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 32 deletions(-) delete mode 100644 include/linux/map_benchmark.h create mode 100644 include/uapi/linux/map_benchmark.h (limited to 'include') diff --git a/include/linux/map_benchmark.h b/include/linux/map_benchmark.h deleted file mode 100644 index 48e2ff95332f..000000000000 --- a/include/linux/map_benchmark.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2022 HiSilicon Limited. 
- */ - -#ifndef _KERNEL_DMA_BENCHMARK_H -#define _KERNEL_DMA_BENCHMARK_H - -#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) -#define DMA_MAP_MAX_THREADS 1024 -#define DMA_MAP_MAX_SECONDS 300 -#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) - -#define DMA_MAP_BIDIRECTIONAL 0 -#define DMA_MAP_TO_DEVICE 1 -#define DMA_MAP_FROM_DEVICE 2 - -struct map_benchmark { - __u64 avg_map_100ns; /* average map latency in 100ns */ - __u64 map_stddev; /* standard deviation of map latency */ - __u64 avg_unmap_100ns; /* as above */ - __u64 unmap_stddev; - __u32 threads; /* how many threads will do map/unmap in parallel */ - __u32 seconds; /* how long the test will last */ - __s32 node; /* which numa node this benchmark will run on */ - __u32 dma_bits; /* DMA addressing capability */ - __u32 dma_dir; /* DMA data direction */ - __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ - __u8 expansion[76]; /* For future use */ -}; -#endif /* _KERNEL_DMA_BENCHMARK_H */ diff --git a/include/uapi/linux/map_benchmark.h b/include/uapi/linux/map_benchmark.h new file mode 100644 index 000000000000..c2d91088a40d --- /dev/null +++ b/include/uapi/linux/map_benchmark.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2025 HiSilicon Limited. 
+ */ + +#ifndef _UAPI_DMA_BENCHMARK_H +#define _UAPI_DMA_BENCHMARK_H + +#include + +#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) +#define DMA_MAP_MAX_THREADS 1024 +#define DMA_MAP_MAX_SECONDS 300 +#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) + +#define DMA_MAP_BIDIRECTIONAL 0 +#define DMA_MAP_TO_DEVICE 1 +#define DMA_MAP_FROM_DEVICE 2 + +struct map_benchmark { + __u64 avg_map_100ns; /* average map latency in 100ns */ + __u64 map_stddev; /* standard deviation of map latency */ + __u64 avg_unmap_100ns; /* as above */ + __u64 unmap_stddev; + __u32 threads; /* how many threads will do map/unmap in parallel */ + __u32 seconds; /* how long the test will last */ + __s32 node; /* which numa node this benchmark will run on */ + __u32 dma_bits; /* DMA addressing capability */ + __u32 dma_dir; /* DMA data direction */ + __u32 dma_trans_ns; /* time for DMA transmission in ns */ + __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ + __u8 expansion[76]; /* For future use */ +}; + +#endif /* _UAPI_DMA_BENCHMARK_H */ -- cgit v1.2.3 From ed7fc3cbb38ffdca7a189e15982ce96acab4684c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:12:47 +0300 Subject: dma-mapping: prepare dma_map_ops to conversion to physical address Add new .map_phys() and .unmap_phys() callbacks to dma_map_ops as a preparation to replace .map_page() and .unmap_page() respectively. 
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-1-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 10882d00cb17..79d2a74d4b49 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -37,6 +37,13 @@ struct dma_map_ops { void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); + + dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*unmap_phys)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); /* * map_sg should return a negative error code on error. See * dma_map_sgtable() for a list of appropriate error codes -- cgit v1.2.3 From 14cb413af00c5d3950d1a339dd2b6f01ce313fce Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:12:52 +0300 Subject: dma-mapping: remove unused mapping resource callbacks After ARM and XEN conversions to use physical addresses for the mapping, there are no in-kernel users for map_resource/unmap_resource callbacks, so remove them. 
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-6-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 79d2a74d4b49..2e98ecc313a3 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -53,12 +53,6 @@ struct dma_map_ops { enum dma_data_direction dir, unsigned long attrs); void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); - dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct device *dev, -- cgit v1.2.3 From 131971f67e258170c678fe572fda95f8cef88e66 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 15 Oct 2025 12:13:00 +0300 Subject: dma-mapping: remove unused map_page callback After conversion of arch code to use physical address mapping, there are no users of .map_page() and .unmap_page() callbacks, so let's remove them. 
Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251015-remove-map-page-v5-14-3bbfe3a25cdf@kernel.org --- include/linux/dma-map-ops.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 2e98ecc313a3..4809204c674c 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -31,13 +31,6 @@ struct dma_map_ops { void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); - dma_addr_t (*map_page)(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs); - void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); - dma_addr_t (*map_phys)(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); -- cgit v1.2.3 From c31b9d2f589463a7cb286467a618b3b598654890 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 15:44:10 +0200 Subject: unwind: Shorten lines There are some exceptionally long lines that cause ugly wrapping. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250924080118.545274393@infradead.org --- include/linux/unwind_deferred.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 26122d00708a..25f4dffebd1b 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -8,7 +8,9 @@ struct unwind_work; -typedef void (*unwind_callback_t)(struct unwind_work *work, struct unwind_stacktrace *trace, u64 cookie); +typedef void (*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); struct unwind_work { struct list_head list; @@ -68,9 +70,17 @@ static __always_inline void unwind_reset_info(void) static inline void unwind_task_init(struct task_struct *task) {} static inline void unwind_task_free(struct task_struct *task) {} -static inline int unwind_user_faultable(struct unwind_stacktrace *trace) { return -ENOSYS; } -static inline int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) { return -ENOSYS; } -static inline int unwind_deferred_request(struct unwind_work *work, u64 *timestamp) { return -ENOSYS; } +static inline int unwind_user_faultable(struct unwind_stacktrace *trace) +{ return -ENOSYS; } + +static inline int +unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) +{ return -ENOSYS; } + +static inline int +unwind_deferred_request(struct unwind_work *work, u64 *timestamp) +{ return -ENOSYS; } + static inline void unwind_deferred_cancel(struct unwind_work *work) {} static inline void unwind_deferred_task_exit(struct task_struct *task) {} -- cgit v1.2.3 From b1164c7d118defb01a885b53f56e3336db784df7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 15:44:59 +0200 Subject: unwind: Add required include files To be self sufficient, the file needs to include linux/types.h. 
This provides things like u32/u64 and struct callback_head. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250924080118.665787071@infradead.org --- include/linux/unwind_deferred_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 33b62ac25c86..29452ff49859 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -2,6 +2,8 @@ #ifndef _LINUX_UNWIND_USER_DEFERRED_TYPES_H #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H +#include <linux/types.h> + struct unwind_cache { unsigned long unwind_completed; unsigned int nr_entries; -- cgit v1.2.3 From 52a1ec718b3eb6da29a76d05a662365a997139cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 15:46:00 +0200 Subject: unwind: Simplify unwind_reset_info() Invert the condition of the first if and make it an early exit to reduce an indent level for the rest of the function. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (Google) Link: https://patch.msgid.link/20250924080118.777916262@infradead.org --- include/linux/unwind_deferred.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 25f4dffebd1b..196e12c1449e 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -46,22 +46,22 @@ void unwind_deferred_task_exit(struct task_struct *task); static __always_inline void unwind_reset_info(void) { struct unwind_task_info *info = &current->unwind_info; - unsigned long bits; + unsigned long bits = info->unwind_mask; /* Was there any unwinding? 
*/ - if (unlikely(info->unwind_mask)) { - bits = info->unwind_mask; - do { - /* Is a task_work going to run again before going back */ - if (bits & UNWIND_PENDING) - return; - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); - current->unwind_info.id.id = 0; - - if (unlikely(info->cache)) { - info->cache->nr_entries = 0; - info->cache->unwind_completed = 0; - } + if (likely(!bits)) + return; + + do { + /* Is a task_work going to run again before going back */ + if (bits & UNWIND_PENDING) + return; + } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); + current->unwind_info.id.id = 0; + + if (unlikely(info->cache)) { + info->cache->nr_entries = 0; + info->cache->unwind_completed = 0; } } -- cgit v1.2.3 From 639214f65b1db87c6992eadf93079ff0d8768c2d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Sep 2025 16:09:17 +0200 Subject: unwind: Make unwind_task_info::unwind_mask consistent The unwind_task_info::unwind_mask was manipulated using a mixture of: regular store WRITE_ONCE() try_cmpxchg() set_bit() atomic_long_*() Clean up and make it consistently atomic_long_t. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20250924080119.384384486@infradead.org --- include/linux/unwind_deferred.h | 4 ++-- include/linux/unwind_deferred_types.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index 196e12c1449e..f4743c8cff4c 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -46,7 +46,7 @@ void unwind_deferred_task_exit(struct task_struct *task); static __always_inline void unwind_reset_info(void) { struct unwind_task_info *info = &current->unwind_info; - unsigned long bits = info->unwind_mask; + unsigned long bits = atomic_long_read(&info->unwind_mask); /* Was there any unwinding? 
*/ if (likely(!bits)) @@ -56,7 +56,7 @@ static __always_inline void unwind_reset_info(void) /* Is a task_work going to run again before going back */ if (bits & UNWIND_PENDING) return; - } while (!try_cmpxchg(&info->unwind_mask, &bits, 0UL)); + } while (!atomic_long_try_cmpxchg(&info->unwind_mask, &bits, 0UL)); current->unwind_info.id.id = 0; if (unlikely(info->cache)) { diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 29452ff49859..0a4c8ddbbc57 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -3,6 +3,7 @@ #define _LINUX_UNWIND_USER_DEFERRED_TYPES_H #include +#include struct unwind_cache { unsigned long unwind_completed; @@ -32,7 +33,7 @@ union unwind_task_id { }; struct unwind_task_info { - unsigned long unwind_mask; + atomic_long_t unwind_mask; struct unwind_cache *cache; struct callback_head work; union unwind_task_id id; -- cgit v1.2.3 From c79dd946e370af3537edb854f210cba3a94b4516 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2025 13:27:34 +0200 Subject: unwind: Implement compat fp unwind It is important to be able to unwind compat tasks too. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20250924080119.613695709@infradead.org --- include/linux/unwind_user_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h index a449f15be890..938f7e623332 100644 --- a/include/linux/unwind_user_types.h +++ b/include/linux/unwind_user_types.h @@ -36,6 +36,7 @@ struct unwind_user_state { unsigned long ip; unsigned long sp; unsigned long fp; + unsigned int ws; enum unwind_user_type current_type; unsigned int available_types; bool done; -- cgit v1.2.3 From ae25884ad749e7f6e0c3565513bdc8aa2554a425 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Oct 2025 12:31:10 +0200 Subject: unwind_user/x86: Teach FP unwind about start of function When userspace is interrupted at the start of a function, before we get a chance to complete the frame, unwind will miss one caller. X86 has a uprobe specific fixup for this, add bits to the generic unwinder to support this. Suggested-by: Jens Remus Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251024145156.GM4068168@noisy.programming.kicks-ass.net --- include/linux/unwind_user_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/unwind_user_types.h b/include/linux/unwind_user_types.h index 938f7e623332..412729a269bc 100644 --- a/include/linux/unwind_user_types.h +++ b/include/linux/unwind_user_types.h @@ -39,6 +39,7 @@ struct unwind_user_state { unsigned int ws; enum unwind_user_type current_type; unsigned int available_types; + bool topmost; bool done; }; -- cgit v1.2.3 From c69993ecdd4dfde2b7da08b022052a33b203da07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Oct 2025 15:17:05 +0200 Subject: perf: Support deferred user unwind Add support for deferred userspace unwind to perf. Where perf currently relies on in-place stack unwinding; from NMI context and all that. 
This moves the userspace part of the unwind to right before the return-to-userspace. This has two distinct benefits, the biggest is that it moves the unwind to a faultable context. It becomes possible to fault in debug info (.eh_frame, SFrame etc.) that might not otherwise be readily available. And secondly, it de-duplicates the user callchain where multiple samples happen during the same kernel entry. To facilitate this the perf interface is extended with a new record type: PERF_RECORD_CALLCHAIN_DEFERRED and two new attribute flags: perf_event_attr::defer_callchain - to request the user unwind be deferred perf_event_attr::defer_output - to request PERF_RECORD_CALLCHAIN_DEFERRED records The existing PERF_RECORD_SAMPLE callchain section gets a new context type: PERF_CONTEXT_USER_DEFERRED After which will come a single entry, denoting the 'cookie' of the deferred callchain that should be attached here, matching the 'cookie' field of the above mentioned PERF_RECORD_CALLCHAIN_DEFERRED. The 'defer_callchain' flag is expected on all events with PERF_SAMPLE_CALLCHAIN. The 'defer_output' flag is expected on the event responsible for collecting side-band events (like mmap, comm etc.). Setting 'defer_output' on multiple events will get you duplicated PERF_RECORD_CALLCHAIN_DEFERRED records. Based on earlier patches by Josh and Steven. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251023150002.GR4067720@noisy.programming.kicks-ass.net --- include/linux/perf_event.h | 2 +- include/linux/unwind_deferred.h | 12 ------------ include/linux/unwind_deferred_types.h | 13 +++++++++++++ include/uapi/linux/perf_event.h | 21 ++++++++++++++++++++- 4 files changed, 34 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fd1d91017b99..9870d768db4c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1720,7 +1720,7 @@ extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct p extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs); extern struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark); + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie); extern int get_callchain_buffers(int max_stack); extern void put_callchain_buffers(void); extern struct perf_callchain_entry *get_callchain_entry(int *rctx); diff --git a/include/linux/unwind_deferred.h b/include/linux/unwind_deferred.h index f4743c8cff4c..bc7ae7d21900 100644 --- a/include/linux/unwind_deferred.h +++ b/include/linux/unwind_deferred.h @@ -6,18 +6,6 @@ #include #include -struct unwind_work; - -typedef void (*unwind_callback_t)(struct unwind_work *work, - struct unwind_stacktrace *trace, - u64 cookie); - -struct unwind_work { - struct list_head list; - unwind_callback_t func; - int bit; -}; - #ifdef CONFIG_UNWIND_USER enum { diff --git a/include/linux/unwind_deferred_types.h b/include/linux/unwind_deferred_types.h index 0a4c8ddbbc57..18fa3932f61c 100644 --- a/include/linux/unwind_deferred_types.h +++ b/include/linux/unwind_deferred_types.h @@ -39,4 +39,17 @@ struct unwind_task_info { union unwind_task_id id; }; +struct unwind_work; +struct unwind_stacktrace; + +typedef void 
(*unwind_callback_t)(struct unwind_work *work, + struct unwind_stacktrace *trace, + u64 cookie); + +struct unwind_work { + struct list_head list; + unwind_callback_t func; + int bit; +}; + #endif /* _LINUX_UNWIND_USER_DEFERRED_TYPES_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 78a362b80027..d292f96bc06f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -463,7 +463,9 @@ struct perf_event_attr { inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ sigtrap : 1, /* send synchronous SIGTRAP on event */ - __reserved_1 : 26; + defer_callchain: 1, /* request PERF_RECORD_CALLCHAIN_DEFERRED records */ + defer_output : 1, /* output PERF_RECORD_CALLCHAIN_DEFERRED records */ + __reserved_1 : 24; union { __u32 wakeup_events; /* wake up every n events */ @@ -1239,6 +1241,22 @@ enum perf_event_type { */ PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + /* + * This user callchain capture was deferred until shortly before + * returning to user space. Previous samples would have kernel + * callchains only and they need to be stitched with this to make full + * callchains. + * + * struct { + * struct perf_event_header header; + * u64 cookie; + * u64 nr; + * u64 ips[nr]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CALLCHAIN_DEFERRED = 22, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -1269,6 +1287,7 @@ enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, PERF_CONTEXT_USER = (__u64)-512, + PERF_CONTEXT_USER_DEFERRED = (__u64)-640, PERF_CONTEXT_GUEST = (__u64)-2048, PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, -- cgit v1.2.3 From 9c7f7262bc1affb9b9acd2ec2fb1f6314d5d474c Mon Sep 17 00:00:00 2001 From: Sander Vanheule Date: Wed, 29 Oct 2025 09:12:47 +0100 Subject: regmap: add flat cache with sparse validity The flat regcache will always assume the data in the cache is valid. 
Since the cache is preferred over hardware access, this may shadow the actual state of the device. Add a new containing cache structure with the flat data table and a bitmap indicating cache validity. REGCACHE_FLAT will still behave as before, as the validity is ignored. Define new cache type REGCACHE_FLAT_S: a flat cache with sparse validity. The sparse validity is used to determine if a hardware access should occur to initialize the cache on the fly, vs. at regmap init for REGCACHE_FLAT. Contrary to REGCACHE_FLAT, this allows us to implement regcache_ops.drop. Signed-off-by: Sander Vanheule Link: https://patch.msgid.link/20251029081248.52607-2-sander@svanheule.net Signed-off-by: Mark Brown --- include/linux/regmap.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 4e1ac1fbcec4..17bed25dc4e3 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -55,18 +55,23 @@ struct sdw_slave; #define REGMAP_DOWNSHIFT(s) (s) /* - * The supported cache types, the default is no cache. Any new caches - * should usually use the maple tree cache unless they specifically - * require that there are never any allocations at runtime and can't - * provide defaults in which case they should use the flat cache. The - * rbtree cache *may* have some performance advantage for very low end - * systems that make heavy use of cache syncs but is mainly legacy. + * The supported cache types, the default is no cache. Any new caches should + * usually use the maple tree cache unless they specifically require that there + * are never any allocations at runtime in which case they should use the sparse + * flat cache. The rbtree cache *may* have some performance advantage for very + * low end systems that make heavy use of cache syncs but is mainly legacy. + * These caches are sparse and entries will be initialized from hardware if no + * default has been provided. 
+ * The non-sparse flat cache is provided for compatibility with existing users + * and will zero-initialize cache entries for which no defaults are provided. + * New users should use the sparse flat cache. */ enum regcache_type { REGCACHE_NONE, REGCACHE_RBTREE, REGCACHE_FLAT, REGCACHE_MAPLE, + REGCACHE_FLAT_S, }; /** -- cgit v1.2.3 From 7fabcb7fbabbcddd9dc42dbe4c92d18ce3e54283 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:17 +0200 Subject: mm,btrfs: add a filemap_flush_nr helper Abstract out the btrfs-specific behavior of kicking off I/O on a number of pages on an address_space into a well-defined helper. Note: there is no kerneldoc comment for the new function because it is not part of the public API. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-7-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..cebdf160d3dd 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -38,6 +38,7 @@ int filemap_invalidate_pages(struct address_space *mapping, int write_inode_now(struct inode *, int sync); int filemap_fdatawrite(struct address_space *); int filemap_flush(struct address_space *); +int filemap_flush_nr(struct address_space *mapping, long *nr_to_write); int filemap_fdatawait_keep_errors(struct address_space *mapping); int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend); int filemap_fdatawait_range_keep_errors(struct address_space *mapping, -- cgit v1.2.3 From 1bcb413d0cd80efb386751910036a93147fd8dbc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:19 +0200 Subject: mm: remove filemap_fdatawrite_wbc Replace filemap_fdatawrite_wbc, which exposes a 
writeback_control to the callers with a filemap_writeback helper that takes all the possible arguments and declares the writeback_control itself. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-9-hch@lst.de Reviewed-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index cebdf160d3dd..678d8ae23d01 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -60,8 +60,6 @@ int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); void __filemap_set_wb_err(struct address_space *mapping, int err); -int filemap_fdatawrite_wbc(struct address_space *mapping, - struct writeback_control *wbc); int kiocb_write_and_wait(struct kiocb *iocb, size_t count); static inline int filemap_write_and_wait(struct address_space *mapping) -- cgit v1.2.3 From 45cbce5b8877f339b72548f60aa97634044c255c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:20 +0200 Subject: mm: remove __filemap_fdatawrite_range Use filemap_fdatawrite_range and filemap_fdatawrite_range_kick instead of the low-level __filemap_fdatawrite_range that requires the caller to know the internals of the writeback_control structure and remove __filemap_fdatawrite_range now that it is trivial and only two callers would be left. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-10-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 678d8ae23d01..d0a7dd43c835 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -54,8 +54,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); -int __filemap_fdatawrite_range(struct address_space *mapping, - loff_t start, loff_t end, int sync_mode); int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); int filemap_check_errors(struct address_space *mapping); -- cgit v1.2.3 From c28d67b33cbf6da2043ee7517f1aa4cbf92dbbba Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Oct 2025 10:04:21 +0200 Subject: mm: rename filemap_fdatawrite_range_kick to filemap_flush_range Rename filemap_fdatawrite_range_kick to filemap_flush_range because it is the ranged version of filemap_flush. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251024080431.324236-11-hch@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..a5dbfa20f8d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3014,7 +3014,7 @@ extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); -int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, +int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) @@ -3051,8 +3051,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_flush_range(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; -- cgit v1.2.3 From 90db4d4441f58d433ecf74f7e3bd17e0a553c20c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Oct 2025 05:45:48 +0200 Subject: writeback: allow the file system to override MIN_WRITEBACK_PAGES The relatively low minimal writeback size of 4MiB means that written back inodes on rotational media are switched a lot. Besides introducing additional seeks, this also can lead to extreme file fragmentation on zoned devices when a lot of files are cached relative to the available writeback bandwidth. Add a superblock field that allows the file system to override the default size. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251017034611.651385-3-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + include/linux/writeback.h | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index a5dbfa20f8d7..6bf369095d2e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1583,6 +1583,7 @@ struct super_block { spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ + long s_min_writeback_pages; } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 22dd4adc5667..49e1dd96f43e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -374,4 +374,9 @@ bool redirty_page_for_writepage(struct writeback_control *, struct page *); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); +/* + * 4MB minimal write chunk size + */ +#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) + #endif /* WRITEBACK_H */ -- cgit v1.2.3 From 4952f35f0545f3b53dab8d5fd727c4827c2a2778 Mon Sep 17 00:00:00 2001 From: Julian Sun Date: Mon, 29 Sep 2025 19:13:49 +0800 Subject: fs: Make wbc_to_tag() inline and use it in fs. The logic in wbc_to_tag() is widely used in file systems, so modify this function to be inline and use it in file systems. This patch has only passed compilation tests, but it should be fine. 
Signed-off-by: Julian Sun Reviewed-by: Qu Wenruo Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/writeback.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 49e1dd96f43e..2a81816f7507 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -196,6 +196,13 @@ static inline void wait_on_inode(struct inode *inode) !(READ_ONCE(inode->i_state) & I_NEW)); } +static inline xa_mark_t wbc_to_tag(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + return PAGECACHE_TAG_TOWRITE; + return PAGECACHE_TAG_DIRTY; +} + #ifdef CONFIG_CGROUP_WRITEBACK #include -- cgit v1.2.3 From f0e7036fc9cb08bdfb27d64eee7fc003ba0bc2e5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 27 Oct 2025 10:22:30 +0200 Subject: ipv4: icmp: Add RFC 5837 support Add the ability to append the incoming IP interface information to ICMPv4 error messages in accordance with RFC 5837 and RFC 4884. This is required for more meaningful traceroute results in unnumbered networks. The feature is disabled by default and controlled via a new sysctl ("net.ipv4.icmp_errors_extension_mask") which accepts a bitmask of ICMP extensions to append to ICMP error messages. Currently, only a single value is supported, but the interface and the implementation should be able to support more extensions, if needed. Clone the skb and copy the relevant data portions before modifying the skb as the caller of __icmp_send() still owns the skb after the function returns. This should be fine since by default ICMP error messages are rate limited to 1000 per second and no more than 1 per second per specific host. Trim or pad the packet to 128 bytes before appending the ICMP extension structure in order to be compatible with legacy applications that assume that the ICMP extension structure always starts at this offset (the minimum length specified by RFC 4884). 
Reviewed-by: Petr Machata Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251027082232.232571-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/icmp.h | 32 ++++++++++++++++++++++++++++++++ include/net/netns/ipv4.h | 1 + 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/icmp.h b/include/linux/icmp.h index 0af4d210ee31..043ec5d9c882 100644 --- a/include/linux/icmp.h +++ b/include/linux/icmp.h @@ -40,4 +40,36 @@ void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); +/* RFC 4884 */ +#define ICMP_EXT_ORIG_DGRAM_MIN_LEN 128 +#define ICMP_EXT_VERSION_2 2 + +/* ICMP Extension Object Classes */ +#define ICMP_EXT_OBJ_CLASS_IIO 2 /* RFC 5837 */ + +/* Interface Information Object - RFC 5837 */ +enum { + ICMP_EXT_CTYPE_IIO_ROLE_IIF, +}; + +#define ICMP_EXT_CTYPE_IIO_ROLE(ROLE) ((ROLE) << 6) +#define ICMP_EXT_CTYPE_IIO_MTU BIT(0) +#define ICMP_EXT_CTYPE_IIO_NAME BIT(1) +#define ICMP_EXT_CTYPE_IIO_IPADDR BIT(2) +#define ICMP_EXT_CTYPE_IIO_IFINDEX BIT(3) + +struct icmp_ext_iio_name_subobj { + u8 len; + char name[IFNAMSIZ]; +}; + +enum { + /* RFC 5837 - Incoming IP Interface Role */ + ICMP_ERR_EXT_IIO_IIF, + /* Add new constants above. Used by "icmp_errors_extension_mask" + * sysctl. 
+ */ + ICMP_ERR_EXT_COUNT, +}; + #endif /* _LINUX_ICMP_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 34eb3aecb3f2..0e96c90e56c6 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,7 @@ struct netns_ipv4 { u8 sysctl_icmp_echo_ignore_broadcasts; u8 sysctl_icmp_ignore_bogus_error_responses; u8 sysctl_icmp_errors_use_inbound_ifaddr; + u8 sysctl_icmp_errors_extension_mask; int sysctl_icmp_ratelimit; int sysctl_icmp_ratemask; int sysctl_icmp_msgs_per_sec; -- cgit v1.2.3 From d12d04d221f8d928a27a66236228e7501cd4cad5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 27 Oct 2025 10:22:31 +0200 Subject: ipv6: icmp: Add RFC 5837 support Add the ability to append the incoming IP interface information to ICMPv6 error messages in accordance with RFC 5837 and RFC 4884. This is required for more meaningful traceroute results in unnumbered networks. The feature is disabled by default and controlled via a new sysctl ("net.ipv6.icmp.errors_extension_mask") which accepts a bitmask of ICMP extensions to append to ICMP error messages. Currently, only a single value is supported, but the interface and the implementation should be able to support more extensions, if needed. Clone the skb and copy the relevant data portions before modifying the skb as the caller of icmp6_send() still owns the skb after the function returns. This should be fine since by default ICMP error messages are rate limited to 1000 per second and no more than 1 per second per specific host. Trim or pad the packet to 128 bytes before appending the ICMP extension structure in order to be compatible with legacy applications that assume that the ICMP extension structure always starts at this offset (the minimum length specified by RFC 4884). Since commit 20e1954fe238 ("ipv6: RFC 4884 partial support for SIT/GRE tunnels") it is possible for icmp6_send() to be called with an skb that already contains ICMP extensions. 
This can happen when we receive an ICMPv4 message with extensions from a tunnel and translate it to an ICMPv6 message towards an IPv6 host in the overlay network. I could not find an RFC that supports this behavior, but it makes sense to not overwrite the original extensions that were appended to the packet. Therefore, avoid appending extensions if the length field in the provided ICMPv6 header is already filled. Export netdev_copy_name() using EXPORT_IPV6_MOD_GPL() to make it available to IPv6 when it is built as a module. Reviewed-by: Petr Machata Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251027082232.232571-3-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 47dc70d8100a..08d2ecc96e2b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -56,6 +56,7 @@ struct netns_sysctl_ipv6 { u8 skip_notify_on_dev_down; u8 fib_notify_on_flag_change; u8 icmpv6_error_anycast_as_unicast; + u8 icmpv6_errors_extension_mask; }; struct netns_ipv6 { -- cgit v1.2.3 From 26888de97b2ffe0267c12dd4e9fcd552545903f1 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 25 Oct 2025 20:49:50 +0200 Subject: net: phy: add iterator mdiobus_for_each_phy Add an iterator for all PHY's on a MII bus, and phy_find_next() as a prerequisite. 
Signed-off-by: Heiner Kallweit Reviewed-by: Wei Fang Link: https://patch.msgid.link/cd112f15-401a-43d9-8525-9ff0965a68cd@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 17a2cdc9f1a0..358dd6f0ff96 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1869,7 +1869,7 @@ int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); -struct phy_device *phy_find_first(struct mii_bus *bus); +struct phy_device *phy_find_next(struct mii_bus *bus, struct phy_device *pos); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, @@ -1896,6 +1896,15 @@ bool phy_check_valid(int speed, int duplex, unsigned long *features); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); +static inline struct phy_device *phy_find_first(struct mii_bus *bus) +{ + return phy_find_next(bus, NULL); +} + +#define mdiobus_for_each_phy(_bus, _phydev) \ + for (_phydev = phy_find_first(_bus); _phydev; \ + _phydev = phy_find_next(_bus, _phydev)) + #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 00b3e8480be7a49203594bd1fdb4fd46f3b69d59 Mon Sep 17 00:00:00 2001 From: Izhar Ameer Shaikh Date: Tue, 21 Oct 2025 17:00:01 +0530 Subject: scsi: firmware: xilinx: Add support for secure read/write ioctl interface Add support for a generic ioctl read/write interface using which users can request firmware to perform read/write operations on a protected and secure address space. 
The functionality is introduced through the means of two new IOCTL IDs which extend the existing PM_IOCTL EEMI API: - IOCTL_READ_REG - IOCTL_MASK_WRITE_REG The caller only passes the node id of the given device and an offset. The base address is not exposed to the caller and internally retrieved by the firmware. Firmware will enforce an access policy on the incoming read/write request. Signed-off-by: Izhar Ameer Shaikh Reviewed-by: Tanmay Shah Signed-off-by: Radhey Shyam Pandey Signed-off-by: Ajay Neeli Acked-by: Senthil Nathan Thangaraj Acked-by: Michal Simek Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-3-ajay.neeli@amd.com Signed-off-by: Martin K. Petersen --- include/linux/firmware/xlnx-zynqmp.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index ae48d619c4e0..b161f37de5cc 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -241,6 +241,7 @@ enum pm_ioctl_id { IOCTL_GET_FEATURE_CONFIG = 27, /* IOCTL for Secure Read/Write Interface */ IOCTL_READ_REG = 28, + IOCTL_MASK_WRITE_REG = 29, /* Dynamic SD/GEM configuration */ IOCTL_SET_SD_CONFIG = 30, IOCTL_SET_GEM_CONFIG = 31, @@ -619,6 +620,9 @@ int zynqmp_pm_feature(const u32 api_id); int zynqmp_pm_is_function_supported(const u32 api_id, const u32 id); int zynqmp_pm_set_feature_config(enum pm_feature_config_id id, u32 value); int zynqmp_pm_get_feature_config(enum pm_feature_config_id id, u32 *payload); +int zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value); +int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value); int zynqmp_pm_register_sgi(u32 sgi_num, u32 reset); int zynqmp_pm_force_pwrdwn(const u32 target, const enum zynqmp_pm_request_ack ack); @@ -916,6 +920,17 @@ static inline int zynqmp_pm_request_wake(const u32 node, return -ENODEV; } +static inline int 
zynqmp_pm_sec_read_reg(u32 node_id, u32 offset, u32 *ret_value) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_sec_mask_write_reg(const u32 node_id, const u32 offset, + u32 mask, u32 value) +{ + return -ENODEV; +} + static inline int zynqmp_pm_get_rpu_mode(u32 node_id, enum rpu_oper_mode *rpu_mode) { return -ENODEV; -- cgit v1.2.3 From 0e4d26f79a74bc633846a27a9a20d52217c108dc Mon Sep 17 00:00:00 2001 From: Ajay Neeli Date: Tue, 21 Oct 2025 17:00:02 +0530 Subject: scsi: firmware: xilinx: Add APIs for UFS PHY initialization - Add APIs for UFS PHY initialization. - Verify M-PHY TX-RX configuration readiness. - Confirm SRAM initialization and Set SRAM bypass. - Retrieve UFS calibration values. Signed-off-by: Ajay Neeli Acked-by: Senthil Nathan Thangaraj Acked-by: Michal Simek Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-4-ajay.neeli@amd.com Signed-off-by: Martin K. Petersen --- include/linux/firmware/xlnx-zynqmp-ufs.h | 38 ++++++++++++++++++++++++++++++++ include/linux/firmware/xlnx-zynqmp.h | 1 + 2 files changed, 39 insertions(+) create mode 100644 include/linux/firmware/xlnx-zynqmp-ufs.h (limited to 'include') diff --git a/include/linux/firmware/xlnx-zynqmp-ufs.h b/include/linux/firmware/xlnx-zynqmp-ufs.h new file mode 100644 index 000000000000..d3538dd5822a --- /dev/null +++ b/include/linux/firmware/xlnx-zynqmp-ufs.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Firmware layer for UFS APIs. + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. 
+ */ + +#ifndef __FIRMWARE_XLNX_ZYNQMP_UFS_H__ +#define __FIRMWARE_XLNX_ZYNQMP_UFS_H__ + +#if IS_REACHABLE(CONFIG_ZYNQMP_FIRMWARE) +int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready); +int zynqmp_pm_is_sram_init_done(bool *is_done); +int zynqmp_pm_set_sram_bypass(void); +int zynqmp_pm_get_ufs_calibration_values(u32 *val); +#else +static inline int zynqmp_pm_is_mphy_tx_rx_config_ready(bool *is_ready) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_is_sram_init_done(bool *is_done) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_set_sram_bypass(void) +{ + return -ENODEV; +} + +static inline int zynqmp_pm_get_ufs_calibration_values(u32 *val) +{ + return -ENODEV; +} +#endif + +#endif /* __FIRMWARE_XLNX_ZYNQMP_UFS_H__ */ diff --git a/include/linux/firmware/xlnx-zynqmp.h b/include/linux/firmware/xlnx-zynqmp.h index b161f37de5cc..784d5920b4cd 100644 --- a/include/linux/firmware/xlnx-zynqmp.h +++ b/include/linux/firmware/xlnx-zynqmp.h @@ -16,6 +16,7 @@ #include #include +#include #define ZYNQMP_PM_VERSION_MAJOR 1 #define ZYNQMP_PM_VERSION_MINOR 0 -- cgit v1.2.3 From 769b8b2ffded4cd880669edd83e2952efeeb27f7 Mon Sep 17 00:00:00 2001 From: Sai Krishna Potthuri Date: Tue, 21 Oct 2025 17:00:03 +0530 Subject: scsi: ufs: amd-versal2: Add UFS support for AMD Versal Gen 2 SoC Add support for the UFS host controller on the AMD Versal Gen 2 SoC, built on the Synopsys DWC UFS architecture, using the UFSHCD DWC and UFSHCD platform driver. This controller requires specific configurations like M-PHY/RMMI/UniPro and vendor specific registers programming before doing the UIC_LINKSTARTUP. Signed-off-by: Sai Krishna Potthuri Signed-off-by: Ajay Neeli Acked-by: Bart Van Assche Link: https://patch.msgid.link/20251021113003.13650-5-ajay.neeli@amd.com Signed-off-by: Martin K. 
Petersen --- include/ufs/unipro.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/ufs/unipro.h b/include/ufs/unipro.h index 360e1245fb40..faf1c471ad30 100644 --- a/include/ufs/unipro.h +++ b/include/ufs/unipro.h @@ -174,6 +174,7 @@ #define VS_POWERSTATE 0xD083 #define VS_MPHYCFGUPDT 0xD085 #define VS_DEBUGOMC 0xD09E +#define VS_MPHYDISABLE 0xD0C1 #define PA_GRANULARITY_MIN_VAL 1 #define PA_GRANULARITY_MAX_VAL 6 -- cgit v1.2.3 From 50b8e36994a042103ea92b6d9f6d7de725f9ac5f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:30:57 -0700 Subject: lib/crypto: blake2s: Adjust parameter order of blake2s() Reorder the parameters of blake2s() from (out, in, key, outlen, inlen, keylen) to (key, keylen, in, inlen, out, outlen). This aligns BLAKE2s with the common conventions of pairing buffers and their lengths, and having outputs follow inputs. This is widely used elsewhere in lib/crypto/ and crypto/, and even elsewhere in the BLAKE2s code itself such as blake2s_init_key() and blake2s_final(). So blake2s() was a bit of an exception. Notably, this results in the same order as hmac_*_usingrawkey(). Note that since the type signature changed, it's not possible for a blake2s() call site to be silently missed. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2s.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index f9ffd39194eb..a7dd678725b2 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -86,9 +86,9 @@ static inline void blake2s_init_key(struct blake2s_state *state, void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen); void blake2s_final(struct blake2s_state *state, u8 *out); -static inline void blake2s(u8 *out, const u8 *in, const u8 *key, - const size_t outlen, const size_t inlen, - const size_t keylen) +static inline void blake2s(const u8 *key, const size_t keylen, + const u8 *in, const size_t inlen, + u8 *out, const size_t outlen) { struct blake2s_state state; -- cgit v1.2.3 From 5e0ec8e46d4d6488242bb39a4ce5c0276afa5f32 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:30:58 -0700 Subject: lib/crypto: blake2s: Rename blake2s_state to blake2s_ctx For consistency with the SHA-1, SHA-2, SHA-3 (in development), and MD5 library APIs, rename blake2s_state to blake2s_ctx. As a refresher, the ctx name: - Is a bit shorter. - Avoids confusion with the compression function state, which is also often called the state (but is just part of the full context). - Is consistent with OpenSSL. Not a big deal, of course. But consistency is nice. With a BLAKE2b library API about to be added, this is a convenient time to update this. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2s.h | 59 ++++++++++++++++++++++++------------------------ 1 file changed, 29 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index a7dd678725b2..4c8d532ee97b 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -22,7 +22,7 @@ enum blake2s_lengths { BLAKE2S_256_HASH_SIZE = 32, }; -struct blake2s_state { +struct blake2s_ctx { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ u32 h[8]; u32 t[2]; @@ -43,62 +43,61 @@ enum blake2s_iv { BLAKE2S_IV7 = 0x5BE0CD19UL, }; -static inline void __blake2s_init(struct blake2s_state *state, size_t outlen, +static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, const void *key, size_t keylen) { - state->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2S_IV1; - state->h[2] = BLAKE2S_IV2; - state->h[3] = BLAKE2S_IV3; - state->h[4] = BLAKE2S_IV4; - state->h[5] = BLAKE2S_IV5; - state->h[6] = BLAKE2S_IV6; - state->h[7] = BLAKE2S_IV7; - state->t[0] = 0; - state->t[1] = 0; - state->f[0] = 0; - state->f[1] = 0; - state->buflen = 0; - state->outlen = outlen; + ctx->h[0] = BLAKE2S_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2S_IV1; + ctx->h[2] = BLAKE2S_IV2; + ctx->h[3] = BLAKE2S_IV3; + ctx->h[4] = BLAKE2S_IV4; + ctx->h[5] = BLAKE2S_IV5; + ctx->h[6] = BLAKE2S_IV6; + ctx->h[7] = BLAKE2S_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; if (keylen) { - memcpy(state->buf, key, keylen); - memset(&state->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); - state->buflen = BLAKE2S_BLOCK_SIZE; + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2S_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2S_BLOCK_SIZE; } } -static inline void 
blake2s_init(struct blake2s_state *state, - const size_t outlen) +static inline void blake2s_init(struct blake2s_ctx *ctx, const size_t outlen) { - __blake2s_init(state, outlen, NULL, 0); + __blake2s_init(ctx, outlen, NULL, 0); } -static inline void blake2s_init_key(struct blake2s_state *state, +static inline void blake2s_init_key(struct blake2s_ctx *ctx, const size_t outlen, const void *key, const size_t keylen) { WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE || !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); - __blake2s_init(state, outlen, key, keylen); + __blake2s_init(ctx, outlen, key, keylen); } -void blake2s_update(struct blake2s_state *state, const u8 *in, size_t inlen); -void blake2s_final(struct blake2s_state *state, u8 *out); +void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); +void blake2s_final(struct blake2s_ctx *ctx, u8 *out); static inline void blake2s(const u8 *key, const size_t keylen, const u8 *in, const size_t inlen, u8 *out, const size_t outlen) { - struct blake2s_state state; + struct blake2s_ctx ctx; WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || outlen > BLAKE2S_HASH_SIZE || keylen > BLAKE2S_KEY_SIZE || (!key && keylen))); - __blake2s_init(&state, outlen, key, keylen); - blake2s_update(&state, in, inlen); - blake2s_final(&state, out); + __blake2s_init(&ctx, outlen, key, keylen); + blake2s_update(&ctx, in, inlen); + blake2s_final(&ctx, out); } #endif /* _CRYPTO_BLAKE2S_H */ -- cgit v1.2.3 From 5385bcbffe5a76a74d6bb135af1c88fb235f8134 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:30:59 -0700 Subject: lib/crypto: blake2s: Drop excessive const & rename block => data A couple more small cleanups to the BLAKE2s code before these things get propagated into the BLAKE2b code: - Drop 'const' from some non-pointer function parameters. It was a bit excessive and not conventional. - Rename 'block' argument of blake2s_compress*() to 'data'. 
This is for consistency with the SHA-* code, and also to avoid the implication that it points to a singular "block". No functional changes. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2s.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index 4c8d532ee97b..33893057eb41 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -67,14 +67,13 @@ static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, } } -static inline void blake2s_init(struct blake2s_ctx *ctx, const size_t outlen) +static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen) { __blake2s_init(ctx, outlen, NULL, 0); } -static inline void blake2s_init_key(struct blake2s_ctx *ctx, - const size_t outlen, const void *key, - const size_t keylen) +static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, + const void *key, size_t keylen) { WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2S_HASH_SIZE || !key || !keylen || keylen > BLAKE2S_KEY_SIZE)); @@ -85,9 +84,9 @@ static inline void blake2s_init_key(struct blake2s_ctx *ctx, void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); void blake2s_final(struct blake2s_ctx *ctx, u8 *out); -static inline void blake2s(const u8 *key, const size_t keylen, - const u8 *in, const size_t inlen, - u8 *out, const size_t outlen) +static inline void blake2s(const u8 *key, size_t keylen, + const u8 *in, size_t inlen, + u8 *out, size_t outlen) { struct blake2s_ctx ctx; -- cgit v1.2.3 From b95d4471cb5830b59667ead8d1d59dc3d661a1df Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:00 -0700 Subject: lib/crypto: blake2s: Document the BLAKE2s library API Add kerneldoc for the BLAKE2s library API. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2s.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) (limited to 'include') diff --git a/include/crypto/blake2s.h b/include/crypto/blake2s.h index 33893057eb41..648cb7824358 100644 --- a/include/crypto/blake2s.h +++ b/include/crypto/blake2s.h @@ -22,6 +22,15 @@ enum blake2s_lengths { BLAKE2S_256_HASH_SIZE = 32, }; +/** + * struct blake2s_ctx - Context for hashing a message with BLAKE2s + * @h: compression function state + * @t: block counter + * @f: finalization indicator + * @buf: partial block buffer; 'buflen' bytes are valid + * @buflen: number of bytes buffered in @buf + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + */ struct blake2s_ctx { /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ u32 h[8]; @@ -67,11 +76,27 @@ static inline void __blake2s_init(struct blake2s_ctx *ctx, size_t outlen, } } +/** + * blake2s_init() - Initialize a BLAKE2s context for a new message (unkeyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * + * Context: Any context. + */ static inline void blake2s_init(struct blake2s_ctx *ctx, size_t outlen) { __blake2s_init(ctx, outlen, NULL, 0); } +/** + * blake2s_init_key() - Initialize a BLAKE2s context for a new message (keyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * @key: the key + * @keylen: the key length in bytes, at most BLAKE2S_KEY_SIZE + * + * Context: Any context. 
+ */ static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, const void *key, size_t keylen) { @@ -81,9 +106,42 @@ static inline void blake2s_init_key(struct blake2s_ctx *ctx, size_t outlen, __blake2s_init(ctx, outlen, key, keylen); } +/** + * blake2s_update() - Update a BLAKE2s context with message data + * @ctx: the context to update; must have been initialized + * @in: the message data + * @inlen: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ void blake2s_update(struct blake2s_ctx *ctx, const u8 *in, size_t inlen); + +/** + * blake2s_final() - Finish computing a BLAKE2s hash + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting BLAKE2s hash. Its length will be equal to the + * @outlen that was passed to blake2s_init() or blake2s_init_key(). + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ void blake2s_final(struct blake2s_ctx *ctx, u8 *out); +/** + * blake2s() - Compute BLAKE2s hash in one shot + * @key: the key, or NULL for an unkeyed hash + * @keylen: the key length in bytes (at most BLAKE2S_KEY_SIZE), or 0 for an + * unkeyed hash + * @in: the message data + * @inlen: the data length in bytes + * @out: (output) the resulting BLAKE2s hash, with length @outlen + * @outlen: length of output hash value in bytes, at most BLAKE2S_HASH_SIZE + * + * Context: Any context. + */ static inline void blake2s(const u8 *key, size_t keylen, const u8 *in, size_t inlen, u8 *out, size_t outlen) -- cgit v1.2.3 From c99d30706043481a1d631bbd9c7a4b70fe002a2b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:01 -0700 Subject: byteorder: Add le64_to_cpu_array() and cpu_to_le64_array() Add le64_to_cpu_array() and cpu_to_le64_array(). These mirror the corresponding 32-bit functions. These will be used by the BLAKE2b code. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/linux/byteorder/generic.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index b3705e8bbe2b..55a44199de87 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -173,6 +173,22 @@ static inline void cpu_to_le32_array(u32 *buf, unsigned int words) } } +static inline void le64_to_cpu_array(u64 *buf, unsigned int words) +{ + while (words--) { + __le64_to_cpus(buf); + buf++; + } +} + +static inline void cpu_to_le64_array(u64 *buf, unsigned int words) +{ + while (words--) { + __cpu_to_le64s(buf); + buf++; + } +} + static inline void memcpy_from_le32(u32 *dst, const __le32 *src, size_t words) { size_t i; -- cgit v1.2.3 From 23a16c9533ed92cc639c8f5bd9eb104809fe2919 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:02 -0700 Subject: lib/crypto: blake2b: Add BLAKE2b library functions Add a library API for BLAKE2b, closely modeled after the BLAKE2s API. This will allow in-kernel users such as btrfs to use BLAKE2b without going through the generic crypto layer. In addition, as usual the BLAKE2b crypto_shash algorithms will be reimplemented on top of this. Note: to create lib/crypto/blake2b.c I made a copy of lib/crypto/blake2s.c and made the updates from BLAKE2s => BLAKE2b. This way, the BLAKE2s and BLAKE2b code is kept consistent. Therefore, it borrows the SPDX-License-Identifier and Copyright from lib/crypto/blake2s.c rather than crypto/blake2b_generic.c. The library API uses 'struct blake2b_ctx', consistent with other lib/crypto/ APIs. The existing 'struct blake2b_state' will be removed once the blake2b crypto_shash algorithms are updated to stop using it. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2b.h | 133 ++++++++++++++++++++++++++++++++++---- include/crypto/internal/blake2b.h | 17 ++++- 2 files changed, 137 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h index dd7694477e50..4879e2ec2686 100644 --- a/include/crypto/blake2b.h +++ b/include/crypto/blake2b.h @@ -28,6 +28,25 @@ enum blake2b_lengths { BLAKE2B_512_HASH_SIZE = 64, }; +/** + * struct blake2b_ctx - Context for hashing a message with BLAKE2b + * @h: compression function state + * @t: block counter + * @f: finalization indicator + * @buf: partial block buffer; 'buflen' bytes are valid + * @buflen: number of bytes buffered in @buf + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + */ +struct blake2b_ctx { + /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. 
*/ + u64 h[8]; + u64 t[2]; + u64 f[2]; + u8 buf[BLAKE2B_BLOCK_SIZE]; + unsigned int buflen; + unsigned int outlen; +}; + enum blake2b_iv { BLAKE2B_IV0 = 0x6A09E667F3BCC908ULL, BLAKE2B_IV1 = 0xBB67AE8584CAA73BULL, @@ -39,19 +58,109 @@ enum blake2b_iv { BLAKE2B_IV7 = 0x5BE0CD19137E2179ULL, }; -static inline void __blake2b_init(struct blake2b_state *state, size_t outlen, - size_t keylen) +static inline void __blake2b_init(struct blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + ctx->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); + ctx->h[1] = BLAKE2B_IV1; + ctx->h[2] = BLAKE2B_IV2; + ctx->h[3] = BLAKE2B_IV3; + ctx->h[4] = BLAKE2B_IV4; + ctx->h[5] = BLAKE2B_IV5; + ctx->h[6] = BLAKE2B_IV6; + ctx->h[7] = BLAKE2B_IV7; + ctx->t[0] = 0; + ctx->t[1] = 0; + ctx->f[0] = 0; + ctx->f[1] = 0; + ctx->buflen = 0; + ctx->outlen = outlen; + if (keylen) { + memcpy(ctx->buf, key, keylen); + memset(&ctx->buf[keylen], 0, BLAKE2B_BLOCK_SIZE - keylen); + ctx->buflen = BLAKE2B_BLOCK_SIZE; + } +} + +/** + * blake2b_init() - Initialize a BLAKE2b context for a new message (unkeyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * + * Context: Any context. + */ +static inline void blake2b_init(struct blake2b_ctx *ctx, size_t outlen) +{ + __blake2b_init(ctx, outlen, NULL, 0); +} + +/** + * blake2b_init_key() - Initialize a BLAKE2b context for a new message (keyed) + * @ctx: the context to initialize + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * @key: the key + * @keylen: the key length in bytes, at most BLAKE2B_KEY_SIZE + * + * Context: Any context. 
+ */ +static inline void blake2b_init_key(struct blake2b_ctx *ctx, size_t outlen, + const void *key, size_t keylen) +{ + WARN_ON(IS_ENABLED(DEBUG) && (!outlen || outlen > BLAKE2B_HASH_SIZE || + !key || !keylen || keylen > BLAKE2B_KEY_SIZE)); + + __blake2b_init(ctx, outlen, key, keylen); +} + +/** + * blake2b_update() - Update a BLAKE2b context with message data + * @ctx: the context to update; must have been initialized + * @in: the message data + * @inlen: the data length in bytes + * + * This can be called any number of times. + * + * Context: Any context. + */ +void blake2b_update(struct blake2b_ctx *ctx, const u8 *in, size_t inlen); + +/** + * blake2b_final() - Finish computing a BLAKE2b hash + * @ctx: the context to finalize; must have been initialized + * @out: (output) the resulting BLAKE2b hash. Its length will be equal to the + * @outlen that was passed to blake2b_init() or blake2b_init_key(). + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void blake2b_final(struct blake2b_ctx *ctx, u8 *out); + +/** + * blake2b() - Compute BLAKE2b hash in one shot + * @key: the key, or NULL for an unkeyed hash + * @keylen: the key length in bytes (at most BLAKE2B_KEY_SIZE), or 0 for an + * unkeyed hash + * @in: the message data + * @inlen: the data length in bytes + * @out: (output) the resulting BLAKE2b hash, with length @outlen + * @outlen: length of output hash value in bytes, at most BLAKE2B_HASH_SIZE + * + * Context: Any context. 
+ */ +static inline void blake2b(const u8 *key, size_t keylen, + const u8 *in, size_t inlen, + u8 *out, size_t outlen) { - state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2B_IV1; - state->h[2] = BLAKE2B_IV2; - state->h[3] = BLAKE2B_IV3; - state->h[4] = BLAKE2B_IV4; - state->h[5] = BLAKE2B_IV5; - state->h[6] = BLAKE2B_IV6; - state->h[7] = BLAKE2B_IV7; - state->t[0] = 0; - state->t[1] = 0; + struct blake2b_ctx ctx; + + WARN_ON(IS_ENABLED(DEBUG) && ((!in && inlen > 0) || !out || !outlen || + outlen > BLAKE2B_HASH_SIZE || keylen > BLAKE2B_KEY_SIZE || + (!key && keylen))); + + __blake2b_init(&ctx, outlen, key, keylen); + blake2b_update(&ctx, in, inlen); + blake2b_final(&ctx, out); } #endif /* _CRYPTO_BLAKE2B_H */ diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h index 3e09e2485306..3712df69def1 100644 --- a/include/crypto/internal/blake2b.h +++ b/include/crypto/internal/blake2b.h @@ -57,13 +57,28 @@ static inline int crypto_blake2b_setkey(struct crypto_shash *tfm, return 0; } +static inline void __crypto_blake2b_init(struct blake2b_state *state, + size_t outlen, size_t keylen) +{ + state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); + state->h[1] = BLAKE2B_IV1; + state->h[2] = BLAKE2B_IV2; + state->h[3] = BLAKE2B_IV3; + state->h[4] = BLAKE2B_IV4; + state->h[5] = BLAKE2B_IV5; + state->h[6] = BLAKE2B_IV6; + state->h[7] = BLAKE2B_IV7; + state->t[0] = 0; + state->t[1] = 0; +} + static inline int crypto_blake2b_init(struct shash_desc *desc) { const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct blake2b_state *state = shash_desc_ctx(desc); unsigned int outlen = crypto_shash_digestsize(desc->tfm); - __blake2b_init(state, outlen, tctx->keylen); + __crypto_blake2b_init(state, outlen, tctx->keylen); return tctx->keylen ? 
crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0; } -- cgit v1.2.3 From fa3ca9bfe3f001ed306cb3ce9761dacffbe143f8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 17 Oct 2025 21:31:05 -0700 Subject: crypto: blake2b - Reimplement using library API Replace blake2b_generic.c with a new file blake2b.c which implements the BLAKE2b crypto_shash algorithms on top of the BLAKE2b library API. Change the driver name suffix from "-generic" to "-lib" to reflect that these algorithms now just use the (possibly arch-optimized) library. This closely mirrors crypto/{md5,sha1,sha256,sha512}.c. Remove include/crypto/internal/blake2b.h since it is no longer used. Likewise, remove struct blake2b_state from include/crypto/blake2b.h. Omit support for import_core and export_core, since there are no legacy drivers that need these for these algorithms. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251018043106.375964-10-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/blake2b.h | 10 ---- include/crypto/internal/blake2b.h | 116 -------------------------------------- 2 files changed, 126 deletions(-) delete mode 100644 include/crypto/internal/blake2b.h (limited to 'include') diff --git a/include/crypto/blake2b.h b/include/crypto/blake2b.h index 4879e2ec2686..3bc37fd103a7 100644 --- a/include/crypto/blake2b.h +++ b/include/crypto/blake2b.h @@ -7,20 +7,10 @@ #include #include -struct blake2b_state { - /* 'h', 't', and 'f' are used in assembly code, so keep them as-is. */ - u64 h[8]; - u64 t[2]; - /* The true state ends here. The rest is temporary storage. 
*/ - u64 f[2]; -}; - enum blake2b_lengths { BLAKE2B_BLOCK_SIZE = 128, BLAKE2B_HASH_SIZE = 64, BLAKE2B_KEY_SIZE = 64, - BLAKE2B_STATE_SIZE = offsetof(struct blake2b_state, f), - BLAKE2B_DESC_SIZE = sizeof(struct blake2b_state), BLAKE2B_160_HASH_SIZE = 20, BLAKE2B_256_HASH_SIZE = 32, diff --git a/include/crypto/internal/blake2b.h b/include/crypto/internal/blake2b.h deleted file mode 100644 index 3712df69def1..000000000000 --- a/include/crypto/internal/blake2b.h +++ /dev/null @@ -1,116 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -/* - * Helper functions for BLAKE2b implementations. - * Keep this in sync with the corresponding BLAKE2s header. - */ - -#ifndef _CRYPTO_INTERNAL_BLAKE2B_H -#define _CRYPTO_INTERNAL_BLAKE2B_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static inline void blake2b_set_lastblock(struct blake2b_state *state) -{ - state->f[0] = -1; - state->f[1] = 0; -} - -static inline void blake2b_set_nonlast(struct blake2b_state *state) -{ - state->f[0] = 0; - state->f[1] = 0; -} - -typedef void (*blake2b_compress_t)(struct blake2b_state *state, - const u8 *block, size_t nblocks, u32 inc); - -/* Helper functions for shash implementations of BLAKE2b */ - -struct blake2b_tfm_ctx { - u8 key[BLAKE2B_BLOCK_SIZE]; - unsigned int keylen; -}; - -static inline int crypto_blake2b_setkey(struct crypto_shash *tfm, - const u8 *key, unsigned int keylen) -{ - struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(tfm); - - if (keylen > BLAKE2B_KEY_SIZE) - return -EINVAL; - - BUILD_BUG_ON(BLAKE2B_KEY_SIZE > BLAKE2B_BLOCK_SIZE); - - memcpy(tctx->key, key, keylen); - memset(tctx->key + keylen, 0, BLAKE2B_BLOCK_SIZE - keylen); - tctx->keylen = keylen; - - return 0; -} - -static inline void __crypto_blake2b_init(struct blake2b_state *state, - size_t outlen, size_t keylen) -{ - state->h[0] = BLAKE2B_IV0 ^ (0x01010000 | keylen << 8 | outlen); - state->h[1] = BLAKE2B_IV1; - state->h[2] = BLAKE2B_IV2; - state->h[3] = 
BLAKE2B_IV3; - state->h[4] = BLAKE2B_IV4; - state->h[5] = BLAKE2B_IV5; - state->h[6] = BLAKE2B_IV6; - state->h[7] = BLAKE2B_IV7; - state->t[0] = 0; - state->t[1] = 0; -} - -static inline int crypto_blake2b_init(struct shash_desc *desc) -{ - const struct blake2b_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); - struct blake2b_state *state = shash_desc_ctx(desc); - unsigned int outlen = crypto_shash_digestsize(desc->tfm); - - __crypto_blake2b_init(state, outlen, tctx->keylen); - return tctx->keylen ? - crypto_shash_update(desc, tctx->key, BLAKE2B_BLOCK_SIZE) : 0; -} - -static inline int crypto_blake2b_update_bo(struct shash_desc *desc, - const u8 *in, unsigned int inlen, - blake2b_compress_t compress) -{ - struct blake2b_state *state = shash_desc_ctx(desc); - - blake2b_set_nonlast(state); - compress(state, in, inlen / BLAKE2B_BLOCK_SIZE, BLAKE2B_BLOCK_SIZE); - return inlen - round_down(inlen, BLAKE2B_BLOCK_SIZE); -} - -static inline int crypto_blake2b_finup(struct shash_desc *desc, const u8 *in, - unsigned int inlen, u8 *out, - blake2b_compress_t compress) -{ - struct blake2b_state *state = shash_desc_ctx(desc); - u8 buf[BLAKE2B_BLOCK_SIZE]; - int i; - - memcpy(buf, in, inlen); - memset(buf + inlen, 0, BLAKE2B_BLOCK_SIZE - inlen); - blake2b_set_lastblock(state); - compress(state, buf, 1, inlen); - for (i = 0; i < ARRAY_SIZE(state->h); i++) - __cpu_to_le64s(&state->h[i]); - memcpy(out, state->h, crypto_shash_digestsize(desc->tfm)); - memzero_explicit(buf, sizeof(buf)); - return 0; -} - -#endif /* _CRYPTO_INTERNAL_BLAKE2B_H */ -- cgit v1.2.3 From db82ddeaf42b93799a52df347284062893ea2ad6 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 27 Oct 2025 14:22:14 +0200 Subject: wifi: mac80211: add RX flag to report radiotap VHT information mac80211 already reports some basic information in the radiotap header with the known fields declared by the driver. 
However, drivers may want to report more accurate information and in that case the full VHT radiotap structure needs to be provided. Add a new RX_FLAG_RADIOTAP_VHT which is set when the VHT information should be pulled from the skb. Update the code to fill in the VHT fields to only do so when requested by the driver or if the information has not yet been set. This way the driver can fully control the information if it chooses so. Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251027142118.0bad1c307a21.I2cf285c20a822698039603f2af00ed9c548f2ee0@changeid Signed-off-by: Johannes Berg --- include/net/ieee80211_radiotap.h | 20 +++++++++++++++++++- include/net/mac80211.h | 2 ++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ieee80211_radiotap.h b/include/net/ieee80211_radiotap.h index 813e163ce27c..c60867e7e43c 100644 --- a/include/net/ieee80211_radiotap.h +++ b/include/net/ieee80211_radiotap.h @@ -1,6 +1,6 @@ /* * Copyright (c) 2017 Intel Deutschland GmbH - * Copyright (c) 2018-2019, 2021-2022 Intel Corporation + * Copyright (c) 2018-2019, 2021-2022, 2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -202,6 +202,24 @@ enum ieee80211_radiotap_vht_coding { IEEE80211_RADIOTAP_CODING_LDPC_USER3 = 0x08, }; +enum ieee80211_radiotap_vht_bandwidth { + /* Note: more values are defined but can't really be used */ + IEEE80211_RADIOTAP_VHT_BW_20 = 0, + IEEE80211_RADIOTAP_VHT_BW_40 = 1, + IEEE80211_RADIOTAP_VHT_BW_80 = 4, + IEEE80211_RADIOTAP_VHT_BW_160 = 11, +}; + +struct ieee80211_radiotap_vht { + __le16 known; + u8 flags; + u8 bandwidth; + u8 mcs_nss[4]; + u8 coding; + u8 group_id; + __le16 partial_aid; +} __packed; + /* for IEEE80211_RADIOTAP_TIMESTAMP */ enum ieee80211_radiotap_timestamp_unit_spos { 
IEEE80211_RADIOTAP_TIMESTAMP_UNIT_MASK = 0x000F, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a55085cf4ec4..c326243e1f01 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1529,6 +1529,7 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * known the frame shouldn't be reported. * @RX_FLAG_8023: the frame has an 802.3 header (decap offload performed by * hardware or driver) + * @RX_FLAG_RADIOTAP_VHT: VHT radiotap data is present */ enum mac80211_rx_flags { RX_FLAG_MMIC_ERROR = BIT(0), @@ -1564,6 +1565,7 @@ enum mac80211_rx_flags { RX_FLAG_RADIOTAP_LSIG = BIT(28), RX_FLAG_NO_PSDU = BIT(29), RX_FLAG_8023 = BIT(30), + RX_FLAG_RADIOTAP_VHT = BIT(31), }; /** -- cgit v1.2.3 From 61fafbee6cfed283c02a320896089f658fa67e56 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Tue, 28 Oct 2025 04:22:48 +0200 Subject: xfrm: Determine inner GSO type from packet inner protocol The GSO segmentation functions for ESP tunnel mode (xfrm4_tunnel_gso_segment and xfrm6_tunnel_gso_segment) were determining the inner packet's L2 protocol type by checking the static x->inner_mode.family field from the xfrm state. This is unreliable. In tunnel mode, the state's actual inner family could be defined by x->inner_mode.family or by x->inner_mode_iaf.family. Checking only the former can lead to a mismatch with the actual packet being processed, causing GSO to create segments with the wrong L2 header type. This patch fixes the bug by deriving the inner mode directly from the packet's inner protocol stored in XFRM_MODE_SKB_CB(skb)->protocol. Instead of replicating the code, this patch modifies the xfrm_ip2inner_mode helper function. It now correctly returns &x->inner_mode if the selector family (x->sel.family) is already specified, thereby handling both specific and AF_UNSPEC cases appropriately. With this change, ESP GSO can use xfrm_ip2inner_mode to get the correct inner mode. 
It doesn't affect existing callers, as the updated logic now mirrors the checks they were already performing externally. Fixes: 26dbd66eab80 ("esp: choose the correct inner protocol for GSO on inter address family tunnels") Signed-off-by: Jianbo Liu Reviewed-by: Cosmin Ratiu Reviewed-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f3014e4f54fc..0a14daaa5dd4 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -536,7 +536,8 @@ static inline int xfrm_af2proto(unsigned int family) static inline const struct xfrm_mode *xfrm_ip2inner_mode(struct xfrm_state *x, int ipproto) { - if ((ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || + if ((x->sel.family != AF_UNSPEC) || + (ipproto == IPPROTO_IPIP && x->props.family == AF_INET) || (ipproto == IPPROTO_IPV6 && x->props.family == AF_INET6)) return &x->inner_mode; else -- cgit v1.2.3 From 57347d58a4011551e7d0e030f2f12e4d1a28feb6 Mon Sep 17 00:00:00 2001 From: "caivive (Weibiao Tu)" Date: Thu, 28 Nov 2024 20:52:04 +0800 Subject: netfilter: fix typo in nf_conntrack_l4proto.h comment In the comment for nf_conntrack_l4proto.h, the word "nfnetink" was incorrectly spelled. It has been corrected to "nfnetlink". Fixes a typo to enhance readability and ensure consistency. 
Signed-off-by: caivive (Weibiao Tu) Signed-off-by: Florian Westphal --- include/net/netfilter/nf_conntrack_l4proto.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 6929f8daf1ed..cd5020835a6d 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -30,7 +30,7 @@ struct nf_conntrack_l4proto { /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); - /* convert protoinfo to nfnetink attributes */ + /* convert protoinfo to nfnetlink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); -- cgit v1.2.3 From 74a7b4f18396f07e87c7fda5c19d1fcfb8c1dd44 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Oct 2025 00:08:02 -0700 Subject: sysctl: fix kernel-doc format warning Describe the "type" struct member using '@type' and move it together with the rest of the doc for ctl_table_header to avoid a kernel-doc warning: Warning: include/linux/sysctl.h:178 Incorrect use of kernel-doc format: * enum type - Enumeration to differentiate between ctl target types Fixes: 2f2665c13af4 ("sysctl: replace child with an enumeration") Signed-off-by: Randy Dunlap Signed-off-by: Joel Granados --- include/linux/sysctl.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 92e9146b1104..28c4a997fd21 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -156,6 +156,10 @@ struct ctl_node { * @nreg: When nreg drops to 0 the ctl_table_header will be unregistered. * @rcu: Delays the freeing of the inode. 
Introduced with "unfuck proc_sysctl ->d_compare()" * + * @type: Enumeration to differentiate between ctl target types + * @type.SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations + * @type.SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Identifies a permanently empty dir + * target to serve as a mount point */ struct ctl_table_header { union { @@ -175,13 +179,6 @@ struct ctl_table_header { struct ctl_dir *parent; struct ctl_node *node; struct hlist_head inodes; /* head for proc_inode->sysctl_inodes */ - /** - * enum type - Enumeration to differentiate between ctl target types - * @SYSCTL_TABLE_TYPE_DEFAULT: ctl target with no special considerations - * @SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY: Used to identify a permanently - * empty directory target to serve - * as mount point. - */ enum { SYSCTL_TABLE_TYPE_DEFAULT, SYSCTL_TABLE_TYPE_PERMANENTLY_EMPTY, -- cgit v1.2.3 From aef3cdb47bbbef9fea9512ed6c02d64394449d53 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Mon, 27 Oct 2025 23:48:55 +0100 Subject: net/smc: make wr buffer count configurable Think SMC_WR_BUF_CNT_SEND := SMC_WR_BUF_CNT used in send context and SMC_WR_BUF_CNT_RECV := 3 * SMC_WR_BUF_CNT used in recv context. Those get replaced with lgr->max_send_wr and lgr->max_recv_wr respectively. Please note that although qp_attr.cap.max_send_wr == qp_attr.cap.max_recv_wr is maintained with the default sysctl values, it cannot be assumed to be generally true any more. I see no downside to that, but my confidence level is rather modest.
Signed-off-by: Halil Pasic Reviewed-by: Sidraya Jayagond Reviewed-by: Dust Li Tested-by: Mahanta Jambigi Link: https://patch.msgid.link/20251027224856.2970019-2-pasic@linux.ibm.com Signed-off-by: Paolo Abeni --- include/net/netns/smc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index fc752a50f91b..6ceb12baec24 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -24,5 +24,7 @@ struct netns_smc { int sysctl_rmem; int sysctl_max_links_per_lgr; int sysctl_max_conns_per_lgr; + unsigned int sysctl_smcr_max_send_wr; + unsigned int sysctl_smcr_max_recv_wr; }; #endif -- cgit v1.2.3 From 4061c43a99772c66c378cfacaa71550ab3b35909 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Oct 2025 09:45:48 +0100 Subject: pidfs: add missing PIDFD_INFO_SIZE_VER1 We grew struct pidfd_info not too long ago. Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-3-ca449b7b7aa0@kernel.org Fixes: 1d8db6fd698d ("pidfs, coredump: add PIDFD_INFO_COREDUMP") Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 957db425d459..6ccbabd9a68d 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -28,6 +28,7 @@ #define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ +#define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */ /* * Values for @coredump_mask in pidfd_info. -- cgit v1.2.3 From dfd78546c95330db2252e0d7e937a15ab5eddb4e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Oct 2025 09:45:50 +0100 Subject: pidfd: add a new supported_mask field Some of the future fields in struct pidfd_info can be optional. 
If the kernel has nothing to emit in that field, then it doesn't set the flag in the reply. This presents a problem: There is currently no way to know what mask flags the kernel supports since one can't always count on them being in the reply. Add a new PIDFD_INFO_SUPPORTED_MASK flag and field that the kernel can set in the reply. Userspace can use this to determine if the fields it requires from the kernel are supported. This also gives us a way to deprecate fields in the future, if that should become necessary. Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-5-ca449b7b7aa0@kernel.org Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 6ccbabd9a68d..e05caa0e00fe 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -26,9 +26,11 @@ #define PIDFD_INFO_CGROUPID (1UL << 2) /* Always returned if available, even if not requested */ #define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */ #define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */ +#define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ #define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */ +#define PIDFD_INFO_SIZE_VER2 80 /* sizeof third published struct */ /* * Values for @coredump_mask in pidfd_info. 
@@ -94,6 +96,7 @@ struct pidfd_info { __s32 exit_code; __u32 coredump_mask; __u32 __spare1; + __u64 supported_mask; /* Mask flags that this kernel supports */ }; #define PIDFS_IOCTL_MAGIC 0xFF -- cgit v1.2.3 From 036375522be8425874e9e0f907c7127e315c7a52 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 28 Oct 2025 09:45:53 +0100 Subject: pidfs: expose coredump signal Userspace needs access to the signal that caused the coredump before the coredumping process has been reaped. Expose it as part of the coredump information in struct pidfd_info. After the process has been reaped that info is also available as part of PIDFD_INFO_EXIT's exit_code field. Link: https://patch.msgid.link/20251028-work-coredump-signal-v1-8-ca449b7b7aa0@kernel.org Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index e05caa0e00fe..ea9a6811fc76 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -27,6 +27,7 @@ #define PIDFD_INFO_EXIT (1UL << 3) /* Only returned if requested. */ #define PIDFD_INFO_COREDUMP (1UL << 4) /* Only returned if requested. */ #define PIDFD_INFO_SUPPORTED_MASK (1UL << 5) /* Want/got supported mask flags */ +#define PIDFD_INFO_COREDUMP_SIGNAL (1UL << 6) /* Always returned if PIDFD_INFO_COREDUMP is requested. 
*/ #define PIDFD_INFO_SIZE_VER0 64 /* sizeof first published struct */ #define PIDFD_INFO_SIZE_VER1 72 /* sizeof second published struct */ @@ -94,8 +95,10 @@ struct pidfd_info { __u32 fsuid; __u32 fsgid; __s32 exit_code; - __u32 coredump_mask; - __u32 __spare1; + struct /* coredump info */ { + __u32 coredump_mask; + __u32 coredump_signal; + }; __u64 supported_mask; /* Mask flags that this kernel supports */ }; -- cgit v1.2.3 From 3c0c81de525d2a2718e23754a5795483167904ac Mon Sep 17 00:00:00 2001 From: Markus Theil Date: Tue, 11 Feb 2025 07:33:32 +0100 Subject: prandom: remove next_pseudo_random32 next_pseudo_random32 implements a LCG with known bad statistical properties and was only used in two pieces of testing code. With no remaining users now, remove it. Signed-off-by: Markus Theil Reviewed-by: Krzysztof Karas Signed-off-by: Jason A. Donenfeld --- include/linux/prandom.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b3d6..ff7dcc3fa105 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -47,10 +47,4 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) state->s4 = __seed(i, 128U); } -/* Pseudo random number generator from numerical recipes. */ -static inline u32 next_pseudo_random32(u32 seed) -{ - return seed * 1664525 + 1013904223; -} - #endif -- cgit v1.2.3 From 8e4ec90701efec7f2814c89b398d6d4272636814 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 20 Oct 2025 07:55:55 -1000 Subject: freezer: Clarify that only cgroup1 freezer uses PM freezer cgroup1 freezer piggybacks on the PM freezer, which inadvertently allowed userspace to produce uninterruptible tasks at will. To avoid the issue, cgroup2 freezer switched to a separate job control based mechanism. While this happened a long time ago, the code and comment haven't been updated making it confusing to people who aren't familiar with the history. 
Rename cgroup_freezing() to cgroup1_freezing() and update comments on top of freezing() and frozen() to clarify that cgroup2 freezer isn't covered by the PM freezer mechanism. Signed-off-by: Tejun Heo Suggested-by: Qu Wenruo Link: https://patch.msgid.link/aPZ3q6Hm865NicBC@slm.duckdns.org Signed-off-by: Rafael J. Wysocki --- include/linux/freezer.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 32884c9721e5..0a8c6c4d1a82 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -22,14 +22,18 @@ extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ extern unsigned int freeze_timeout_msecs; /* - * Check if a process has been frozen + * Check if a process has been frozen for PM or cgroup1 freezer. Note that + * cgroup2 freezer uses the job control mechanism and does not interact with + * the PM freezer. */ extern bool frozen(struct task_struct *p); extern bool freezing_slow_path(struct task_struct *p); /* - * Check if there is a request to freeze a process + * Check if there is a request to freeze a task from PM or cgroup1 freezer. + * Note that cgroup2 freezer uses the job control mechanism and does not + * interact with the PM freezer. 
*/ static inline bool freezing(struct task_struct *p) { @@ -63,9 +67,9 @@ extern bool freeze_task(struct task_struct *p); extern bool set_freezable(void); #ifdef CONFIG_CGROUP_FREEZER -extern bool cgroup_freezing(struct task_struct *task); +extern bool cgroup1_freezing(struct task_struct *task); #else /* !CONFIG_CGROUP_FREEZER */ -static inline bool cgroup_freezing(struct task_struct *task) +static inline bool cgroup1_freezing(struct task_struct *task) { return false; } -- cgit v1.2.3 From 11e15a6f3287711e637e208df7089c710cef82b5 Mon Sep 17 00:00:00 2001 From: Raviteja Laggyshetty Date: Fri, 26 Sep 2025 12:12:10 +0530 Subject: dt-bindings: interconnect: qcom: Drop QPIC_CORE IDs As with other SDX targets, the SDX75 QPIC BCM resource is also modeled as an RPMh clock in the clk-rpmh driver. However, for SDX75, this resource was also mistakenly described as an interconnect node. Hence, drop the QPIC interconnect IDs and let the clients use the clk-rpmh driver to vote for this resource. Even though this change is an ABI break, it is necessary to avoid describing the same resource provider in two different drivers, as it may lead to votes from clients overriding each other.
Fixes: 956329ec7c5e ("dt-bindings: interconnect: Add compatibles for SDX75") Signed-off-by: Raviteja Laggyshetty [mani: kept the QUP defines value unchanged] Signed-off-by: Manivannan Sadhasivam Acked-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20250926-sdx75-icc-v2-2-20d6820e455c@oss.qualcomm.com Signed-off-by: Georgi Djakov --- include/dt-bindings/interconnect/qcom,sdx75.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/interconnect/qcom,sdx75.h b/include/dt-bindings/interconnect/qcom,sdx75.h index e903f5f3dd8f..0e19ee8f1687 100644 --- a/include/dt-bindings/interconnect/qcom,sdx75.h +++ b/include/dt-bindings/interconnect/qcom,sdx75.h @@ -6,9 +6,7 @@ #ifndef __DT_BINDINGS_INTERCONNECT_QCOM_SDX75_H #define __DT_BINDINGS_INTERCONNECT_QCOM_SDX75_H -#define MASTER_QPIC_CORE 0 #define MASTER_QUP_CORE_0 1 -#define SLAVE_QPIC_CORE 2 #define SLAVE_QUP_CORE_0 3 #define MASTER_LLCC 0 -- cgit v1.2.3 From c9822fad8038870bb690543539c8e9ad5213b12f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:14 +0100 Subject: libfs: allow to specify s_d_flags Make it possible for pseudo filesystems to specify default dentry flags. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-1-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/pseudo_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pseudo_fs.h b/include/linux/pseudo_fs.h index 2503f7625d65..a651e60d9410 100644 --- a/include/linux/pseudo_fs.h +++ b/include/linux/pseudo_fs.h @@ -9,6 +9,7 @@ struct pseudo_fs_context { const struct xattr_handler * const *xattr; const struct dentry_operations *dops; unsigned long magic; + unsigned int s_d_flags; }; struct pseudo_fs_context *init_pseudo(struct fs_context *fc, -- cgit v1.2.3 From 85e1a7ec61d9829af5897da421eb135c6cc73e07 Mon Sep 17 00:00:00 2001 From: T Pratham Date: Wed, 22 Oct 2025 22:48:42 +0530 Subject: crypto: aead - Add support for on-stack AEAD req allocation This patch introduces infrastructure for allocating req objects on the stack for AEADs. The additions mirror the existing sync skcipher APIs. This can be used in cases where simple sync AEAD operations are being done. So allocating the request on the stack avoids possible out-of-memory errors. The struct crypto_sync_aead is a wrapper around crypto_aead and should be used in its place when sync only requests will be done on the stack. Correspondingly, the request should be allocated with SYNC_AEAD_REQUEST_ON_STACK(). Similar to sync_skcipher APIs, the new sync_aead APIs are wrappers around the regular aead APIs to facilitate sync only operations.
The following crypto APIs are added: - struct crypto_sync_aead - crypto_alloc_sync_aead() - crypto_free_sync_aead() - crypto_sync_aead_tfm() - crypto_sync_aead_setkey() - crypto_sync_aead_setauthsize() - crypto_sync_aead_authsize() - crypto_sync_aead_maxauthsize() - crypto_sync_aead_ivsize() - crypto_sync_aead_blocksize() - crypto_sync_aead_get_flags() - crypto_sync_aead_set_flags() - crypto_sync_aead_clear_flags() - crypto_sync_aead_reqtfm() - aead_request_set_sync_tfm() - SYNC_AEAD_REQUEST_ON_STACK() Signed-off-by: T Pratham Signed-off-by: Herbert Xu --- include/crypto/aead.h | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'include') diff --git a/include/crypto/aead.h b/include/crypto/aead.h index 0e8a41638678..8e66a1fa9c78 100644 --- a/include/crypto/aead.h +++ b/include/crypto/aead.h @@ -159,6 +159,21 @@ struct crypto_aead { struct crypto_tfm base; }; +struct crypto_sync_aead { + struct crypto_aead base; +}; + +#define MAX_SYNC_AEAD_REQSIZE 384 + +#define SYNC_AEAD_REQUEST_ON_STACK(name, _tfm) \ + char __##name##_desc[sizeof(struct aead_request) + \ + MAX_SYNC_AEAD_REQSIZE \ + ] CRYPTO_MINALIGN_ATTR; \ + struct aead_request *name = \ + (((struct aead_request *)__##name##_desc)->base.tfm = \ + crypto_sync_aead_tfm((_tfm)), \ + (void *)__##name##_desc) + static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_aead, base); @@ -180,11 +195,18 @@ static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) */ struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); +struct crypto_sync_aead *crypto_alloc_sync_aead(const char *alg_name, u32 type, u32 mask); + static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) { return &tfm->base; } +static inline struct crypto_tfm *crypto_sync_aead_tfm(struct crypto_sync_aead *tfm) +{ + return crypto_aead_tfm(&tfm->base); +} + /** * crypto_free_aead() -
zeroize and free aead handle * @tfm: cipher handle to be freed @@ -196,6 +218,11 @@ static inline void crypto_free_aead(struct crypto_aead *tfm) crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm)); } +static inline void crypto_free_sync_aead(struct crypto_sync_aead *tfm) +{ + crypto_free_aead(&tfm->base); +} + /** * crypto_has_aead() - Search for the availability of an aead. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the @@ -238,6 +265,11 @@ static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm) return crypto_aead_alg_ivsize(crypto_aead_alg(tfm)); } +static inline unsigned int crypto_sync_aead_ivsize(struct crypto_sync_aead *tfm) +{ + return crypto_aead_ivsize(&tfm->base); +} + /** * crypto_aead_authsize() - obtain maximum authentication data size * @tfm: cipher handle @@ -255,6 +287,11 @@ static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm) return tfm->authsize; } +static inline unsigned int crypto_sync_aead_authsize(struct crypto_sync_aead *tfm) +{ + return crypto_aead_authsize(&tfm->base); +} + static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg) { return alg->maxauthsize; @@ -265,6 +302,11 @@ static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead) return crypto_aead_alg_maxauthsize(crypto_aead_alg(aead)); } +static inline unsigned int crypto_sync_aead_maxauthsize(struct crypto_sync_aead *tfm) +{ + return crypto_aead_maxauthsize(&tfm->base); +} + /** * crypto_aead_blocksize() - obtain block size of cipher * @tfm: cipher handle @@ -280,6 +322,11 @@ static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm) return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm)); } +static inline unsigned int crypto_sync_aead_blocksize(struct crypto_sync_aead *tfm) +{ + return crypto_aead_blocksize(&tfm->base); +} + static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm) { return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm)); @@ -300,6 +347,21 @@ 
static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags); } +static inline u32 crypto_sync_aead_get_flags(struct crypto_sync_aead *tfm) +{ + return crypto_aead_get_flags(&tfm->base); +} + +static inline void crypto_sync_aead_set_flags(struct crypto_sync_aead *tfm, u32 flags) +{ + crypto_aead_set_flags(&tfm->base, flags); +} + +static inline void crypto_sync_aead_clear_flags(struct crypto_sync_aead *tfm, u32 flags) +{ + crypto_aead_clear_flags(&tfm->base, flags); +} + /** * crypto_aead_setkey() - set key for cipher * @tfm: cipher handle @@ -319,6 +381,12 @@ static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); +static inline int crypto_sync_aead_setkey(struct crypto_sync_aead *tfm, + const u8 *key, unsigned int keylen) +{ + return crypto_aead_setkey(&tfm->base, key, keylen); +} + /** * crypto_aead_setauthsize() - set authentication data size * @tfm: cipher handle @@ -331,11 +399,24 @@ int crypto_aead_setkey(struct crypto_aead *tfm, */ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize); +static inline int crypto_sync_aead_setauthsize(struct crypto_sync_aead *tfm, + unsigned int authsize) +{ + return crypto_aead_setauthsize(&tfm->base, authsize); +} + static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) { return __crypto_aead_cast(req->base.tfm); } +static inline struct crypto_sync_aead *crypto_sync_aead_reqtfm(struct aead_request *req) +{ + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + + return container_of(tfm, struct crypto_sync_aead, base); +} + /** * crypto_aead_encrypt() - encrypt plaintext * @req: reference to the aead_request handle that holds all information @@ -417,6 +498,12 @@ static inline void aead_request_set_tfm(struct aead_request *req, req->base.tfm = crypto_aead_tfm(tfm); } +static inline void 
aead_request_set_sync_tfm(struct aead_request *req, + struct crypto_sync_aead *tfm) +{ + aead_request_set_tfm(req, &tfm->base); +} + /** * aead_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request -- cgit v1.2.3 From 12ad5b2346f905a3962b4aee701191b7a8d1905a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 23 Oct 2025 19:48:11 +0200 Subject: keys: Annotate struct asymmetric_key_id with __counted_by Add the __counted_by() compiler attribute to the flexible array member 'data' to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Signed-off-by: Thorsten Blum Reviewed-by: Lukas Wunner Reviewed-by: Jarkko Sakkinen Signed-off-by: Herbert Xu --- include/keys/asymmetric-type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/keys/asymmetric-type.h b/include/keys/asymmetric-type.h index 69a13e1e5b2e..1b91c8f98688 100644 --- a/include/keys/asymmetric-type.h +++ b/include/keys/asymmetric-type.h @@ -49,7 +49,7 @@ enum asymmetric_payload_bits { */ struct asymmetric_key_id { unsigned short len; - unsigned char data[]; + unsigned char data[] __counted_by(len); }; struct asymmetric_key_ids { -- cgit v1.2.3 From 6568f14cb5ae68cd6c612604ca0c89301cf3a0d0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 30 Oct 2025 18:01:54 -0700 Subject: vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN An ftrace warning was reported in ftrace_init_ool_stub(): WARNING: arch/powerpc/kernel/trace/ftrace.c:234 at ftrace_init_ool_stub+0x188/0x3f4, CPU#0: swapper/0 The problem is that the linker script is placing .text.startup in .text rather than in .init.text, due to an inadvertent match of the TEXT_MAIN '.text.[0-9a-zA-Z_]*' pattern. This bug existed for some configurations before, but is only now coming to light due to the TEXT_MAIN macro unification in commit 1ba9f8979426 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros"). 
The .text.startup section consists of constructors which are used by KASAN, KCSAN, and GCOV. The constructors are only called during boot, so .text.startup is supposed to match the INIT_TEXT pattern so it can be placed in .init.text and freed after init. But since INIT_TEXT comes *after* TEXT_MAIN in the linker script, TEXT_MAIN needs to manually exclude .text.startup. Update TEXT_MAIN to exclude .text.startup (and its .text.startup.* variant from -ffunction-sections), along with .text.exit and .text.exit.* which should match EXIT_TEXT. Specifically, use a series of more specific glob patterns to match generic .text.* sections (for -ffunction-sections) while explicitly excluding .text.startup[.*] and .text.exit[.*]. Also update INIT_TEXT and EXIT_TEXT to explicitly match their -ffunction-sections variants (.text.startup.* and .text.exit.*). Fixes: 1ba9f8979426 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros") Closes: https://lore.kernel.org/72469502-ca37-4287-90b9-a751cecc498c@linux.ibm.com Reported-by: Venkat Rao Bagalkote Debugged-by: Hari Bathini Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Tested-by: Venkat Rao Bagalkote Link: https://patch.msgid.link/07f74b4e5c43872572b7def30f2eac45f28675d9.1761872421.git.jpoimboe@kernel.org --- include/asm-generic/vmlinux.lds.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5facbc994634..9de1d900fa15 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -88,13 +88,29 @@ /* * Support -ffunction-sections by matching .text and .text.*, - * but exclude '.text..*'. + * but exclude '.text..*', .text.startup[.*], and .text.exit[.*]. * - * Special .text.* sections that are typically grouped separately, such as + * .text.startup and .text.startup.* are matched later by INIT_TEXT. 
+ * .text.exit and .text.exit.* are matched later by EXIT_TEXT. + * + * Other .text.* sections that are typically grouped separately, such as * .text.unlikely or .text.hot, must be matched explicitly before using * TEXT_MAIN. */ -#define TEXT_MAIN .text .text.[0-9a-zA-Z_]* +#define TEXT_MAIN \ + .text \ + .text.[_0-9A-Za-df-rt-z]* \ + .text.s[_0-9A-Za-su-z]* \ + .text.st[_0-9A-Zb-z]* \ + .text.sta[_0-9A-Za-qs-z]* \ + .text.star[_0-9A-Za-su-z]* \ + .text.start[_0-9A-Za-tv-z]* \ + .text.startu[_0-9A-Za-oq-z]* \ + .text.startup[_0-9A-Za-z]* \ + .text.e[_0-9A-Za-wy-z]* \ + .text.ex[_0-9A-Za-hj-z]* \ + .text.exi[_0-9A-Za-su-z]* \ + .text.exit[_0-9A-Za-z]* /* * Support -fdata-sections by matching .data, .data.*, and others, @@ -713,16 +729,16 @@ #define INIT_TEXT \ *(.init.text .init.text.*) \ - *(.text.startup) + *(.text.startup .text.startup.*) #define EXIT_DATA \ *(.exit.data .exit.data.*) \ *(.fini_array .fini_array.*) \ - *(.dtors .dtors.*) \ + *(.dtors .dtors.*) #define EXIT_TEXT \ *(.exit.text) \ - *(.text.exit) \ + *(.text.exit .text.exit.*) #define EXIT_CALL \ *(.exitcall.exit) -- cgit v1.2.3 From 4511fd86db6f8f94f8aff01044f5c69aa38f81f4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 Oct 2025 18:08:09 +0100 Subject: filemap: Add folio_next_pos() Replace the open-coded implementation in ocfs2 (which loses the top 32 bits on 32-bit architectures) with a helper in pagemap.h. 
Fixes: 35edec1d52c0 (ocfs2: update truncate handling of partial clusters) Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251024170822.1427218-2-willy@infradead.org Reviewed-by: Joseph Qi Reviewed-by: Christoph Hellwig Cc: Mark Fasheh Cc: Joel Becker Cc: Joseph Qi Cc: ocfs2-devel@lists.linux.dev Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..e16576e3763a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -941,6 +941,17 @@ static inline pgoff_t folio_next_index(const struct folio *folio) return folio->index + folio_nr_pages(folio); } +/** + * folio_next_pos - Get the file position of the next folio. + * @folio: The current folio. + * + * Return: The position of the folio which follows this folio in the file. + */ +static inline loff_t folio_next_pos(const struct folio *folio) +{ + return (loff_t)folio_next_index(folio) << PAGE_SHIFT; +} + /** * folio_file_page - The page for a particular index. * @folio: The folio which contains this index. -- cgit v1.2.3 From 4f6b0435c613fdb76d85bb4aae009309a8ce8784 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 29 Oct 2025 23:16:18 +0000 Subject: can: convert generic HW timestamp ioctl to ndo_hwtstamp callbacks CAN has a generic implementation of ndo_eth_ioctl which implements only HW timestamping commands. Implement generic ndo_hwtstamp callbacks and use them in drivers instead of the generic ioctl interface.
Signed-off-by: Vadim Fedorenko Reviewed-by: Kory Maincent Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20251029231620.1135640-2-vadim.fedorenko@linux.dev Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 0fe8f80f223e..bd7410b5d8a6 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -129,7 +129,11 @@ void close_candev(struct net_device *dev); void can_set_default_mtu(struct net_device *dev); int __must_check can_set_static_ctrlmode(struct net_device *dev, u32 static_mode); -int can_eth_ioctl_hwts(struct net_device *netdev, struct ifreq *ifr, int cmd); +int can_hwtstamp_get(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg); +int can_hwtstamp_set(struct net_device *netdev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack); int can_ethtool_op_get_ts_info_hwts(struct net_device *dev, struct kernel_ethtool_ts_info *info); -- cgit v1.2.3 From 7463f5ad36d8073a0e740433faf97f030d226398 Mon Sep 17 00:00:00 2001 From: Raviteja Laggyshetty Date: Fri, 31 Oct 2025 03:38:47 +0000 Subject: dt-bindings: interconnect: document the RPMh Network-On-Chip interconnect in Kaanapali SoC Document the RPMh Network-On-Chip Interconnect of the Kaanapali platform. 
Co-developed-by: Odelu Kukatla Signed-off-by: Odelu Kukatla Signed-off-by: Raviteja Laggyshetty Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20251031-knp-interconnect-v4-1-568bba2cb3e5@oss.qualcomm.com Signed-off-by: Georgi Djakov --- .../dt-bindings/interconnect/qcom,kaanapali-rpmh.h | 149 +++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h (limited to 'include') diff --git a/include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h b/include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h new file mode 100644 index 000000000000..dde3f9abd677 --- /dev/null +++ b/include/dt-bindings/interconnect/qcom,kaanapali-rpmh.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef __DT_BINDINGS_INTERCONNECT_QCOM_KAANAPALI_H +#define __DT_BINDINGS_INTERCONNECT_QCOM_KAANAPALI_H + +#define MASTER_QSPI_0 0 +#define MASTER_CRYPTO 1 +#define MASTER_QUP_1 2 +#define MASTER_SDCC_4 3 +#define MASTER_UFS_MEM 4 +#define MASTER_USB3 5 +#define MASTER_QUP_2 6 +#define MASTER_QUP_3 7 +#define MASTER_QUP_4 8 +#define MASTER_IPA 9 +#define MASTER_SOCCP_PROC 10 +#define MASTER_SP 11 +#define MASTER_QDSS_ETR 12 +#define MASTER_QDSS_ETR_1 13 +#define MASTER_SDCC_2 14 +#define SLAVE_A1NOC_SNOC 15 +#define SLAVE_A2NOC_SNOC 16 + +#define MASTER_QUP_CORE_0 0 +#define MASTER_QUP_CORE_1 1 +#define MASTER_QUP_CORE_2 2 +#define MASTER_QUP_CORE_3 3 +#define MASTER_QUP_CORE_4 4 +#define SLAVE_QUP_CORE_0 5 +#define SLAVE_QUP_CORE_1 6 +#define SLAVE_QUP_CORE_2 7 +#define SLAVE_QUP_CORE_3 8 +#define SLAVE_QUP_CORE_4 9 + +#define MASTER_CNOC_CFG 0 +#define SLAVE_AHB2PHY_SOUTH 1 +#define SLAVE_AHB2PHY_NORTH 2 +#define SLAVE_CAMERA_CFG 3 +#define SLAVE_CLK_CTL 4 +#define SLAVE_CRYPTO_0_CFG 5 +#define SLAVE_DISPLAY_CFG 6 +#define SLAVE_EVA_CFG 7 +#define SLAVE_GFX3D_CFG 8 +#define SLAVE_I2C 9 +#define 
SLAVE_I3C_IBI0_CFG 10 +#define SLAVE_I3C_IBI1_CFG 11 +#define SLAVE_IMEM_CFG 12 +#define SLAVE_IPC_ROUTER_CFG 13 +#define SLAVE_CNOC_MSS 14 +#define SLAVE_PCIE_CFG 15 +#define SLAVE_PRNG 16 +#define SLAVE_QDSS_CFG 17 +#define SLAVE_QSPI_0 18 +#define SLAVE_QUP_1 19 +#define SLAVE_QUP_2 20 +#define SLAVE_QUP_3 21 +#define SLAVE_QUP_4 22 +#define SLAVE_SDCC_2 23 +#define SLAVE_SDCC_4 24 +#define SLAVE_SPSS_CFG 25 +#define SLAVE_TCSR 26 +#define SLAVE_TLMM 27 +#define SLAVE_UFS_MEM_CFG 28 +#define SLAVE_USB3 29 +#define SLAVE_VENUS_CFG 30 +#define SLAVE_VSENSE_CTRL_CFG 31 +#define SLAVE_CNOC_MNOC_CFG 32 +#define SLAVE_PCIE_ANOC_CFG 33 +#define SLAVE_QDSS_STM 34 +#define SLAVE_TCU 35 + +#define MASTER_GEM_NOC_CNOC 0 +#define MASTER_GEM_NOC_PCIE_SNOC 1 +#define SLAVE_AOSS 2 +#define SLAVE_IPA_CFG 3 +#define SLAVE_IPC_ROUTER_FENCE 4 +#define SLAVE_SOCCP 5 +#define SLAVE_TME_CFG 6 +#define SLAVE_APPSS 7 +#define SLAVE_CNOC_CFG 8 +#define SLAVE_DDRSS_CFG 9 +#define SLAVE_BOOT_IMEM 10 +#define SLAVE_IMEM 11 +#define SLAVE_PCIE_0 12 + +#define MASTER_GPU_TCU 0 +#define MASTER_SYS_TCU 1 +#define MASTER_APPSS_PROC 2 +#define MASTER_GFX3D 3 +#define MASTER_LPASS_GEM_NOC 4 +#define MASTER_MSS_PROC 5 +#define MASTER_MNOC_HF_MEM_NOC 6 +#define MASTER_MNOC_SF_MEM_NOC 7 +#define MASTER_COMPUTE_NOC 8 +#define MASTER_ANOC_PCIE_GEM_NOC 9 +#define MASTER_QPACE 10 +#define MASTER_SNOC_SF_MEM_NOC 11 +#define MASTER_WLAN_Q6 12 +#define MASTER_GIC 13 +#define SLAVE_GEM_NOC_CNOC 14 +#define SLAVE_LLCC 15 +#define SLAVE_MEM_NOC_PCIE_SNOC 16 + +#define MASTER_LPIAON_NOC 0 +#define SLAVE_LPASS_GEM_NOC 1 + +#define MASTER_LPASS_LPINOC 0 +#define SLAVE_LPIAON_NOC_LPASS_AG_NOC 1 + +#define MASTER_LPASS_PROC 0 +#define SLAVE_LPICX_NOC_LPIAON_NOC 1 + +#define MASTER_LLCC 0 +#define SLAVE_EBI1 1 + +#define MASTER_CAMNOC_HF 0 +#define MASTER_CAMNOC_NRT_ICP_SF 1 +#define MASTER_CAMNOC_RT_CDM_SF 2 +#define MASTER_CAMNOC_SF 3 +#define MASTER_MDP 4 +#define MASTER_MDSS_DCP 5 +#define MASTER_CDSP_HCP 6 
+#define MASTER_VIDEO_CV_PROC 7 +#define MASTER_VIDEO_EVA 8 +#define MASTER_VIDEO_MVP 9 +#define MASTER_VIDEO_V_PROC 10 +#define MASTER_CNOC_MNOC_CFG 11 +#define SLAVE_MNOC_HF_MEM_NOC 12 +#define SLAVE_MNOC_SF_MEM_NOC 13 +#define SLAVE_SERVICE_MNOC 14 + +#define MASTER_CDSP_PROC 0 +#define SLAVE_CDSP_MEM_NOC 1 + +#define MASTER_PCIE_ANOC_CFG 0 +#define MASTER_PCIE_0 1 +#define SLAVE_ANOC_PCIE_GEM_NOC 2 +#define SLAVE_SERVICE_PCIE_ANOC 3 + +#define MASTER_A1NOC_SNOC 0 +#define MASTER_A2NOC_SNOC 1 +#define MASTER_APSS_NOC 2 +#define MASTER_CNOC_SNOC 3 +#define SLAVE_SNOC_GEM_NOC_SF 4 + +#endif -- cgit v1.2.3 From 0de4c70d04a46a3c266547dd4275ce25f623796a Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Thu, 25 Sep 2025 09:56:47 +0900 Subject: tracing: fprobe: use rhltable for fprobe_ip_table Currently, all the kernel functions that are hooked by the fprobe are added to the hash table "fprobe_ip_table". Its key is the function address, and its value is "struct fprobe_hlist_node". The budget of the hash table is FPROBE_IP_TABLE_SIZE, which is 256. This means the overhead of the hash table lookup will grow linearly if the count of the functions in the fprobe is more than 256. When we try to hook all the kernel functions, the overhead will be huge. Therefore, replace the hash table with rhltable to reduce the overhead. Link: https://lore.kernel.org/all/20250819031825.55653-1-dongml2@chinatelecom.cn/ Signed-off-by: Menglong Dong Signed-off-by: Masami Hiramatsu (Google) --- include/linux/fprobe.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fprobe.h b/include/linux/fprobe.h index 7964db96e41a..0a3bcd1718f3 100644 --- a/include/linux/fprobe.h +++ b/include/linux/fprobe.h @@ -7,6 +7,7 @@ #include #include #include +#include #include struct fprobe; @@ -26,7 +27,7 @@ typedef void (*fprobe_exit_cb)(struct fprobe *fp, unsigned long entry_ip, * @fp: The fprobe which owns this.
*/ struct fprobe_hlist_node { - struct hlist_node hlist; + struct rhlist_head hlist; unsigned long addr; struct fprobe *fp; }; -- cgit v1.2.3 From 68c4c159a0db4409a5d6b5f4703d71b89a96f06a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Oct 2025 14:30:32 +0000 Subject: genirq: Fix percpu_devid irq affinity documentation Stephen points out that some of the percpu_devid irq affinity documentation is either missing or not matching the data structures. Address all the issues in one go. Fixes: 87b0031f7f73 ("irqdomain: Add firmware info reporting interface") Fixes: 258e7d28a3dc ("genirq: Add affinity to percpu_devid interrupt requests") Reported-by: Stephen Rothwell Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251030143032.2035987-1-maz@kernel.org --- include/linux/interrupt.h | 1 + include/linux/irqdomain.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index fa62ab556ee3..266f2b39213a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -109,6 +109,7 @@ typedef irqreturn_t (*irq_handler_t)(int, void *); * @name: name of the device * @dev_id: cookie to identify the device * @percpu_dev_id: cookie to identify the device + * @affinity: CPUs this irqaction is allowed to run on * @next: pointer to the next irqaction for shared interrupts * @irq: interrupt number * @flags: flags (see IRQF_* above) diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 5907baf6099d..952d3c8dd6b7 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -48,7 +48,7 @@ struct irq_fwspec { * struct irq_fwspec_info - firmware provided IRQ information structure * * @flags: Information validity flags - * @cpumask: Affinity mask for this interrupt + * @affinity: Affinity mask for this interrupt * * This structure reports firmware-specific information about an * interrupt. 
The only significant information is the affinity of a -- cgit v1.2.3 From 54133f9b4b53ffa2204eb27cfc9d50072c9a52d2 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Wed, 29 Oct 2025 13:43:10 -0700 Subject: net: mana: Support HW link state events Handle the NIC hardware link state events received from the HW channel, then set the proper link state accordingly. And, add a feature bit, GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE, to inform the NIC hardware this handler exists. Our MANA NIC only sends out the link state down/up messages when we need to let the VM rerun DHCP client and change IP address. So, add netif_carrier_on() in the probe(), let the NIC show the right initial state in /sys/class/net/ethX/operstate. Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1761770601-16920-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/net/mana/gdma.h | 4 +++- include/net/mana/hw_channel.h | 2 ++ include/net/mana/mana.h | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 57df78cfbf82..637f42485dba 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -590,6 +590,7 @@ enum { /* Driver can self reset on FPGA Reconfig EQE notification */ #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) +#define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6) #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ @@ -599,7 +600,8 @@ enum { GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ - GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) + GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ + GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE) #define GDMA_DRV_CAP_FLAGS2 0 diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h index 83cf93338eb3..16feb39616c1 100644 --- a/include/net/mana/hw_channel.h +++ 
b/include/net/mana/hw_channel.h @@ -24,6 +24,8 @@ #define HWC_INIT_DATA_PF_DEST_CQ_ID 11 #define HWC_DATA_CFG_HWC_TIMEOUT 1 +#define HWC_DATA_HW_LINK_CONNECT 2 +#define HWC_DATA_HW_LINK_DISCONNECT 3 #define HW_CHANNEL_WAIT_RESOURCE_TIMEOUT_MS 30000 diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 0921485565c0..8906901535f5 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -477,6 +477,10 @@ struct mana_context { struct dentry *mana_eqs_debugfs; struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; + + /* Link state change work */ + struct work_struct link_change_work; + u32 link_event; }; struct mana_port_context { -- cgit v1.2.3 From 30176bf7c871681df506f3165ffe76ec462db991 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Wed, 29 Oct 2025 16:32:06 +0100 Subject: dpll: add phase-adjust-gran pin attribute Phase-adjust values are currently limited by a min-max range. Some hardware requires, for certain pin types, that values be multiples of a specific granularity, as in the zl3073x driver. Add a `phase-adjust-gran` pin attribute and an appropriate field in dpll_pin_properties. If set by the driver, use its value to validate user-provided phase-adjust values. 
Reviewed-by: Michal Schmidt Reviewed-by: Petr Oros Tested-by: Prathosh Satish Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Reviewed-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20251029153207.178448-2-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- include/linux/dpll.h | 1 + include/uapi/linux/dpll.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dpll.h b/include/linux/dpll.h index 25be745bf41f..562f520b23c2 100644 --- a/include/linux/dpll.h +++ b/include/linux/dpll.h @@ -163,6 +163,7 @@ struct dpll_pin_properties { u32 freq_supported_num; struct dpll_pin_frequency *freq_supported; struct dpll_pin_phase_adjust_range phase_range; + u32 phase_gran; }; #if IS_ENABLED(CONFIG_DPLL) diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index ab1725a954d7..69d35570ac4f 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -251,6 +251,7 @@ enum dpll_a_pin { DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED, DPLL_A_PIN_ESYNC_PULSE, DPLL_A_PIN_REFERENCE_SYNC, + DPLL_A_PIN_PHASE_ADJUST_GRAN, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit v1.2.3 From 652a86b24c5ac444afaf7625c9340d55aab7f105 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 31 Oct 2025 14:08:32 +0100 Subject: err.h: add INIT_ERR_PTR() macro Add INIT_ERR_PTR() macro to initialize static variables with error pointers. This might be useful for specific case where there is a static variable initialized to an error condition and then later set to the real handle once probe finish/completes. This is to handle compilation problems like: error: initializer element is not constant where ERR_PTR() can't be used. 
Signed-off-by: Christian Marangi Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20251031130835.7953-2-ansuelsmth@gmail.com [bjorn: Added () suffix on macro references] Signed-off-by: Bjorn Andersson --- include/linux/err.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/err.h b/include/linux/err.h index 1d60aa86db53..8c37be0620ab 100644 --- a/include/linux/err.h +++ b/include/linux/err.h @@ -41,6 +41,14 @@ static inline void * __must_check ERR_PTR(long error) return (void *) error; } +/** + * INIT_ERR_PTR - Init a const error pointer. + * @error: A negative error code. + * + * Like ERR_PTR(), but usable to initialize static variables. + */ +#define INIT_ERR_PTR(error) ((void *)(error)) + /* Return the pointer in the percpu address space. */ #define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error)) -- cgit v1.2.3 From 51d0656959bcdb743232f9b530b4cca569e74e7f Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Mon, 27 Oct 2025 13:59:31 +0100 Subject: genirq/manage: Reduce priority of forced secondary interrupt handler Crystal reports that the PCIe Advanced Error Reporting driver gets stuck in an infinite loop on PREEMPT_RT: Both the primary interrupt handler aer_irq() as well as the secondary handler aer_isr() are forced into threads with identical priority. Crystal writes that on the ARM system in question, the primary handler has to clear an error in the Root Error Status register... "before the next error happens, or else the hardware will set the Multiple ERR_COR Received bit. If that bit is set, then aer_isr() can't rely on the Error Source Identification register, so it scans through all devices looking for errors -- and for some reason, on this system, accessing the AER registers (or any Config Space above 0x400, even though there are capabilities located there) generates an Unsupported Request Error (but returns valid data). 
Since this happens more than once, without aer_irq() preempting, it causes another multi error and we get stuck in a loop." The issue does not show on non-PREEMPT_RT because the primary handler runs in hardirq context and thus can preempt the threaded secondary handler, clear the Root Error Status register and prevent the secondary handler from getting stuck. Emulate the same behavior on PREEMPT_RT by assigning a lower default priority to the secondary handler if the primary handler is forced into a thread. Reported-by: Crystal Wood Signed-off-by: Lukas Wunner Signed-off-by: Thomas Gleixner Tested-by: Crystal Wood Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/f6dcdb41be2694886b8dbf4fe7b3ab89e9d5114c.1761569303.git.lukas@wunner.de Closes: https://lore.kernel.org/r/20250902224441.368483-1-crwood@redhat.com/ --- include/linux/sched.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..cd6be74d87b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1901,6 +1901,7 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_para extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); +extern void sched_set_fifo_secondary(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); -- cgit v1.2.3 From 933ecf591275e850a46b28c6016d2688b92e23f6 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 1 Nov 2025 21:19:41 -0700 Subject: random: remove unused get_random_var_wait functions None of these functions are used, so remove them. 
This renders the two bugs moot: - get_random_u64_wait() used the wrong pointer type, making it provide only 32 bits. - The '#undef' directive used the wrong identifier, leaving the helper macro defined. Signed-off-by: Eric Biggers Signed-off-by: Jason A. Donenfeld --- include/linux/random.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/random.h b/include/linux/random.h index 333cecfca93f..8a8064dc3970 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -130,21 +130,6 @@ static inline int get_random_bytes_wait(void *buf, size_t nbytes) return ret; } -#define declare_get_random_var_wait(name, ret_type) \ - static inline int get_random_ ## name ## _wait(ret_type *out) { \ - int ret = wait_for_random_bytes(); \ - if (unlikely(ret)) \ - return ret; \ - *out = get_random_ ## name(); \ - return 0; \ - } -declare_get_random_var_wait(u8, u8) -declare_get_random_var_wait(u16, u16) -declare_get_random_var_wait(u32, u32) -declare_get_random_var_wait(u64, u32) -declare_get_random_var_wait(long, unsigned long) -#undef declare_get_random_var - #ifdef CONFIG_SMP int random_prepare_cpu(unsigned int cpu); int random_online_cpu(unsigned int cpu); -- cgit v1.2.3 From 8ed6b8842c44a4a716dfd536e7f13aff77039a02 Mon Sep 17 00:00:00 2001 From: Dzmitry Sankouski Date: Thu, 25 Sep 2025 22:09:56 +0300 Subject: power: supply: max77705_charger: implement aicl feature Adaptive input current allows charger to reduce its current consumption, when source is not able to provide enough power.
Signed-off-by: Dzmitry Sankouski Link: https://patch.msgid.link/20250925-max77705_77976_charger_improvement-v6-1-972c716c17d1@gmail.com Signed-off-by: Sebastian Reichel --- include/linux/power/max77705_charger.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/power/max77705_charger.h b/include/linux/power/max77705_charger.h index 6653abfdf747..b3950ce0625e 100644 --- a/include/linux/power/max77705_charger.h +++ b/include/linux/power/max77705_charger.h @@ -123,6 +123,8 @@ #define MAX77705_DISABLE_SKIP 1 #define MAX77705_AUTO_SKIP 0 +#define AICL_WORK_DELAY_MS 100 + /* uA */ #define MAX77705_CURRENT_CHGIN_STEP 25000 #define MAX77705_CURRENT_CHG_STEP 50000 -- cgit v1.2.3 From 3434be392051a2fdb295df3cfe07bf75235250a0 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 20 Oct 2025 10:38:14 +0000 Subject: scsi: target: Rename target_configure_unmap_from_queue() Rename target_configure_unmap_from_queue() to target_configure_unmap_from_bdev() since it now takes a bdev. Signed-off-by: Mike Christie Signed-off-by: John Garry Reviewed-by: John Garry Link: https://patch.msgid.link/20251020103820.2917593-2-john.g.garry@oracle.com Signed-off-by: Martin K. 
Petersen --- include/target/target_core_backend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 4063a701081b..d394306f8f49 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -121,8 +121,8 @@ sense_reason_t passthrough_parse_cdb(struct se_cmd *cmd, bool target_sense_desc_format(struct se_device *dev); sector_t target_to_linux_sector(struct se_device *dev, sector_t lb); -bool target_configure_unmap_from_queue(struct se_dev_attrib *attrib, - struct block_device *bdev); +bool target_configure_unmap_from_bdev(struct se_dev_attrib *attrib, + struct block_device *bdev); static inline bool target_dev_configured(struct se_device *se_dev) { -- cgit v1.2.3 From d505447b8d78f4d81a67d492ac72b8d3a1805e72 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 20 Oct 2025 10:38:15 +0000 Subject: scsi: target: Add atomic se_device fields Add atomic fields to the se_device and export them in configfs. Initially only target_core_iblock will be supported and we will inherit all the settings from the block layer. Signed-off-by: Mike Christie jpg: Stop being allowed to configure atomic write alignment, Signed-off-by: John Garry Link: https://patch.msgid.link/20251020103820.2917593-3-john.g.garry@oracle.com Signed-off-by: Martin K. 
Petersen --- include/target/target_core_base.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index c4d9116904aa..70ece58d3078 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -731,6 +731,11 @@ struct se_dev_attrib { u32 unmap_granularity; u32 unmap_granularity_alignment; u32 max_write_same_len; + u32 atomic_max_len; + u32 atomic_alignment; + u32 atomic_granularity; + u32 atomic_max_with_boundary; + u32 atomic_max_boundary; u8 submit_type; struct se_device *da_dev; struct config_group da_group; -- cgit v1.2.3 From c486634fe2b10301bd8f0319c70a919433bfdf17 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 20 Oct 2025 10:38:16 +0000 Subject: scsi: target: Add helper to set up atomic values from block_device Add a helper function that sets up the atomic value based on a block_device similar to what we do for unmap. Signed-off-by: Mike Christie jpg: Set atomic alignment, drop atomic_supported reference Signed-off-by: John Garry Link: https://patch.msgid.link/20251020103820.2917593-4-john.g.garry@oracle.com Signed-off-by: Martin K. 
Petersen --- include/target/target_core_backend.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index d394306f8f49..e32de80854b6 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -123,6 +123,8 @@ bool target_sense_desc_format(struct se_device *dev); sector_t target_to_linux_sector(struct se_device *dev, sector_t lb); bool target_configure_unmap_from_bdev(struct se_dev_attrib *attrib, struct block_device *bdev); +void target_configure_write_atomic_from_bdev(struct se_dev_attrib *attrib, + struct block_device *bdev); static inline bool target_dev_configured(struct se_device *se_dev) { -- cgit v1.2.3 From 526145725106b490b0c2d9f200b705b17a3da6b6 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Mon, 20 Oct 2025 10:38:17 +0000 Subject: scsi: target: Add WRITE_ATOMIC_16 handler Add the core LIO code to process the WRITE_ATOMIC_16 command. Signed-off-by: Mike Christie jpg: fix return code from sbc_check_atomic, reformat Signed-off-by: John Garry Link: https://patch.msgid.link/20251020103820.2917593-5-john.g.garry@oracle.com Signed-off-by: Martin K. 
Petersen --- include/target/target_core_base.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 70ece58d3078..56333b5726c8 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -158,6 +158,7 @@ enum se_cmd_flags_table { SCF_TASK_ATTR_SET = (1 << 17), SCF_TREAT_READ_AS_NORMAL = (1 << 18), SCF_TASK_ORDERED_SYNC = (1 << 19), + SCF_ATOMIC = (1 << 20), }; /* -- cgit v1.2.3 From 95aa2041c654161d1b5c1eca5379d67d91ef1cf2 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 17 Sep 2025 17:12:53 -0500 Subject: scsi: target: Fix LUN/device R/W and total command stats In commit 9cf2317b795d ("scsi: target: Move I/O path stats to per CPU") I saw we sometimes use %u and also misread the spec. As a result I thought all the stats were supposed to be 32-bit only. However, for the majority of cases we support currently, the spec specifies u64 bit stats. This patch converts the stats changed in the commit above to u64. Fixes: 9cf2317b795d ("scsi: target: Move I/O path stats to per CPU") Signed-off-by: Mike Christie Reviewed-by: Dmitry Bogdanov Link: https://patch.msgid.link/20250917221338.14813-2-michael.christie@oracle.com Signed-off-by: Martin K. 
Petersen --- include/target/target_core_base.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index c4d9116904aa..27e1f9d5f0c6 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -671,9 +671,9 @@ struct se_lun_acl { }; struct se_dev_entry_io_stats { - u32 total_cmds; - u32 read_bytes; - u32 write_bytes; + u64 total_cmds; + u64 read_bytes; + u64 write_bytes; }; struct se_dev_entry { @@ -806,9 +806,9 @@ struct se_device_queue { }; struct se_dev_io_stats { - u32 total_cmds; - u32 read_bytes; - u32 write_bytes; + u64 total_cmds; + u64 read_bytes; + u64 write_bytes; }; struct se_device { -- cgit v1.2.3 From bbb490053173b737604a87af03f2113fb1c279a0 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 17 Sep 2025 17:12:55 -0500 Subject: scsi: target: Move LUN stats to per-CPU The atomic use in the main I/O path is causing perf issues when using higher performance backend devices and multiple queues (more than 10 when using vhost-scsi) like with this fio workload: [global] bs=4K iodepth=128 direct=1 ioengine=libaio group_reporting time_based runtime=120 name=standard-iops rw=randread numjobs=16 cpus_allowed=0-15 To fix this issue, move the LUN stats to per CPU. Note: I forgot to include this patch with the delayed/ordered per CPU tracking and per device/device entry per CPU stats. With this patch you get the full 33% improvements when using fast backends, multiple queues and multiple IO submitters. Signed-off-by: Mike Christie Reviewed-by: Dmitry Bogdanov Link: https://patch.msgid.link/20250917221338.14813-4-michael.christie@oracle.com Signed-off-by: Martin K.
Petersen --- include/target/target_core_base.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 27e1f9d5f0c6..372da2eadf54 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -744,9 +744,9 @@ struct se_port_stat_grps { }; struct scsi_port_stats { - atomic_long_t cmd_pdus; - atomic_long_t tx_data_octets; - atomic_long_t rx_data_octets; + u64 cmd_pdus; + u64 tx_data_octets; + u64 rx_data_octets; }; struct se_lun { @@ -773,7 +773,7 @@ struct se_lun { spinlock_t lun_tg_pt_gp_lock; struct se_portal_group *lun_tpg; - struct scsi_port_stats lun_stats; + struct scsi_port_stats __percpu *lun_stats; struct config_group lun_group; struct se_port_stat_grps port_stat_grps; struct completion lun_shutdown_comp; -- cgit v1.2.3 From e21d451a82f39e91b7635c4fc3ff5ac082873ec3 Mon Sep 17 00:00:00 2001 From: Pierre Barre Date: Thu, 16 Oct 2025 15:58:36 +0200 Subject: 9p: Use kvmalloc for message buffers on supported transports While developing a 9P server (https://github.com/Barre/ZeroFS) and testing it under high-load, I was running into allocation failures. The failures occur even with plenty of free memory available because kmalloc requires contiguous physical memory. This results in errors like: ls: page allocation failure: order:7, mode:0x40c40(GFP_NOFS|__GFP_COMP) This patch introduces a transport capability flag (supports_vmalloc) that indicates whether a transport can work with vmalloc'd buffers (non-physically contiguous memory). Transports requiring DMA should leave this flag as false. The fd-based transports (tcp, unix, fd) set this flag to true, and p9_fcall_init will use kvmalloc instead of kmalloc for these transports. This allows the allocator to fall back to vmalloc when contiguous physical memory is not available. 
Additionally, if kmem_cache_alloc fails, the code falls back to kvmalloc for transports that support it. Signed-off-by: Pierre Barre Reviewed-by: Christian Schoenebeck Message-ID: Signed-off-by: Dominique Martinet --- include/net/9p/transport.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 766ec07c9599..f0981515148d 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -24,6 +24,9 @@ * we're less flexible when choosing the response message * size in this case * @def: set if this transport should be considered the default + * @supports_vmalloc: set if this transport can work with vmalloc'd buffers + * (non-physically contiguous memory). Transports requiring + * DMA should leave this as false. * @create: member function to create a new connection on this transport * @close: member function to discard a connection on this transport * @request: member function to issue a request to the transport @@ -44,6 +47,7 @@ struct p9_trans_module { int maxsize; /* max message size of transport */ bool pooled_rbuffers; int def; /* this transport should be default */ + bool supports_vmalloc; /* can work with vmalloc'd buffers */ struct module *owner; int (*create)(struct p9_client *client, const char *devname, char *args); -- cgit v1.2.3 From eeaf38a798aff6384983e5a0ac464d146de7ff55 Mon Sep 17 00:00:00 2001 From: Dominique Martinet Date: Fri, 31 Oct 2025 16:40:07 +0900 Subject: net/9p: cleanup: change p9_trans_module->def to bool '->def' is only ever used as a true/false flag Reported-by: Christophe JAILLET Message-ID: <20251103-v9fs_trans_def_bool-v1-1-f33dc7ed9e81@codewreck.org> Signed-off-by: Dominique Martinet --- include/net/9p/transport.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index f0981515148d..0aedabc9b7eb 100644 --- a/include/net/9p/transport.h +++ 
b/include/net/9p/transport.h @@ -46,7 +46,7 @@ struct p9_trans_module { char *name; /* name of transport */ int maxsize; /* max message size of transport */ bool pooled_rbuffers; - int def; /* this transport should be default */ + bool def; /* this transport should be default */ bool supports_vmalloc; /* can work with vmalloc'd buffers */ struct module *owner; int (*create)(struct p9_client *client, -- cgit v1.2.3 From 695f2ca1b4247724576d57eae7b74b90dc69ba3c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 10 Oct 2025 16:36:16 -0500 Subject: fs/fs_parse: add back fsparam_u32hex 296b67059 removed fsparam_u32hex because there were no callers (yet) and it didn't build due to using the nonexistent symbol fs_param_is_u32_hex. fs/9p will need this parser, so add it back with the appropriate fix (use fs_param_is_u32). Signed-off-by: Eric Sandeen Message-ID: <20251010214222.1347785-2-sandeen@redhat.com> Signed-off-by: Dominique Martinet --- include/linux/fs_parser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 5a0e897cae80..5e8a3b546033 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -120,6 +120,8 @@ static inline bool fs_validate_description(const char *name, #define fsparam_u32(NAME, OPT) __fsparam(fs_param_is_u32, NAME, OPT, 0, NULL) #define fsparam_u32oct(NAME, OPT) \ __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)8) +#define fsparam_u32hex(NAME, OPT) \ + __fsparam(fs_param_is_u32, NAME, OPT, 0, (void *)16) #define fsparam_s32(NAME, OPT) __fsparam(fs_param_is_s32, NAME, OPT, 0, NULL) #define fsparam_u64(NAME, OPT) __fsparam(fs_param_is_u64, NAME, OPT, 0, NULL) #define fsparam_enum(NAME, OPT, array) __fsparam(fs_param_is_enum, NAME, OPT, 0, array) -- cgit v1.2.3 From c44393d84149d6fc91d94fa39321c9657e91b388 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 10 Oct 2025 16:36:17 -0500 Subject: net/9p: move structures and macros to header files 
With the new mount API all option parsing will need to happen in fs/v9fs.c, so move some existing data structures and macros to header files to facilitate this. Rename some to reflect the transport they are used for (rdma, fd, etc), for clarity. Signed-off-by: Eric Sandeen Message-ID: <20251010214222.1347785-3-sandeen@redhat.com> Signed-off-by: Dominique Martinet --- include/net/9p/client.h | 6 ++++++ include/net/9p/transport.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/include/net/9p/client.h b/include/net/9p/client.h index 4f785098c67a..2d46f8017bd5 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -16,6 +16,12 @@ /* Number of requests per row */ #define P9_ROW_MAXTAG 255 +/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + + * room for write (16 extra) or read (11 extra) operands. + */ + +#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) + /** enum p9_proto_versions - 9P protocol versions * @p9_proto_legacy: 9P Legacy mode, pre-9P2000.u * @p9_proto_2000u: 9P2000.u extension diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 0aedabc9b7eb..db6ad369a171 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -14,6 +14,45 @@ #define P9_DEF_MIN_RESVPORT (665U) #define P9_DEF_MAX_RESVPORT (1023U) +#define P9_FD_PORT 564 + +#define P9_RDMA_PORT 5640 +#define P9_RDMA_SQ_DEPTH 32 +#define P9_RDMA_RQ_DEPTH 32 +#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ + +/** + * struct p9_fd_opts - per-transport options for fd transport + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * @privport: port is privileged + */ + +struct p9_fd_opts { + int rfd; + int wfd; + u16 port; + bool privport; +}; + +/** + * struct p9_rdma_opts - Collection of mount options for rdma transport + * @port: port of connection + * @privport: Whether a privileged port may be 
used + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + bool privport; + int sq_depth; + int rq_depth; + long timeout; +}; + /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports -- cgit v1.2.3 From 075e8bd4127f007910fc302ad5c3c471d0be4799 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 10 Oct 2025 16:36:18 -0500 Subject: 9p: create a v9fs_context structure to hold parsed options This patch creates a new v9fs_context structure which includes new p9_session_opts and p9_client_opts structures, as well as re-using the existing p9_fd_opts and p9_rdma_opts to store options during parsing. The new structure will be used in the next commit to pass all parsed options to the appropriate transports. 
Signed-off-by: Eric Sandeen Message-ID: <20251010214222.1347785-4-sandeen@redhat.com> Signed-off-by: Dominique Martinet --- include/net/9p/client.h | 90 ++++++++++++++++++++++++++++++++++++++++++++++ include/net/9p/transport.h | 32 ----------------- 2 files changed, 90 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/net/9p/client.h b/include/net/9p/client.h index 2d46f8017bd5..cc18443f7d51 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -132,6 +132,96 @@ struct p9_client { char name[__NEW_UTS_LEN + 1]; }; +/** + * struct p9_fd_opts - holds client options during parsing + * @msize: maximum data size negotiated by protocol + * @prot-Oversion: 9P protocol version to use + * @trans_mod: module API instantiated with this client + * + * These parsed options get transferred into client in + * apply_client_options() + */ +struct p9_client_opts { + unsigned int msize; + unsigned char proto_version; + struct p9_trans_module *trans_mod; +}; + +/** + * struct p9_fd_opts - per-transport options for fd transport + * @rfd: file descriptor for reading (trans=fd) + * @wfd: file descriptor for writing (trans=fd) + * @port: port to connect to (trans=tcp) + * @privport: port is privileged + */ +struct p9_fd_opts { + int rfd; + int wfd; + u16 port; + bool privport; +}; + +/** + * struct p9_rdma_opts - Collection of mount options for rdma transport + * @port: port of connection + * @privport: Whether a privileged port may be used + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. 
Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + bool privport; + int sq_depth; + int rq_depth; + long timeout; +}; + +/** + * struct p9_session_opts - holds parsed options for v9fs_session_info + * @flags: session options of type &p9_session_flags + * @nodev: set to 1 to disable device mapping + * @debug: debug level + * @afid: authentication handle + * @cache: cache mode of type &p9_cache_bits + * @cachetag: the tag of the cache associated with this session + * @uname: string user name to mount hierarchy as + * @aname: mount specifier for remote hierarchy + * @dfltuid: default numeric userid to mount hierarchy as + * @dfltgid: default numeric groupid to mount hierarchy as + * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy + * @session_lock_timeout: retry interval for blocking locks + * + * This strucure holds options which are parsed and will be transferred + * to the v9fs_session_info structure when mounted, and therefore largely + * duplicates struct v9fs_session_info. 
+ */ +struct p9_session_opts { + unsigned int flags; + unsigned char nodev; + unsigned short debug; + unsigned int afid; + unsigned int cache; +#ifdef CONFIG_9P_FSCACHE + char *cachetag; +#endif + char *uname; + char *aname; + kuid_t dfltuid; + kgid_t dfltgid; + kuid_t uid; + long session_lock_timeout; +}; + +/* Used by mount API to store parsed mount options */ +struct v9fs_context { + struct p9_client_opts client_opts; + struct p9_fd_opts fd_opts; + struct p9_rdma_opts rdma_opts; + struct p9_session_opts session_opts; +}; + /** * struct p9_fid - file system entity handle * @clnt: back pointer to instantiating &p9_client diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index db6ad369a171..898a432a8063 100644 --- a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -21,38 +21,6 @@ #define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ -/** - * struct p9_fd_opts - per-transport options for fd transport - * @rfd: file descriptor for reading (trans=fd) - * @wfd: file descriptor for writing (trans=fd) - * @port: port to connect to (trans=tcp) - * @privport: port is privileged - */ - -struct p9_fd_opts { - int rfd; - int wfd; - u16 port; - bool privport; -}; - -/** - * struct p9_rdma_opts - Collection of mount options for rdma transport - * @port: port of connection - * @privport: Whether a privileged port may be used - * @sq_depth: The requested depth of the SQ. This really doesn't need - * to be any deeper than the number of threads used in the client - * @rq_depth: The depth of the RQ. 
Should be greater than or equal to SQ depth - * @timeout: Time to wait in msecs for CM events - */ -struct p9_rdma_opts { - short port; - bool privport; - int sq_depth; - int rq_depth; - long timeout; -}; - /** * struct p9_trans_module - transport module interface * @list: used to maintain a list of currently available transports -- cgit v1.2.3 From 1f3e4142c0eb178089ea0cbc97506a061470ad27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 10 Oct 2025 16:36:19 -0500 Subject: 9p: convert to the new mount API Convert 9p to the new mount API. This patch consolidates all parsing into fs/9p/v9fs.c, which stores all results into a filesystem context which can be passed to the various transports as needed. Some of the parsing helper functions such as get_cache_mode() have been eliminated in favor of using the new mount API's enum param type, for simplicity. Signed-off-by: Eric Sandeen Message-ID: <20251010214222.1347785-5-sandeen@redhat.com> [ Dominique: handled source explicitly as per follow-up discussion ] Signed-off-by: Dominique Martinet --- include/net/9p/client.h | 2 +- include/net/9p/transport.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/9p/client.h b/include/net/9p/client.h index cc18443f7d51..838a94218b59 100644 --- a/include/net/9p/client.h +++ b/include/net/9p/client.h @@ -279,7 +279,7 @@ int p9_client_rename(struct p9_fid *fid, struct p9_fid *newdirfid, const char *name); int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, struct p9_fid *newdirfid, const char *new_name); -struct p9_client *p9_client_create(const char *dev_name, char *options); +struct p9_client *p9_client_create(struct fs_context *fc); void p9_client_destroy(struct p9_client *clnt); void p9_client_disconnect(struct p9_client *clnt); void p9_client_begin_disconnect(struct p9_client *clnt); diff --git a/include/net/9p/transport.h b/include/net/9p/transport.h index 898a432a8063..a912bbaa862f 100644 --- 
a/include/net/9p/transport.h +++ b/include/net/9p/transport.h @@ -57,7 +57,7 @@ struct p9_trans_module { bool supports_vmalloc; /* can work with vmalloc'd buffers */ struct module *owner; int (*create)(struct p9_client *client, - const char *devname, char *args); + struct fs_context *fc); void (*close)(struct p9_client *client); int (*request)(struct p9_client *client, struct p9_req_t *req); int (*cancel)(struct p9_client *client, struct p9_req_t *req); -- cgit v1.2.3 From c95de73da12bf4586b7bcd6b23a6968c21991cc7 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Oct 2025 22:41:18 -0700 Subject: mtd: spear_smi: fix kernel-doc warnings Correct most kernel-doc warnings in include/linux/mtd/spear_smi.h by adding a leading '@' to the description of struct members. Add a new description for the missing @np member. Warning: spear_smi.h:48 struct member 'name' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'mem_base' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'size' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'partitions' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'nr_partitions' not described in 'spear_smi_flash_info' Warning: spear_smi.h:48 struct member 'fast_mode' not described in 'spear_smi_flash_info' Warning: spear_smi.h:62 struct member 'clk_rate' not described in 'spear_smi_plat_data' Warning: spear_smi.h:62 struct member 'num_flashes' not described in 'spear_smi_plat_data' Warning: spear_smi.h:62 struct member 'board_flash_info' not described in 'spear_smi_plat_data' Warning: spear_smi.h:62 struct member 'np' not described in 'spear_smi_plat_data' Signed-off-by: Randy Dunlap Signed-off-by: Miquel Raynal --- include/linux/mtd/spear_smi.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mtd/spear_smi.h b/include/linux/mtd/spear_smi.h index 
581603ac1277..871634862627 100644 --- a/include/linux/mtd/spear_smi.h +++ b/include/linux/mtd/spear_smi.h @@ -31,12 +31,12 @@ * struct spear_smi_flash_info - platform structure for passing flash * information * - * name: name of the serial nor flash for identification - * mem_base: the memory base on which the flash is mapped - * size: size of the flash in bytes - * partitions: parition details - * nr_partitions: number of partitions - * fast_mode: whether flash supports fast mode + * @name: name of the serial nor flash for identification + * @mem_base: the memory base on which the flash is mapped + * @size: size of the flash in bytes + * @partitions: parition details + * @nr_partitions: number of partitions + * @fast_mode: whether flash supports fast mode */ struct spear_smi_flash_info { @@ -51,9 +51,10 @@ struct spear_smi_flash_info { /** * struct spear_smi_plat_data - platform structure for configuring smi * - * clk_rate: clk rate at which SMI must operate - * num_flashes: number of flashes present on board - * board_flash_info: specific details of each flash present on board + * @clk_rate: clk rate at which SMI must operate + * @num_flashes: number of flashes present on board + * @board_flash_info: specific details of each flash present on board + * @np: array of DT node pointers for all possible flash chip devices */ struct spear_smi_plat_data { unsigned long clk_rate; -- cgit v1.2.3 From c3d78c34ad009a7cce57ae5b5c93e1bd03bb31a3 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 22 Sep 2025 11:30:10 +0800 Subject: perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores CPU_CYCLES is expected to count the logical CPU (PE) clock. Currently it's preferred to use PMCCNTR_EL0 for counting CPU_CYCLES, but it'll count processor clock rather than the PE clock (ARM DDI0487 L.b D13.1.3) if one of the SMT siblings is not idle on a multi-threaded implementation. So don't use it on SMT cores. 
Introduce topology_core_has_smt() to detect the SMT implementation and cache it in arm_pmu::has_smt during allocation. When counting cycles on SMT CPU 2-3 and CPU 3 is idle, without this patch we'll get: [root@client1 tmp]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1 --taskset 2 --timeout 1 [...] Performance counter stats for 'CPU(s) 2-3': CPU2 2880457316 cycles CPU3 2880459810 cycles 1.254688470 seconds time elapsed With this patch the idle state of CPU3 is observed as expected: [root@client1 ~]# perf stat -e cycles -A -C 2-3 -- stress-ng -c 1 --taskset 2 --timeout 1 [...] Performance counter stats for 'CPU(s) 2-3': CPU2 2558580492 cycles CPU3 305749 cycles 1.113626410 seconds time elapsed Signed-off-by: Yicong Yang Signed-off-by: Will Deacon --- include/linux/arch_topology.h | 11 +++++++++++ include/linux/perf/arm_pmu.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d72d6e5aa200..daa1af2e8204 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -89,6 +89,17 @@ void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); void freq_inv_set_max_ratio(int cpu, u64 max_rate); + +/* + * Architectures like ARM64 don't have reliable architectural way to get SMT + * information and depend on the firmware (ACPI/OF) report. Non-SMT core won't + * initialize thread_id so we can use this to detect the SMT implementation.
+ */ +static inline bool topology_core_has_smt(int cpu) +{ + return cpu_topology[cpu].thread_id != -1; +} + #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 93c9a26492fc..2d39322c40c4 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -119,6 +119,7 @@ struct arm_pmu { /* PMUv3 only */ int pmuver; + bool has_smt; u64 reg_pmmir; u64 reg_brbidr; #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 -- cgit v1.2.3 From 56d9df41ef1847ed0523f57ec6117649d581401d Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Sat, 1 Nov 2025 01:45:04 +0100 Subject: rtc: ds1685: stop setting max_user_freq max_user_freq has not been related to the hardware RTC since commit 6610e0893b8b ("RTC: Rework RTC code to use timerqueue for events"). Stop setting it from individual driver to avoid confusing new contributors. Acked-by: Joshua Kinard Link: https://patch.msgid.link/20251101-max_user_freq-v1-2-c9a274fd6883@bootlin.com Signed-off-by: Alexandre Belloni --- include/linux/rtc/ds1685.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/rtc/ds1685.h b/include/linux/rtc/ds1685.h index 01da4582db6d..8ec0ebfaef04 100644 --- a/include/linux/rtc/ds1685.h +++ b/include/linux/rtc/ds1685.h @@ -324,7 +324,6 @@ struct ds1685_rtc_platform_data { #define RTC_SQW_2HZ 0x0f /* 0 1 1 1 1 */ #define RTC_SQW_0HZ 0x00 /* 0 0 0 0 0 */ #define RTC_SQW_32768HZ 32768 /* 1 - - - - */ -#define RTC_MAX_USER_FREQ 8192 /* -- cgit v1.2.3 From 3eb6660f26d13acdbcb9241ac3e95d44419f2284 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 29 Oct 2025 10:40:52 +0100 Subject: uaccess: Provide ASM GOTO safe wrappers for unsafe_*_user() ASM GOTO is miscompiled by GCC when it is used inside a auto cleanup scope: bool foo(u32 __user *p, u32 val) { scoped_guard(pagefault) unsafe_put_user(val, p, efault); return true; efault: return false; } e80: e8 00 00 00 00 call e85 e85: 65 48 8b 05 00 00 00 00 
mov %gs:0x0(%rip),%rax e8d: 83 80 04 14 00 00 01 addl $0x1,0x1404(%rax) // pf_disable++ e94: 89 37 mov %esi,(%rdi) e96: 83 a8 04 14 00 00 01 subl $0x1,0x1404(%rax) // pf_disable-- e9d: b8 01 00 00 00 mov $0x1,%eax // success ea2: e9 00 00 00 00 jmp ea7 // ret ea7: 31 c0 xor %eax,%eax // fail ea9: e9 00 00 00 00 jmp eae // ret which is broken as it leaks the pagefault disable counter on failure. Clang at least fails the build. Linus suggested to add a local label into the macro scope and let that jump to the actual caller supplied error label. __label__ local_label; \ arch_unsafe_get_user(x, ptr, local_label); \ if (0) { \ local_label: \ goto label; \ That works for both GCC and clang. clang: c80: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) c85: 65 48 8b 0c 25 00 00 00 00 mov %gs:0x0,%rcx c8e: ff 81 04 14 00 00 incl 0x1404(%rcx) // pf_disable++ c94: 31 c0 xor %eax,%eax // set retval to false c96: 89 37 mov %esi,(%rdi) // write c98: b0 01 mov $0x1,%al // set retval to true c9a: ff 89 04 14 00 00 decl 0x1404(%rcx) // pf_disable-- ca0: 2e e9 00 00 00 00 cs jmp ca6 // ret The exception table entry points correctly to c9a GCC: f70: e8 00 00 00 00 call f75 f75: 65 48 8b 05 00 00 00 00 mov %gs:0x0(%rip),%rax f7d: 83 80 04 14 00 00 01 addl $0x1,0x1404(%rax) // pf_disable++ f84: 8b 17 mov (%rdi),%edx f86: 89 16 mov %edx,(%rsi) f88: 83 a8 04 14 00 00 01 subl $0x1,0x1404(%rax) // pf_disable-- f8f: b8 01 00 00 00 mov $0x1,%eax // success f94: e9 00 00 00 00 jmp f99 // ret f99: 83 a8 04 14 00 00 01 subl $0x1,0x1404(%rax) // pf_disable-- fa0: 31 c0 xor %eax,%eax // fail fa2: e9 00 00 00 00 jmp fa7 // ret The exception table entry points correctly to f99 So both compilers optimize out the extra goto and emit correct and efficient code. Provide a generic wrapper to do that to avoid modifying all the affected architecture specific implementation with that workaround. The only change required for architectures is to rename unsafe_*_user() to arch_unsafe_*_user(). 
That's done in subsequent changes. Suggested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/877bweujtn.ffs@tglx --- include/linux/uaccess.h | 72 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 68 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..8aa82b1d6013 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -518,7 +518,34 @@ long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr, long count); long strnlen_user_nofault(const void __user *unsafe_addr, long count); -#ifndef __get_kernel_nofault +#ifdef arch_get_kernel_nofault +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. 
+ */ +#define __get_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_get_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define __put_kernel_nofault(dst, src, type, label) \ +do { \ + __label__ local_label; \ + arch_put_kernel_nofault(dst, src, type, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#elif !defined(__get_kernel_nofault) /* arch_get_kernel_nofault */ + #define __get_kernel_nofault(dst, src, type, label) \ do { \ type __user *p = (type __force __user *)(src); \ @@ -535,7 +562,8 @@ do { \ if (__put_user(data, p)) \ goto label; \ } while (0) -#endif + +#endif /* !__get_kernel_nofault */ /** * get_kernel_nofault(): safely attempt to read from a location @@ -549,7 +577,42 @@ do { \ copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\ }) -#ifndef user_access_begin +#ifdef user_access_begin + +#ifdef arch_unsafe_get_user +/* + * Wrap the architecture implementation so that @label can be outside of a + * cleanup() scope. A regular C goto works correctly, but ASM goto does + * not. Clang rejects such an attempt, but GCC silently emits buggy code. + * + * Some architectures use internal local labels already, but this extra + * indirection here is harmless because the compiler optimizes it out + * completely in any case. This construct just ensures that the ASM GOTO + * target is always in the local scope. The C goto 'label' works correctly + * when leaving a cleanup() scope. 
+ */ +#define unsafe_get_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_get_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) + +#define unsafe_put_user(x, ptr, label) \ +do { \ + __label__ local_label; \ + arch_unsafe_put_user(x, ptr, local_label); \ + if (0) { \ + local_label: \ + goto label; \ + } \ +} while (0) +#endif /* arch_unsafe_get_user */ + +#else /* user_access_begin */ #define user_access_begin(ptr,len) access_ok(ptr, len) #define user_access_end() do { } while (0) #define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) @@ -559,7 +622,8 @@ do { \ #define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e) static inline unsigned long user_access_save(void) { return 0UL; } static inline void user_access_restore(unsigned long flags) { } -#endif +#endif /* !user_access_begin */ + #ifndef user_write_access_begin #define user_write_access_begin user_access_begin #define user_write_access_end user_access_end -- cgit v1.2.3 From f17d28968b7ba8722aa218d2e1362e8b5e010bc6 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 2 Oct 2025 13:32:53 +0300 Subject: media: v4l2-subdev: Make media_entity_to_v4l2_subdev() const-aware Retain the constness of the object in media_entity_to_v4l2_subdev(), by switching to container_of_const(). Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/v4l2-subdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/v4l2-subdev.h b/include/media/v4l2-subdev.h index e0bb58cb6d04..a37d9a847196 100644 --- a/include/media/v4l2-subdev.h +++ b/include/media/v4l2-subdev.h @@ -1103,7 +1103,7 @@ struct v4l2_subdev { typeof(ent) __me_sd_ent = (ent); \ \ __me_sd_ent ? 
\ - container_of(__me_sd_ent, struct v4l2_subdev, entity) : \ + container_of_const(__me_sd_ent, struct v4l2_subdev, entity) : \ NULL; \ }) -- cgit v1.2.3 From 68871116f961532910ccb97b6f437acf7e00548c Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 2 Oct 2025 13:32:54 +0300 Subject: media: v4l2-dev: Make macros to obtain containers const-aware Retain the constness of the object in media_entity_to_video_device() and to_video_device(), by switching to container_of_const(). Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/v4l2-dev.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-dev.h b/include/media/v4l2-dev.h index a213c3398dcf..2e0f6d2e6a78 100644 --- a/include/media/v4l2-dev.h +++ b/include/media/v4l2-dev.h @@ -320,8 +320,8 @@ struct video_device { typeof(__entity) __me_vdev_ent = __entity; \ \ __me_vdev_ent ? \ - container_of(__me_vdev_ent, struct video_device, entity) : \ - NULL; \ + container_of_const(__me_vdev_ent, struct video_device, \ + entity) : NULL; \ }) /** @@ -330,7 +330,7 @@ struct video_device { * * @cd: pointer to &struct device */ -#define to_video_device(cd) container_of(cd, struct video_device, dev) +#define to_video_device(cd) container_of_const(cd, struct video_device, dev) /** * __video_register_device - register video4linux devices -- cgit v1.2.3 From 35f29b44ac0958cb4f4cb042b877d2546f3f6d27 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Thu, 2 Oct 2025 13:32:55 +0300 Subject: media: mc: Make macros to obtain containers const-aware Retain the constness of the graph objects and interfaces in macros to obtain their containers, by switching to container_of_const(). 
Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/media-entity.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/media/media-entity.h b/include/media/media-entity.h index 64cf590b1134..b91ff6f8c3bb 100644 --- a/include/media/media-entity.h +++ b/include/media/media-entity.h @@ -627,7 +627,7 @@ static inline bool media_entity_enum_intersects( * @gobj: Pointer to the struct &media_gobj graph object */ #define gobj_to_entity(gobj) \ - container_of(gobj, struct media_entity, graph_obj) + container_of_const(gobj, struct media_entity, graph_obj) /** * gobj_to_pad - returns the struct &media_pad pointer from the @@ -636,7 +636,7 @@ static inline bool media_entity_enum_intersects( * @gobj: Pointer to the struct &media_gobj graph object */ #define gobj_to_pad(gobj) \ - container_of(gobj, struct media_pad, graph_obj) + container_of_const(gobj, struct media_pad, graph_obj) /** * gobj_to_link - returns the struct &media_link pointer from the @@ -645,7 +645,7 @@ static inline bool media_entity_enum_intersects( * @gobj: Pointer to the struct &media_gobj graph object */ #define gobj_to_link(gobj) \ - container_of(gobj, struct media_link, graph_obj) + container_of_const(gobj, struct media_link, graph_obj) /** * gobj_to_intf - returns the struct &media_interface pointer from the @@ -654,7 +654,7 @@ static inline bool media_entity_enum_intersects( * @gobj: Pointer to the struct &media_gobj graph object */ #define gobj_to_intf(gobj) \ - container_of(gobj, struct media_interface, graph_obj) + container_of_const(gobj, struct media_interface, graph_obj) /** * intf_to_devnode - returns the struct media_intf_devnode pointer from the @@ -663,7 +663,7 @@ static inline bool media_entity_enum_intersects( * @intf: Pointer to struct &media_intf_devnode */ #define intf_to_devnode(intf) \ - container_of(intf, struct media_intf_devnode, intf) + container_of_const(intf, struct media_intf_devnode, intf) /** * 
media_gobj_create - Initialize a graph object -- cgit v1.2.3 From ba92a96b1e95a67cb736d095dceb788207b90a7b Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 26 Oct 2025 20:08:29 +0200 Subject: media: saa7146: Replace saa7146_ext_vv.vbi_fops with write function The vbi_fops stored in struct saa7146_ext_vv is a full v4l2_file_operations, but only its .write field is used. Replace it with a single vbi_write function pointer to save memory. Signed-off-by: Laurent Pinchart Signed-off-by: Hans Verkuil --- include/media/drv-intf/saa7146_vv.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/media/drv-intf/saa7146_vv.h b/include/media/drv-intf/saa7146_vv.h index 55c7d70b9feb..f66f4dfccf14 100644 --- a/include/media/drv-intf/saa7146_vv.h +++ b/include/media/drv-intf/saa7146_vv.h @@ -130,7 +130,8 @@ struct saa7146_ext_vv /* pointer to the saa7146 core ops */ const struct v4l2_ioctl_ops *core_ops; - struct v4l2_file_operations vbi_fops; + ssize_t (*vbi_write)(struct file *file, const char __user *data, + size_t count, loff_t *ppos); }; struct saa7146_use_ops { -- cgit v1.2.3 From bc49af56eea866c34d21bf582f65b02fc8c06ec3 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 28 Oct 2025 20:34:23 -0700 Subject: blktrace: add support for REQ_OP_WRITE_ZEROES tracing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, REQ_OP_WRITE_ZEROES operations are not handled in the blktrace infrastructure, resulting in incorrect or missing operation labels in ftrace blktrace output. This manifests as write-zeroes operations appearing with incorrect labels like "N" instead of a proper "WZ" designation. 
This patch adds complete support for REQ_OP_WRITE_ZEROES across the blktrace infrastructure: Add BLK_TC_WRITE_ZEROES trace category in blktrace_api.h and update BLK_TC_END_V2 marker accordingly Map REQ_OP_WRITE_ZEROES to BLK_TC_WRITE_ZEROES in __blk_add_trace() to ensure proper trace event categorization Update fill_rwbs() to generate "WZ" label for write-zeroes operations in ftrace output, making them easily identifiable Add "write-zeroes" string mapping in act_to_str array for debugfs filter interface Update blk_fill_rwbs() to handle REQ_OP_WRITE_ZEROES for block layer event tracing With this fix, write-zeroes operations are now correctly traced and displayed. =========================================================== BEFORE THIS PATCH =========================================================== blkdiscard -z -o 0 -l 40960 /dev/nvme0n1 blkdiscard-3809 [030] ..... 1212.253701: block_bio_queue: 259,0 NS 0 + 80 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253703: block_getrq: 259,0 NS 0 + 80 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253704: block_io_start: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard] blkdiscard-3809 [030] ..... 1212.253704: block_plug: [blkdiscard] blkdiscard-3809 [030] ..... 1212.253706: block_unplug: [blkdiscard] 1 blkdiscard-3809 [030] ..... 1212.253706: block_rq_insert: 259,0 NS 40960 () 0 + 80 be,0,4 [blkdiscard] kworker/30:1H-566 [030] ..... 1212.253726: block_rq_issue: 259,0 NS 40960 () 0 + 80 be,0,4 [kworker/30:1H] -0 [030] d.h1. 1212.253957: block_rq_complete: 259,0 NS () 0 + 80 be,0,4 [0] -0 [030] dNh1. 
1212.253960: block_io_done: 259,0 NS 0 () 0 + 0 none,0,0 [swapper/30] Trace Event Breakdown: Event | Device | Op | Sector | Sectors | Byte Size | Calculation block_bio_queue | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_getrq | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_io_start | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_insert | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_issue | 259,0 | NS | 0 | 80 | 40960 | Direct from trace block_rq_complete | 259,0 | NS | 0 | 80 | - | 80 × 512 = 40,960 block_io_done | 259,0 | NS | 0 | 0 | 0 | Completion (no data) Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes =========================================================== AFTER THIS PATCH =========================================================== blkdiscard -z -o 0 -l 40960 /dev/nvme0n1 blkdiscard-2477 [020] ..... 960.989131: block_bio_queue: 259,0 WZS 0 + 80 [blkdiscard] blkdiscard-2477 [020] ..... 960.989134: block_getrq: 259,0 WZS 0 + 80 [blkdiscard] blkdiscard-2477 [020] ..... 960.989135: block_io_start: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard] blkdiscard-2477 [020] ..... 960.989138: block_plug: [blkdiscard] blkdiscard-2477 [020] ..... 960.989140: block_unplug: [blkdiscard] 1 blkdiscard-2477 [020] ..... 960.989141: block_rq_insert: 259,0 WZS 40960 () 0 + 80 be,0,4 [blkdiscard] kworker/20:1H-736 [020] ..... 960.989166: block_rq_issue: 259,0 WZS 40960 () 0 + 80 be,0,4 [kworker/20:1H] -0 [020] d.h1. 960.989476: block_rq_complete: 259,0 WZS () 0 + 80 be,0,4 [0] -0 [020] dNh1. 
960.989482: block_io_done: 259,0 WZS 0 () 0 + 0 none,0,0 [swapper/20] Trace Event Breakdown: Event | Device | Op | Sector | Sectors | Byte Size | Calculation block_bio_queue | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_getrq | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_io_start | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_insert | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_issue | 259,0 | WZS | 0 | 80 | 40960 | Direct from trace block_rq_complete | 259,0 | WZS | 0 | 80 | - | 80 × 512 = 40,960 block_io_done | 259,0 | WZS | 0 | 0 | 0 | Completion (no data) Total Bytes Transferred: Sectors: 80 Bytes: 80 × 512 = 40,960 bytes Tested with ftrace blktrace on NVMe devices using blkdiscard with the -z (write-zeroes) flag. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/uapi/linux/blktrace_api.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h index 30f3d2589365..7c092d9f3aa4 100644 --- a/include/uapi/linux/blktrace_api.h +++ b/include/uapi/linux/blktrace_api.h @@ -35,7 +35,9 @@ enum blktrace_cat { BLK_TC_ZONE_OPEN = 1ull << 20, /* zone open */ BLK_TC_ZONE_CLOSE = 1ull << 21, /* zone close */ - BLK_TC_END_V2 = 1ull << 21, + BLK_TC_WRITE_ZEROES = 1ull << 22, /* write-zeroes */ + + BLK_TC_END_V2 = 1ull << 22, }; #define BLK_TC_SHIFT (16) -- cgit v1.2.3 From c33e779aba6804778c1440192a8033a145ba588d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 31 Oct 2025 14:34:29 -0600 Subject: io_uring: add wrapper type for io_req_tw_func_t arg In preparation for uring_cmd implementations to implement functions with the io_req_tw_func_t signature, introduce a wrapper struct io_tw_req to hide the struct io_kiocb * argument. The intention is for only the io_uring core to access the inner struct io_kiocb *. 
uring_cmd implementations should instead call a helper from io_uring/cmd.h to convert struct io_tw_req to struct io_uring_cmd *. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 25ee982eb435..f064a438ce43 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -615,7 +615,11 @@ enum { REQ_F_SQE_COPIED = IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); +struct io_tw_req { + struct io_kiocb *req; +}; + +typedef void (*io_req_tw_func_t)(struct io_tw_req tw_req, io_tw_token_t tw); struct io_task_work { struct llist_node node; -- cgit v1.2.3 From 20fb3d05a34b55c8ec28ec3d3555e70c5bc0c72d Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 31 Oct 2025 14:34:30 -0600 Subject: io_uring/uring_cmd: avoid double indirect call in task work dispatch io_uring task work dispatch makes an indirect call to struct io_kiocb's io_task_work.func field to allow running arbitrary task work functions. In the uring_cmd case, this calls io_uring_cmd_work(), which immediately makes another indirect call to struct io_uring_cmd's task_work_cb field. Change the uring_cmd task work callbacks to functions whose signatures match io_req_tw_func_t. Add a function io_uring_cmd_from_tw() to convert from the task work's struct io_tw_req argument to struct io_uring_cmd *. Define a constant IO_URING_CMD_TASK_WORK_ISSUE_FLAGS to avoid manufacturing issue_flags in the uring_cmd task work callbacks. Now uring_cmd task work dispatch makes a single indirect call to the uring_cmd implementation's callback. This also allows removing the task_work_cb field from struct io_uring_cmd, freeing up 8 bytes for future storage. 
Since fuse_uring_send_in_task() now has access to the io_tw_token_t, check its cancel field directly instead of relying on the IO_URING_F_TASK_DEAD issue flag. Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 22 +++++++++++++--------- include/linux/io_uring_types.h | 1 - 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 7509025b4071..375fd048c4cb 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -11,17 +11,13 @@ /* io_uring_cmd is being issued again */ #define IORING_URING_CMD_REISSUE (1U << 31) -typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd, - unsigned issue_flags); - struct io_uring_cmd { struct file *file; const struct io_uring_sqe *sqe; - /* callback to defer completions to task context */ - io_uring_cmd_tw_t task_work_cb; u32 cmd_op; u32 flags; u8 pdu[32]; /* available inline for free use */ + u8 unused[8]; }; static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) @@ -60,7 +56,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2, unsigned issue_flags, bool is_cqe32); void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, + io_req_tw_func_t task_work_cb, unsigned flags); /* @@ -109,7 +105,7 @@ static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, { } static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb, unsigned flags) + io_req_tw_func_t task_work_cb, unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, @@ -132,15 +128,23 @@ static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd, } #endif +static inline struct io_uring_cmd *io_uring_cmd_from_tw(struct io_tw_req tw_req) +{ + return io_kiocb_to_cmd(tw_req.req, struct io_uring_cmd); 
+} + +/* task_work executor checks the deferred list completion */ +#define IO_URING_CMD_TASK_WORK_ISSUE_FLAGS IO_URING_F_COMPLETE_DEFER + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - io_uring_cmd_tw_t task_work_cb) + io_req_tw_func_t task_work_cb) { __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f064a438ce43..92780764d5fa 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -39,7 +39,6 @@ enum io_uring_cmd_flags { /* set when uring wants to cancel a previously issued command */ IO_URING_F_CANCEL = (1 << 11), IO_URING_F_COMPAT = (1 << 12), - IO_URING_F_TASK_DEAD = (1 << 13), }; struct io_wq_work_node { -- cgit v1.2.3 From 8627bc8c7d815d929ad59407e13458b564870acf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 17:34:18 +0100 Subject: ns: add missing authorship I authored the files a short while ago. Signed-off-by: Christian Brauner --- include/linux/nstree.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 8b8636690473..43aa262c0ea1 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H -- cgit v1.2.3 From d915fe20e5cba4bd50e41e792a32dcddc7490e25 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 16:10:10 +0100 Subject: ns: add NS_COMMON_INIT() Add an initializer that can be used for the ns common initialization for static namespace such as most init namespaces. 
Suggested-by: Thomas Gleixner Link: https://patch.msgid.link/87ecqhy2y5.ffs@tglx Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index f5b68b8abb54..3a72c3f81eca 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -119,6 +119,16 @@ void __ns_common_free(struct ns_common *ns); struct user_namespace *: CLONE_NEWUSER, \ struct uts_namespace *: CLONE_NEWUTS) +#define NS_COMMON_INIT(nsname, refs) \ +{ \ + .ns_type = ns_common_type(&nsname), \ + .ns_id = 0, \ + .inum = ns_init_inum(&nsname), \ + .ops = to_ns_operations(&nsname), \ + .stashed = NULL, \ + .__ns_ref = REFCOUNT_INIT(refs), \ +} + #define ns_common_init(__ns) \ __ns_common_init(to_ns_common(__ns), \ ns_common_type(__ns), \ -- cgit v1.2.3 From 3dd50c58664e2684bd610a57bf3ab713cbb0ea91 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:21 +0100 Subject: ns: initialize ns_list_node for initial namespaces Make sure that the list is always initialized for initial namespaces. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-8-2e6f823ebdc0@kernel.org Fixes: 885fc8ac0a4d ("nstree: make iterator generic") Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3a72c3f81eca..71a5e28344d1 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -127,6 +127,7 @@ void __ns_common_free(struct ns_common *ns); .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(refs), \ + .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ } #define ns_common_init(__ns) \ -- cgit v1.2.3 From 6b053576edb12c7739ea9c7c9900031361922631 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:22 +0100 Subject: ns: add __ns_ref_read() Implement ns_ref_read() the same way as ns_ref_{get,put}(). No point in making that any more special or different from the other helpers. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-9-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 71a5e28344d1..5e09facafd93 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -154,7 +154,12 @@ static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) return refcount_inc_not_zero(&ns->__ns_ref); } -#define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) +static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) +{ + return refcount_read(&ns->__ns_ref); +} + +#define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) #define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) -- cgit v1.2.3 From 4b06b70c8244b442d58ae0fb59870cf31fdb422e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:23 +0100 Subject: ns: rename to exit_nsproxy_namespaces() The current naming is very misleading as this really isn't exiting all of the task's namespaces. It is only exiting the namespaces that hang off of nsproxy. Reflect that in the name.
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-10-2e6f823ebdc0@kernel.org Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/nsproxy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index bd118a187dec..538ba8dba184 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,7 +93,7 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); -void exit_task_namespaces(struct task_struct *tsk); +void exit_nsproxy_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); -- cgit v1.2.3 From 3a18f809184bc5a1cfad7cde5b8b026e2ff61587 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:24 +0100 Subject: ns: add active reference count The namespace tree is, among other things, currently used to support file handles for namespaces. When a namespace is created it is placed on the namespace trees and when it is destroyed it is removed from the namespace trees. While a namespace is on the namespace trees with a valid reference count it is possible to reopen it through a namespace file handle. This is all fine but has some issues that should be addressed. On current kernels a namespace is visible to userspace in the following cases: (1) The namespace is in use by a task. (2) The namespace is persisted through a VFS object (namespace file descriptor or bind-mount). Note that (2) only cares about direct persistence of the namespace itself not indirectly via e.g., file->f_cred file references or similar. (3) The namespace is a hierarchical namespace type and is the parent of a single or multiple child namespaces. 
Case (3) is interesting because it is possible that a parent namespace might not fulfill any of (1) or (2), i.e., is invisible to userspace but it may still be resurrected through the NS_GET_PARENT ioctl(). Currently namespace file handles allow much broader access to namespaces than what is currently possible via (1)-(3). The reason is that namespaces may remain pinned for completely internal reasons yet are inaccessible to userspace. For example, a user namespace may remain pinned by get_cred() calls to stash the opener's credentials into file->f_cred. As it stands file handles allow to resurrect such a user namespace even though this should not be possible via (1)-(3). This is a fundamental uapi change that we shouldn't do if we don't have to. Consider the following insane case: Various architectures support the CONFIG_MMU_LAZY_TLB_REFCOUNT option which uses lazy TLB destruction. When this option is set a userspace task's struct mm_struct may be used for kernel threads such as the idle task and will only be destroyed once the cpu's runqueue switches back to another task. But because of ptrace() permission checks struct mm_struct stashes the user namespace of the task that struct mm_struct originally belonged to. The kernel thread will take a reference on the struct mm_struct and thus pin it. So on an idle system user namespaces can be persisted for arbitrary amounts of time which also means that they can be resurrected using namespace file handles. That makes no sense whatsoever. The problem is of course exacerbated on large systems with a huge number of cpus. To handle this nicely we introduce an active reference count which tracks (1)-(3). This is easy to do as all of these things are already managed centrally. Only (1)-(3) will count towards the active reference count and only namespaces which are active may be opened via namespace file handles. The problem is that namespaces may be resurrected.
Which means that they can become temporarily inactive and will be reactivated some time later. Currently the only example of this is the SIOCGSKNS socket ioctl. The SIOCGSKNS ioctl allows to open a network namespace file descriptor based on a socket file descriptor. If a socket is tied to a network namespace that subsequently becomes inactive but that socket is persisted by another process in another network namespace (e.g., via SCM_RIGHTS or pidfd_getfd()) then the SIOCGSKNS ioctl will resurrect this network namespace. So calls to open_related_ns() and open_namespace() will end up resurrecting the corresponding namespace tree. Note that the active reference count does not regulate the lifetime of the namespace itself. This is still done by the normal reference count. The active reference count can only be elevated if the regular reference count is elevated. The active reference count also doesn't regulate the presence of a namespace on the namespace trees. It only regulates its visibility to namespace file handles (and in later patches to listns()). A namespace remains on the namespace trees from creation until its actual destruction. This will allow the kernel to always reach any namespace trivially and it will also enable subsystems like bpf to walk the namespace lists on the system for tracing or general introspection purposes. Note that different namespaces have different visibility lifetimes on current kernels. While most namespaces are immediately released when the last task using them exits, the user- and pid namespace are persisted and thus both remain accessible via /proc/<pid>/ns/. The user namespace lifetime is aligned with struct cred and is only released through exit_creds(). However, it becomes inaccessible to userspace once the last task using it is reaped, i.e., when release_task() is called and all proc entries are flushed. Similarly, the pid namespace is also visible until the last task using it has been reaped and the associated pid numbers are freed.
The active reference counts of the user- and pid namespace are decremented once the task is reaped. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-11-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 141 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/nsfs.h | 3 + include/linux/nsproxy.h | 3 + 3 files changed, 145 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 5e09facafd93..bdd0df15ad9c 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -4,7 +4,9 @@ #include #include +#include #include +#include struct proc_ns_operations; @@ -37,6 +39,67 @@ extern const struct proc_ns_operations cgroupns_operations; extern const struct proc_ns_operations timens_operations; extern const struct proc_ns_operations timens_for_children_operations; +/* + * Namespace lifetimes are managed via a two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. 
+ * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc//ns/, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. The namespace is removed from the tree and freed. + * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. 
+ */ struct ns_common { u32 ns_type; struct dentry *stashed; @@ -48,6 +111,7 @@ struct ns_common { u64 ns_id; struct rb_node ns_tree_node; struct list_head ns_list_node; + atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; }; @@ -56,6 +120,13 @@ struct ns_common { int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +static __always_inline bool is_initial_namespace(struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->inum == 0); + return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, + IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -127,6 +198,7 @@ void __ns_common_free(struct ns_common *ns); .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref_active = ATOMIC_INIT(1), \ .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ } @@ -144,14 +216,26 @@ void __ns_common_free(struct ns_common *ns); #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) +static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) +{ + return atomic_read(&ns->__ns_ref_active); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - return refcount_dec_and_test(&ns->__ns_ref); + if (refcount_dec_and_test(&ns->__ns_ref)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return true; + } + return false; } static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - return refcount_inc_not_zero(&ns->__ns_ref); + if (refcount_inc_not_zero(&ns->__ns_ref)) + return true; + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); + return false; } static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) @@ -166,4 +250,57 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_put_and_lock(__ns, 
__lock) \ refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +#define ns_ref_active_read(__ns) \ + ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) + +void __ns_ref_active_get_owner(struct ns_common *ns); + +static __always_inline void __ns_ref_active_get(struct ns_common *ns) +{ + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); + VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); +} +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) + +static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) +{ + if (atomic_inc_not_zero(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + return true; + } + return false; +} + +#define ns_ref_active_get_owner(__ns) \ + do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) + +void __ns_ref_active_put_owner(struct ns_common *ns); + +static __always_inline void __ns_ref_active_put(struct ns_common *ns) +{ + if (atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(is_initial_namespace(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + __ns_ref_active_put_owner(ns); + } +} +#define ns_ref_active_put(__ns) \ + do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) + +static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); + if (!__ns_ref_active_read(ns)) + return NULL; + if (!__ns_ref_get(ns)) + return NULL; + return ns; +} + +void __ns_ref_active_resurrect(struct ns_common *ns); + +#define ns_ref_active_resurrect(__ns) \ + do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) + #endif diff --git a/include/linux/nsfs.h b/include/linux/nsfs.h index e5a5fa83d36b..731b67fc2fec 100644 --- a/include/linux/nsfs.h +++ b/include/linux/nsfs.h @@ -37,4 +37,7 @@ void nsfs_init(void); #define current_in_namespace(__ns) 
(__current_namespace_from_type(__ns) == __ns) +void nsproxy_ns_active_get(struct nsproxy *ns); +void nsproxy_ns_active_put(struct nsproxy *ns); + #endif /* _LINUX_NSFS_H */ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 538ba8dba184..ac825eddec59 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -93,7 +93,10 @@ static inline struct cred *nsset_cred(struct nsset *set) */ int copy_namespaces(u64 flags, struct task_struct *tsk); +void switch_cred_namespaces(const struct cred *old, const struct cred *new); void exit_nsproxy_namespaces(struct task_struct *tsk); +void get_cred_namespaces(struct task_struct *tsk); +void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); -- cgit v1.2.3 From 8895d2a3dbf49f23622ab8da9fb3909826edd6dc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:25 +0100 Subject: ns: use anonymous struct to group list member Make it easier to spot that they belong together conceptually. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-12-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bdd0df15ad9c..32463203c824 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -109,8 +109,10 @@ struct ns_common { union { struct { u64 ns_id; - struct rb_node ns_tree_node; - struct list_head ns_list_node; + struct /* per type rbtree and list */ { + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; -- cgit v1.2.3 From 2ccaebc686e9ef7e94b3a8d89706daed6e696667 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:26 +0100 Subject: nstree: introduce a unified tree This will allow userspace to lookup and stat a namespace simply by its identifier without having to know what type of namespace it is. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-13-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 32463203c824..7a3c71b3a76f 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -109,6 +109,9 @@ struct ns_common { union { struct { u64 ns_id; + struct /* global namespace rbtree and list */ { + struct rb_node ns_unified_tree_node; + }; struct /* per type rbtree and list */ { struct rb_node ns_tree_node; struct list_head ns_list_node; -- cgit v1.2.3 From 3760342fd6312416491d536144e39297fa5b1950 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:28 +0100 Subject: nstree: assign fixed ids to the initial namespaces The initial set of namespace comes with fixed inode numbers making it easy for userspace to identify them solely based on that information. This has long preceeded anything here. Similarly, let's assign fixed namespace ids for the initial namespaces. Kill the cookie and use a sequentially increasing number. This has the nice side-effect that the owning user namespace will always have a namespace id that is smaller than any of it's descendant namespaces. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-15-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 13 ++++++++++++- include/linux/nstree.h | 15 +++++++++++---- include/uapi/linux/nsfs.h | 14 ++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7a3c71b3a76f..009a6dea724f 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -173,6 +173,17 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) struct user_namespace *: &init_user_ns, \ struct uts_namespace *: &init_uts_ns) +#define ns_init_id(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ + struct ipc_namespace *: IPC_NS_INIT_ID, \ + struct mnt_namespace *: MNT_NS_INIT_ID, \ + struct net *: NET_NS_INIT_ID, \ + struct pid_namespace *: PID_NS_INIT_ID, \ + struct time_namespace *: TIME_NS_INIT_ID, \ + struct user_namespace *: USER_NS_INIT_ID, \ + struct uts_namespace *: UTS_NS_INIT_ID) + #define to_ns_operations(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? 
&cgroupns_operations : NULL), \ @@ -198,7 +209,7 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) #define NS_COMMON_INIT(nsname, refs) \ { \ .ns_type = ns_common_type(&nsname), \ - .ns_id = 0, \ + .ns_id = ns_init_id(&nsname), \ .inum = ns_init_inum(&nsname), \ .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 43aa262c0ea1..38674c6fa4f7 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -9,6 +9,7 @@ #include #include #include +#include extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; @@ -30,7 +31,11 @@ extern struct ns_tree uts_ns_tree; struct user_namespace *: &(user_ns_tree), \ struct uts_namespace *: &(uts_ns_tree)) -u64 ns_tree_gen_id(struct ns_common *ns); +#define ns_tree_gen_id(__ns) \ + __ns_tree_gen_id(to_ns_common(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) + +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); @@ -38,9 +43,9 @@ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, struct ns_tree *ns_tree, bool previous); -static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, u64 id) { - ns_tree_gen_id(ns); + __ns_tree_gen_id(ns, id); __ns_tree_add_raw(ns, ns_tree); } @@ -60,7 +65,9 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) * This function assigns a new id to the namespace and adds it to the * appropriate namespace tree and list. */ -#define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) +#define ns_tree_add(__ns) \ + __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \ + (((__ns) == ns_init_ns(__ns)) ? 
ns_init_id(__ns) : 0)) /** * ns_tree_remove - Remove a namespace from a namespace tree diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index e098759ec917..f8bc2aad74d6 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -67,4 +67,18 @@ struct nsfs_file_handle { #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ +enum init_ns_id { + IPC_NS_INIT_ID = 1ULL, + UTS_NS_INIT_ID = 2ULL, + USER_NS_INIT_ID = 3ULL, + PID_NS_INIT_ID = 4ULL, + CGROUP_NS_INIT_ID = 5ULL, + TIME_NS_INIT_ID = 6ULL, + NET_NS_INIT_ID = 7ULL, + MNT_NS_INIT_ID = 8ULL, +#ifdef __KERNEL__ + NS_LAST_INIT_ID = MNT_NS_INIT_ID, +#endif +}; + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From 3c1a52f2a6c865464babe7a85c2796aa31cc9744 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:29 +0100 Subject: nstree: maintain list of owned namespaces The namespace tree doesn't express the ownership concept of namespace appropriately. Maintain a list of directly owned namespaces per user namespace. This will allow userspace and the kernel to use the listns() system call to walk the namespace tree by owning user namespace. The rbtree is used to find the relevant namespace entry point which allows to continue iteration and the owner list can be used to walk the tree completely lock free. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-16-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 009a6dea724f..698aa2f7f486 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -116,6 +116,12 @@ struct ns_common { struct rb_node ns_tree_node; struct list_head ns_list_node; }; + struct /* namespace ownership rbtree and list */ { + struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ + struct list_head ns_owner; /* list of namespaces owned by this namespace */ + struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ + struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ + }; atomic_t __ns_ref_active; /* do not use directly */ }; struct rcu_head ns_rcu; @@ -216,6 +222,8 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) .__ns_ref = REFCOUNT_INIT(refs), \ .__ns_ref_active = ATOMIC_INIT(1), \ .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ + .ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \ + .ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), \ } #define ns_common_init(__ns) \ -- cgit v1.2.3 From 560e25e70fa40ec69f97f14207bde9bc18bec9b8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:31 +0100 Subject: nstree: add unified namespace list Allow to walk the unified namespace list completely locklessly. 
Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-18-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 698aa2f7f486..3f05dd7d40c7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -111,6 +111,7 @@ struct ns_common { u64 ns_id; struct /* global namespace rbtree and list */ { struct rb_node ns_unified_tree_node; + struct list_head ns_unified_list_node; }; struct /* per type rbtree and list */ { struct rb_node ns_tree_node; @@ -224,6 +225,7 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ .ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \ .ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), \ + .ns_unified_list_node = LIST_HEAD_INIT(nsname.ns.ns_unified_list_node), \ } #define ns_common_init(__ns) \ -- cgit v1.2.3 From 76b6f5dfb3fda76fce1f9990d6fa58adc711122b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:32 +0100 Subject: nstree: add listns() Add a new listns() system call that allows userspace to iterate through namespaces in the system. This provides a programmatic interface to discover and inspect namespaces, enhancing existing namespace apis. Currently, there is no direct way for userspace to enumerate namespaces in the system. Applications must resort to scanning /proc//ns/ across all processes, which is: 1. Inefficient - requires iterating over all processes 2. Incomplete - misses inactive namespaces that aren't attached to any running process but are kept alive by file descriptors, bind mounts, or parent namespace references 3. Permission-heavy - requires access to /proc for many processes 4. No ordering or ownership. 5. No filtering per namespace type: Must always iterate and check all namespaces. The list goes on. 
The listns() system call solves these problems by providing direct kernel-level enumeration of namespaces. It is similar to listmount() but obviously tailored to namespaces. /* * @req: Pointer to struct ns_id_req specifying search parameters * @ns_ids: User buffer to receive namespace IDs * @nr_ns_ids: Size of ns_ids buffer (maximum number of IDs to return) * @flags: Reserved for future use (must be 0) */ ssize_t listns(const struct ns_id_req *req, u64 *ns_ids, size_t nr_ns_ids, unsigned int flags); Returns: - On success: Number of namespace IDs written to ns_ids - On error: Negative error code /* * @size: Structure size * @ns_id: Starting point for iteration; use 0 for first call, then * use the last returned ID for subsequent calls to paginate * @ns_type: Bitmask of namespace types to include (from enum ns_type): * 0: Return all namespace types * MNT_NS: Mount namespaces * NET_NS: Network namespaces * USER_NS: User namespaces * etc. Can be OR'd together * @user_ns_id: Filter results to namespaces owned by this user namespace: * 0: Return all namespaces (subject to permission checks) * LISTNS_CURRENT_USER: Namespaces owned by caller's user namespace * Other value: Namespaces owned by the specified user namespace ID */ struct ns_id_req { __u32 size; /* sizeof(struct ns_id_req) */ __u32 spare; /* Reserved, must be 0 */ __u64 ns_id; /* Last seen namespace ID (for pagination) */ __u32 ns_type; /* Filter by namespace type(s) */ __u32 spare2; /* Reserved, must be 0 */ __u64 user_ns_id; /* Filter by owning user namespace */ }; Example 1: List all namespaces void list_all_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, /* Start from beginning */ .ns_type = 0, /* All types */ .user_ns_id = 0, /* All user namespaces */ }; uint64_t ids[100]; ssize_t ret; printf("All namespaces in the system:\n"); do { ret = listns(&req, ids, 100, 0); if (ret < 0) { perror("listns"); break; } for (ssize_t i = 0; i < ret; i++) printf(" Namespace ID: %llu\n", 
(unsigned long long)ids[i]); /* Continue from last seen ID */ if (ret > 0) req.ns_id = ids[ret - 1]; } while (ret == 100); /* Buffer was full, more may exist */ } Example 2: List network namespaces only void list_network_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = NET_NS, /* Only network namespaces */ .user_ns_id = 0, }; uint64_t ids[100]; ssize_t ret; ret = listns(&req, ids, 100, 0); if (ret < 0) { perror("listns"); return; } printf("Network namespaces: %zd found\n", ret); for (ssize_t i = 0; i < ret; i++) printf(" netns ID: %llu\n", (unsigned long long)ids[i]); } Example 3: List namespaces owned by current user namespace void list_owned_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = 0, /* All types */ .user_ns_id = LISTNS_CURRENT_USER, /* Current userns */ }; uint64_t ids[100]; ssize_t ret; ret = listns(&req, ids, 100, 0); if (ret < 0) { perror("listns"); return; } printf("Namespaces owned by my user namespace: %zd\n", ret); for (ssize_t i = 0; i < ret; i++) printf(" ns ID: %llu\n", (unsigned long long)ids[i]); } Example 4: List multiple namespace types void list_network_and_mount_namespaces(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = NET_NS | MNT_NS, /* Network and mount */ .user_ns_id = 0, }; uint64_t ids[100]; ssize_t ret; ret = listns(&req, ids, 100, 0); printf("Network and mount namespaces: %zd found\n", ret); } Example 5: Pagination through large namespace sets void list_all_with_pagination(void) { struct ns_id_req req = { .size = sizeof(req), .ns_id = 0, .ns_type = 0, .user_ns_id = 0, }; uint64_t ids[50]; size_t total = 0; ssize_t ret; printf("Enumerating all namespaces with pagination:\n"); while (1) { ret = listns(&req, ids, 50, 0); if (ret < 0) { perror("listns"); break; } if (ret == 0) break; /* No more namespaces */ total += ret; printf(" Batch: %zd namespaces\n", ret); /* Last ID in this batch becomes start of next batch */ req.ns_id 
= ids[ret - 1]; if (ret < 50) break; /* Partial batch = end of results */ } printf("Total: %zu namespaces\n", total); } Permission Model listns() respects namespace isolation and capabilities: (1) Global listing (user_ns_id = 0): - Requires CAP_SYS_ADMIN in the namespace's owning user namespace - OR the namespace must be in the caller's namespace context (e.g., a namespace the caller is currently using) - User namespaces additionally allow listing if the caller has CAP_SYS_ADMIN in that user namespace itself (2) Owner-filtered listing (user_ns_id != 0): - Requires CAP_SYS_ADMIN in the specified owner user namespace - OR the namespace must be in the caller's namespace context - This allows unprivileged processes to enumerate namespaces they own (3) Visibility: - Only "active" namespaces are listed - A namespace is active if it has a non-zero __ns_ref_active count - This includes namespaces used by running processes, held by open file descriptors, or kept active by bind mounts - Inactive namespaces (kept alive only by internal kernel references) are not visible via listns() Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-19-2e6f823ebdc0@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 ++ include/linux/syscalls.h | 4 ++++ include/linux/user_namespace.h | 4 ++-- include/uapi/linux/nsfs.h | 44 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3f05dd7d40c7..bd4492ef6ffc 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -129,8 +129,10 @@ struct ns_common { }; }; +bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); +struct ns_common *__must_check ns_owner(struct ns_common *ns); static __always_inline bool 
is_initial_namespace(struct ns_common *ns) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 66c06fcdfe19..cf84d98964b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -77,6 +77,7 @@ struct cachestat_range; struct cachestat; struct statmount; struct mnt_id_req; +struct ns_id_req; struct xattr_args; struct file_attr; @@ -437,6 +438,9 @@ asmlinkage long sys_statmount(const struct mnt_id_req __user *req, asmlinkage long sys_listmount(const struct mnt_id_req __user *req, u64 __user *mnt_ids, size_t nr_mnt_ids, unsigned int flags); +asmlinkage long sys_listns(const struct ns_id_req __user *req, + u64 __user *ns_ids, size_t nr_ns_ids, + unsigned int flags); asmlinkage long sys_truncate(const char __user *path, long length); asmlinkage long sys_ftruncate(unsigned int fd, off_t length); #if BITS_PER_LONG == 32 diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 9a9aebbf96b9..9c3be157397e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -166,13 +166,13 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, ns->rlimit_max[type] = max <= LONG_MAX ? 
max : LONG_MAX; } -#ifdef CONFIG_USER_NS - static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } +#ifdef CONFIG_USER_NS + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) diff --git a/include/uapi/linux/nsfs.h b/include/uapi/linux/nsfs.h index f8bc2aad74d6..a25e38d1c874 100644 --- a/include/uapi/linux/nsfs.h +++ b/include/uapi/linux/nsfs.h @@ -81,4 +81,48 @@ enum init_ns_id { #endif }; +enum ns_type { + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ +}; + +/** + * struct ns_id_req - namespace ID request structure + * @size: size of this structure + * @spare: reserved for future use + * @filter: filter mask + * @ns_id: last namespace id + * @user_ns_id: owning user namespace ID + * + * Structure for passing namespace ID and miscellaneous parameters to + * statns(2) and listns(2). + * + * For statns(2) @param represents the request mask. + * For listns(2) @param represents the last listed mount id (or zero). + */ +struct ns_id_req { + __u32 size; + __u32 spare; + __u64 ns_id; + struct /* listns */ { + __u32 ns_type; + __u32 spare2; + __u64 user_ns_id; + }; +}; + +/* + * Special @user_ns_id value that can be passed to listns() + */ +#define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ + +/* List of all ns_id_req versions. 
*/ +#define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ + #endif /* __LINUX_NSFS_H */ -- cgit v1.2.3 From b36d4b6aa88ef039647228b98c59a875e92f8c8e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 29 Oct 2025 13:20:33 +0100 Subject: arch: hookup listns() system call Add the listns() system call to all architectures. Link: https://patch.msgid.link/20251029-work-namespace-nstree-listns-v4-20-2e6f823ebdc0@kernel.org Tested-by: syzbot@syzkaller.appspotmail.com Reviewed-by: Arnd Bergmann Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/uapi/asm-generic/unistd.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 04e0077fb4c9..942370b3f5d2 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -857,9 +857,11 @@ __SYSCALL(__NR_open_tree_attr, sys_open_tree_attr) __SYSCALL(__NR_file_getattr, sys_file_getattr) #define __NR_file_setattr 469 __SYSCALL(__NR_file_setattr, sys_file_setattr) +#define __NR_listns 470 +__SYSCALL(__NR_listns, sys_listns) #undef __NR_syscalls -#define __NR_syscalls 470 +#define __NR_syscalls 471 /* * 32 bit systems traditionally used different -- cgit v1.2.3 From 16dad7801aad73138a2dff5ea950130646914d1f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:15 -1000 Subject: cgroup: Rename cgroup lifecycle hooks to cgroup_task_*() The current names cgroup_exit(), cgroup_release(), and cgroup_free() are confusing because they look like they're operating on cgroups themselves when they're actually task lifecycle hooks. For example, cgroup_init() initializes the cgroup subsystem while cgroup_exit() is a task exit notification to cgroup. Rename them to cgroup_task_exit(), cgroup_task_release(), and cgroup_task_free() to make it clear that these operate on tasks. 
Cc: Dan Schatzberg Cc: Peter Zijlstra Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6ed477338b16..4068035176c4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -137,9 +137,9 @@ extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); -void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); -void cgroup_free(struct task_struct *p); +void cgroup_task_exit(struct task_struct *p); +void cgroup_task_release(struct task_struct *p); +void cgroup_task_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -680,9 +680,9 @@ static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} -static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} -static inline void cgroup_free(struct task_struct *p) {} +static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_release(struct task_struct *p) {} +static inline void cgroup_task_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } -- cgit v1.2.3 From d245698d727ab8f5420b3e28d1243f96a5234851 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:17 -1000 Subject: cgroup: Defer task cgroup unlink until after the task is done switching out When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task from its cgroup. From the cgroup's perspective, the task is now gone. 
If this makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks that notify controllers the cgroup is going offline resource-wise. However, the exiting task can still run, perform memory operations, and schedule until the final context switch in finish_task_switch(). This creates a confusing situation where controllers are told a cgroup is offline while resource activities are still happening in it. While this hasn't broken existing controllers, it has caused direct confusion for sched_ext schedulers. Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls the subsystem exit callbacks and continues to be called from do_exit(). The css_set cleanup is moved to the new cgroup_task_dead() which is called from finish_task_switch() after the final context switch, so that the cgroup only appears empty after the task is truly done running. This also reorders operations so that subsys->exit() is now called before unlinking from the cgroup, which shouldn't break anything. 
Cc: Dan Schatzberg Cc: Peter Zijlstra Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4068035176c4..bc892e3b37ee 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p, extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_task_exit(struct task_struct *p); +void cgroup_task_dead(struct task_struct *p); void cgroup_task_release(struct task_struct *p); void cgroup_task_free(struct task_struct *p); @@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p, static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_dead(struct task_struct *p) {} static inline void cgroup_task_release(struct task_struct *p) {} static inline void cgroup_task_free(struct task_struct *p) {} -- cgit v1.2.3 From 7900aa699c34401cf5d0c701d9ef72880ddc1a83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Nov 2025 10:25:13 -1000 Subject: sched_ext: Fix cgroup exit ordering by moving sched_ext_free() to finish_task_switch() sched_ext_free() was called from __put_task_struct() when the last reference to the task is dropped, which could be long after the task has finished running. This causes cgroup-related problems: - ops.init_task() can be called on a cgroup which didn't get ops.cgroup_init()'d during scheduler load, because the cgroup might be destroyed/unlinked while the zombie or dead task is still lingering on the scx_tasks list. - ops.cgroup_exit() could be called before ops.exit_task() is called on all member tasks, leading to incorrect exit ordering. 
Fix by moving it to finish_task_switch() to be called right after the final context switch away from the dying task, matching when sched_class->task_dead() is called. Rename it to sched_ext_dead() to match the new calling context. By calling sched_ext_dead() before cgroup_task_dead(), we ensure that: - Tasks visible on scx_tasks list have valid cgroups during scheduler load, as cgroup_mutex prevents cgroup destruction while the task is still linked. - All member tasks have ops.exit_task() called and are removed from scx_tasks before the cgroup can be destroyed and trigger ops.cgroup_exit(). This fix is made possible by the cgroup_task_dead() split in the previous patch. This also makes more sense resource-wise as there's no point in keeping scheduler side resources around for dead tasks. Reported-by: Dan Schatzberg Cc: Peter Zijlstra Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 4713f374acc0..eb776b094d36 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -208,14 +208,14 @@ struct sched_ext_entity { struct list_head tasks_node; }; -void sched_ext_free(struct task_struct *p); +void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ -static inline void sched_ext_free(struct task_struct *p) {} +static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} static inline bool scx_rcu_cpu_stall(void) { return false; } -- cgit v1.2.3 From 98c92de40f6ab05452f8919cc2ff800ade5dd9a3 Mon Sep 17 00:00:00 2001 From: Komal Bajaj Date: Mon, 3 Nov 2025 16:53:10 +0530 Subject: dt-bindings: arm: 
qcom,ids: Add SoC ID for QCS6490 Add unique ID for Qualcomm QCS6490 SoC. Signed-off-by: Komal Bajaj Link: https://lore.kernel.org/r/20251103-qcs6490_soc_id-v1-1-c139dd1e32c8@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/arm/qcom,ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/arm/qcom,ids.h b/include/dt-bindings/arm/qcom,ids.h index 19598ed4679e..8776844e0eeb 100644 --- a/include/dt-bindings/arm/qcom,ids.h +++ b/include/dt-bindings/arm/qcom,ids.h @@ -240,6 +240,7 @@ #define QCOM_ID_SC7280 487 #define QCOM_ID_SC7180P 495 #define QCOM_ID_QCM6490 497 +#define QCOM_ID_QCS6490 498 #define QCOM_ID_SM7325P 499 #define QCOM_ID_IPQ5000 503 #define QCOM_ID_IPQ0509 504 -- cgit v1.2.3 From 342d2a607450f256105781d29aa6300921c6152e Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Thu, 30 Oct 2025 16:39:06 +0530 Subject: dt-bindings: clock: qcom: Add Kaanapali Global clock controller Add device tree bindings for the global clock controller on Qualcomm Kaanapali platform. Signed-off-by: Jingyi Wang Reviewed-by: Krzysztof Kozlowski Signed-off-by: Taniya Das Link: https://lore.kernel.org/r/20251030-gcc_kaanapali-v2-v2-3-a774a587af6f@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,kaanapali-gcc.h | 241 +++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,kaanapali-gcc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,kaanapali-gcc.h b/include/dt-bindings/clock/qcom,kaanapali-gcc.h new file mode 100644 index 000000000000..890e48709f09 --- /dev/null +++ b/include/dt-bindings/clock/qcom,kaanapali-gcc.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. 
+ */ + +#ifndef _DT_BINDINGS_CLK_QCOM_GCC_KAANAPALI_H +#define _DT_BINDINGS_CLK_QCOM_GCC_KAANAPALI_H + +/* GCC clocks */ +#define GCC_AGGRE_NOC_PCIE_AXI_CLK 0 +#define GCC_AGGRE_UFS_PHY_AXI_CLK 1 +#define GCC_AGGRE_USB3_PRIM_AXI_CLK 2 +#define GCC_BOOT_ROM_AHB_CLK 3 +#define GCC_CAM_BIST_MCLK_AHB_CLK 4 +#define GCC_CAMERA_AHB_CLK 5 +#define GCC_CAMERA_HF_AXI_CLK 6 +#define GCC_CAMERA_SF_AXI_CLK 7 +#define GCC_CAMERA_XO_CLK 8 +#define GCC_CFG_NOC_PCIE_ANOC_AHB_CLK 9 +#define GCC_CFG_NOC_USB3_PRIM_AXI_CLK 10 +#define GCC_CNOC_PCIE_SF_AXI_CLK 11 +#define GCC_DDRSS_PCIE_SF_QTB_CLK 12 +#define GCC_QMIP_CAMERA_CMD_AHB_CLK 13 +#define GCC_DISP_HF_AXI_CLK 14 +#define GCC_DISP_SF_AXI_CLK 15 +#define GCC_EVA_AHB_CLK 16 +#define GCC_EVA_AXI0_CLK 17 +#define GCC_EVA_AXI0C_CLK 18 +#define GCC_EVA_XO_CLK 19 +#define GCC_GP1_CLK 20 +#define GCC_GP1_CLK_SRC 21 +#define GCC_GP2_CLK 22 +#define GCC_GP2_CLK_SRC 23 +#define GCC_GP3_CLK 24 +#define GCC_GP3_CLK_SRC 25 +#define GCC_GPLL0 26 +#define GCC_GPLL0_OUT_EVEN 27 +#define GCC_GPLL1 28 +#define GCC_GPLL4 29 +#define GCC_GPLL7 30 +#define GCC_GPLL9 31 +#define GCC_GPU_CFG_AHB_CLK 32 +#define GCC_GPU_GEMNOC_GFX_CLK 33 +#define GCC_GPU_GPLL0_CLK_SRC 34 +#define GCC_GPU_GPLL0_DIV_CLK_SRC 35 +#define GCC_QMIP_VIDEO_VCODEC_AHB_CLK 36 +#define GCC_QMIP_GPU_AHB_CLK 37 +#define GCC_PCIE_0_AUX_CLK 38 +#define GCC_PCIE_0_AUX_CLK_SRC 39 +#define GCC_PCIE_0_CFG_AHB_CLK 40 +#define GCC_PCIE_0_MSTR_AXI_CLK 41 +#define GCC_PCIE_0_PHY_AUX_CLK 42 +#define GCC_PCIE_0_PHY_AUX_CLK_SRC 43 +#define GCC_PCIE_0_PHY_RCHNG_CLK 44 +#define GCC_PCIE_0_PHY_RCHNG_CLK_SRC 45 +#define GCC_PCIE_0_PIPE_CLK 46 +#define GCC_PCIE_0_PIPE_CLK_SRC 47 +#define GCC_PCIE_0_SLV_AXI_CLK 48 +#define GCC_PCIE_0_SLV_Q2A_AXI_CLK 49 +#define GCC_PCIE_RSCC_CFG_AHB_CLK 50 +#define GCC_PCIE_RSCC_XO_CLK 51 +#define GCC_PDM2_CLK 52 +#define GCC_PDM2_CLK_SRC 53 +#define GCC_PDM_AHB_CLK 54 +#define GCC_PDM_XO4_CLK 55 +#define GCC_QUPV3_I2C_CORE_CLK 56 +#define GCC_QUPV3_I2C_S0_CLK 57 
+#define GCC_QUPV3_I2C_S0_CLK_SRC 58 +#define GCC_QUPV3_I2C_S1_CLK 59 +#define GCC_QUPV3_I2C_S1_CLK_SRC 60 +#define GCC_QUPV3_I2C_S2_CLK 61 +#define GCC_QUPV3_I2C_S2_CLK_SRC 62 +#define GCC_QUPV3_I2C_S3_CLK 63 +#define GCC_QUPV3_I2C_S3_CLK_SRC 64 +#define GCC_QUPV3_I2C_S4_CLK 65 +#define GCC_QUPV3_I2C_S4_CLK_SRC 66 +#define GCC_QUPV3_I2C_S_AHB_CLK 67 +#define GCC_QUPV3_WRAP1_CORE_2X_CLK 68 +#define GCC_QUPV3_WRAP1_CORE_CLK 69 +#define GCC_QUPV3_WRAP1_QSPI_REF_CLK 70 +#define GCC_QUPV3_WRAP1_QSPI_REF_CLK_SRC 71 +#define GCC_QUPV3_WRAP1_S0_CLK 72 +#define GCC_QUPV3_WRAP1_S0_CLK_SRC 73 +#define GCC_QUPV3_WRAP1_S1_CLK 74 +#define GCC_QUPV3_WRAP1_S1_CLK_SRC 75 +#define GCC_QUPV3_WRAP1_S2_CLK 76 +#define GCC_QUPV3_WRAP1_S2_CLK_SRC 77 +#define GCC_QUPV3_WRAP1_S3_CLK 78 +#define GCC_QUPV3_WRAP1_S3_CLK_SRC 79 +#define GCC_QUPV3_WRAP1_S4_CLK 80 +#define GCC_QUPV3_WRAP1_S4_CLK_SRC 81 +#define GCC_QUPV3_WRAP1_S5_CLK 82 +#define GCC_QUPV3_WRAP1_S5_CLK_SRC 83 +#define GCC_QUPV3_WRAP1_S6_CLK 84 +#define GCC_QUPV3_WRAP1_S6_CLK_SRC 85 +#define GCC_QUPV3_WRAP1_S7_CLK 86 +#define GCC_QUPV3_WRAP1_S7_CLK_SRC 87 +#define GCC_QUPV3_WRAP2_CORE_2X_CLK 88 +#define GCC_QUPV3_WRAP2_CORE_CLK 89 +#define GCC_QUPV3_WRAP2_S0_CLK 90 +#define GCC_QUPV3_WRAP2_S0_CLK_SRC 91 +#define GCC_QUPV3_WRAP2_S1_CLK 92 +#define GCC_QUPV3_WRAP2_S1_CLK_SRC 93 +#define GCC_QUPV3_WRAP2_S2_CLK 94 +#define GCC_QUPV3_WRAP2_S2_CLK_SRC 95 +#define GCC_QUPV3_WRAP2_S3_CLK 96 +#define GCC_QUPV3_WRAP2_S3_CLK_SRC 97 +#define GCC_QUPV3_WRAP2_S4_CLK 98 +#define GCC_QUPV3_WRAP2_S4_CLK_SRC 99 +#define GCC_QUPV3_WRAP3_CORE_2X_CLK 100 +#define GCC_QUPV3_WRAP3_CORE_CLK 101 +#define GCC_QUPV3_WRAP3_IBI_CTRL_0_CLK_SRC 102 +#define GCC_QUPV3_WRAP3_IBI_CTRL_1_CLK 103 +#define GCC_QUPV3_WRAP3_IBI_CTRL_2_CLK 104 +#define GCC_QUPV3_WRAP3_S0_CLK 105 +#define GCC_QUPV3_WRAP3_S0_CLK_SRC 106 +#define GCC_QUPV3_WRAP3_S1_CLK 107 +#define GCC_QUPV3_WRAP3_S1_CLK_SRC 108 +#define GCC_QUPV3_WRAP3_S2_CLK 109 +#define GCC_QUPV3_WRAP3_S2_CLK_SRC 110 
+#define GCC_QUPV3_WRAP3_S3_CLK 111 +#define GCC_QUPV3_WRAP3_S3_CLK_SRC 112 +#define GCC_QUPV3_WRAP3_S4_CLK 113 +#define GCC_QUPV3_WRAP3_S4_CLK_SRC 114 +#define GCC_QUPV3_WRAP3_S5_CLK 115 +#define GCC_QUPV3_WRAP3_S5_CLK_SRC 116 +#define GCC_QUPV3_WRAP4_CORE_2X_CLK 117 +#define GCC_QUPV3_WRAP4_CORE_CLK 118 +#define GCC_QUPV3_WRAP4_S0_CLK 119 +#define GCC_QUPV3_WRAP4_S0_CLK_SRC 120 +#define GCC_QUPV3_WRAP4_S1_CLK 121 +#define GCC_QUPV3_WRAP4_S1_CLK_SRC 122 +#define GCC_QUPV3_WRAP4_S2_CLK 123 +#define GCC_QUPV3_WRAP4_S2_CLK_SRC 124 +#define GCC_QUPV3_WRAP4_S3_CLK 125 +#define GCC_QUPV3_WRAP4_S3_CLK_SRC 126 +#define GCC_QUPV3_WRAP4_S4_CLK 127 +#define GCC_QUPV3_WRAP4_S4_CLK_SRC 128 +#define GCC_QUPV3_WRAP_1_M_AXI_CLK 129 +#define GCC_QUPV3_WRAP_1_S_AHB_CLK 130 +#define GCC_QUPV3_WRAP_2_M_AHB_CLK 131 +#define GCC_QUPV3_WRAP_2_S_AHB_CLK 132 +#define GCC_QUPV3_WRAP_3_IBI_1_AHB_CLK 133 +#define GCC_QUPV3_WRAP_3_IBI_2_AHB_CLK 134 +#define GCC_QUPV3_WRAP_3_M_AHB_CLK 135 +#define GCC_QUPV3_WRAP_3_S_AHB_CLK 136 +#define GCC_QUPV3_WRAP_4_M_AHB_CLK 137 +#define GCC_QUPV3_WRAP_4_S_AHB_CLK 138 +#define GCC_SDCC2_AHB_CLK 139 +#define GCC_SDCC2_APPS_CLK 140 +#define GCC_SDCC2_APPS_CLK_SRC 141 +#define GCC_SDCC4_AHB_CLK 142 +#define GCC_SDCC4_APPS_CLK 143 +#define GCC_SDCC4_APPS_CLK_SRC 144 +#define GCC_UFS_PHY_AHB_CLK 145 +#define GCC_UFS_PHY_AXI_CLK 146 +#define GCC_UFS_PHY_AXI_CLK_SRC 147 +#define GCC_UFS_PHY_ICE_CORE_CLK 148 +#define GCC_UFS_PHY_ICE_CORE_CLK_SRC 149 +#define GCC_UFS_PHY_PHY_AUX_CLK 150 +#define GCC_UFS_PHY_PHY_AUX_CLK_SRC 151 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK 152 +#define GCC_UFS_PHY_RX_SYMBOL_0_CLK_SRC 153 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK 154 +#define GCC_UFS_PHY_RX_SYMBOL_1_CLK_SRC 155 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK 156 +#define GCC_UFS_PHY_TX_SYMBOL_0_CLK_SRC 157 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK 158 +#define GCC_UFS_PHY_UNIPRO_CORE_CLK_SRC 159 +#define GCC_USB30_PRIM_MASTER_CLK 160 +#define GCC_USB30_PRIM_MASTER_CLK_SRC 161 +#define 
GCC_USB30_PRIM_MOCK_UTMI_CLK 162 +#define GCC_USB30_PRIM_MOCK_UTMI_CLK_SRC 163 +#define GCC_USB30_PRIM_MOCK_UTMI_POSTDIV_CLK_SRC 164 +#define GCC_USB30_PRIM_SLEEP_CLK 165 +#define GCC_USB3_PRIM_PHY_AUX_CLK 166 +#define GCC_USB3_PRIM_PHY_AUX_CLK_SRC 167 +#define GCC_USB3_PRIM_PHY_COM_AUX_CLK 168 +#define GCC_USB3_PRIM_PHY_PIPE_CLK 169 +#define GCC_USB3_PRIM_PHY_PIPE_CLK_SRC 170 +#define GCC_VIDEO_AHB_CLK 171 +#define GCC_VIDEO_AXI0_CLK 172 +#define GCC_VIDEO_AXI1_CLK 173 +#define GCC_VIDEO_XO_CLK 174 +#define GCC_QMIP_CAMERA_NRT_AHB_CLK 175 +#define GCC_QMIP_CAMERA_RT_AHB_CLK 176 +#define GCC_QMIP_DISP_DCP_SF_AHB_CLK 177 +#define GCC_QMIP_PCIE_AHB_CLK 178 +#define GCC_QMIP_VIDEO_CV_CPU_AHB_CLK 179 +#define GCC_QMIP_VIDEO_CVP_AHB_CLK 180 +#define GCC_QMIP_VIDEO_V_CPU_AHB_CLK 181 +#define GCC_DISP_AHB_CLK 182 + +/* GCC power domains */ +#define GCC_PCIE_0_GDSC 0 +#define GCC_PCIE_0_PHY_GDSC 1 +#define GCC_UFS_MEM_PHY_GDSC 2 +#define GCC_UFS_PHY_GDSC 3 +#define GCC_USB30_PRIM_GDSC 4 +#define GCC_USB3_PHY_GDSC 5 + +/* GCC resets */ +#define GCC_CAMERA_BCR 0 +#define GCC_DISPLAY_BCR 1 +#define GCC_EVA_AXI0_CLK_ARES 2 +#define GCC_EVA_AXI0C_CLK_ARES 3 +#define GCC_EVA_BCR 4 +#define GCC_GPU_BCR 5 +#define GCC_PCIE_0_BCR 6 +#define GCC_PCIE_0_LINK_DOWN_BCR 7 +#define GCC_PCIE_0_NOCSR_COM_PHY_BCR 8 +#define GCC_PCIE_0_PHY_BCR 9 +#define GCC_PCIE_0_PHY_NOCSR_COM_PHY_BCR 10 +#define GCC_PCIE_PHY_BCR 11 +#define GCC_PCIE_PHY_CFG_AHB_BCR 12 +#define GCC_PCIE_PHY_COM_BCR 13 +#define GCC_PCIE_RSCC_BCR 14 +#define GCC_PDM_BCR 15 +#define GCC_QUPV3_WRAPPER_1_BCR 16 +#define GCC_QUPV3_WRAPPER_2_BCR 17 +#define GCC_QUPV3_WRAPPER_3_BCR 18 +#define GCC_QUPV3_WRAPPER_4_BCR 19 +#define GCC_QUPV3_WRAPPER_I2C_BCR 20 +#define GCC_QUSB2PHY_PRIM_BCR 21 +#define GCC_QUSB2PHY_SEC_BCR 22 +#define GCC_SDCC2_BCR 23 +#define GCC_SDCC4_BCR 24 +#define GCC_UFS_PHY_BCR 25 +#define GCC_USB30_PRIM_BCR 26 +#define GCC_USB3_DP_PHY_PRIM_BCR 27 +#define GCC_USB3_DP_PHY_SEC_BCR 28 +#define 
GCC_USB3_PHY_PRIM_BCR 29 +#define GCC_USB3_PHY_SEC_BCR 30 +#define GCC_USB3PHY_PHY_PRIM_BCR 31 +#define GCC_USB3PHY_PHY_SEC_BCR 32 +#define GCC_VIDEO_AXI0_CLK_ARES 33 +#define GCC_VIDEO_AXI1_CLK_ARES 34 +#define GCC_VIDEO_BCR 35 +#define GCC_VIDEO_XO_CLK_ARES 36 + +#endif -- cgit v1.2.3 From d8f9581e1b7f1fe2e1ac985f4ea508d044c90733 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 29 Oct 2025 17:32:56 +0000 Subject: ipv6: Add in6_dev_rcu(). rcu_dereference_rtnl() does not clearly tell whether the caller is under RCU or RTNL. Let's add in6_dev_rcu() to make it easy to remove __in6_dev_get() in the future. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20251029173344.2934622-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/addrconf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 9e5e95988b9e..78e8b877fb25 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -347,6 +347,11 @@ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->ip6_ptr); } +static inline struct inet6_dev *in6_dev_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->ip6_ptr); +} + static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev) { return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr); -- cgit v1.2.3 From e833eb25161aae6cd0caf14782f405d0ed5765ed Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 29 Oct 2025 17:33:04 +0000 Subject: mpls: Protect net->mpls.platform_label with a per-netns mutex. MPLS (re)uses RTNL to protect net->mpls.platform_label, but the lock does not need to be RTNL at all. Let's protect net->mpls.platform_label with a dedicated per-netns mutex. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Guillaume Nault Link: https://patch.msgid.link/20251029173344.2934622-13-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/mpls.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index 19ad2574b267..6682e51513ef 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -16,6 +16,7 @@ struct netns_mpls { int default_ttl; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; + struct mutex platform_mutex; struct ctl_table_header *ctl; }; -- cgit v1.2.3 From c18d4b190a46651726c9a952667c74d2deb33c28 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Tue, 28 Oct 2025 20:30:05 +0000 Subject: net: Extend NAPI threaded polling to allow kthread based busy polling Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to enable and disable threaded busy polling. When threaded busy polling is enabled for a NAPI, enable NAPI_STATE_THREADED also. When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to signal napi_complete_done not to rearm interrupts. Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, NAPI_STATE_IN_BUSY_POLL is unset as well, and napi_complete_done() also unsets the NAPI_STATE_SCHED_THREADED bit, which in turn makes the kthread go to sleep.
Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Acked-by: Martin Karsten Tested-by: Martin Karsten Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 4 +++- include/uapi/linux/netdev.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9c1e5042c5e7..e808071dbb7d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -423,11 +423,12 @@ enum { NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ - NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ + NAPI_STATE_IN_BUSY_POLL, /* Do not rearm NAPI interrupt */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ + NAPI_STATE_THREADED_BUSY_POLL, /* The threaded NAPI poller will busy poll */ }; enum { @@ -442,6 +443,7 @@ enum { NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), + NAPIF_STATE_THREADED_BUSY_POLL = BIT(NAPI_STATE_THREADED_BUSY_POLL), }; enum gro_result { diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 48eb49aa03d4..048c8de1a130 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -80,6 +80,7 @@ enum netdev_qstats_scope { enum netdev_napi_threaded { NETDEV_NAPI_THREADED_DISABLED, NETDEV_NAPI_THREADED_ENABLED, + NETDEV_NAPI_THREADED_BUSY_POLL, }; enum { -- cgit v1.2.3 From abcf6eef90c6e47efed62a7c233ffc1a6a90797e Mon Sep 17 00:00:00 2001 From: Oleksij Rempel 
Date: Mon, 27 Oct 2025 13:27:58 +0100 Subject: net: phy: introduce internal API for PHY MSE diagnostics Add the base infrastructure for Mean Square Error (MSE) diagnostics, as proposed by the OPEN Alliance "Advanced diagnostic features for 100BASE-T1 automotive Ethernet PHYs" [1] specification. The OPEN Alliance spec defines only average MSE and average peak MSE over a fixed number of symbols. However, other PHYs, such as the KSZ9131, additionally expose a worst-peak MSE value latched since the last channel capture. This API accounts for such vendor extensions by adding a distinct capability bit and snapshot field. Channel-to-pair mapping is normally straightforward, but in some cases (e.g. 100BASE-TX with MDI-X resolution unknown) the mapping is ambiguous. If hardware does not expose MDI-X status, the exact pair cannot be determined. To avoid returning misleading per-channel data in this case, a LINK selector is defined for aggregate MSE measurements. All investigated devices differ in MSE capabilities, such as sample rate, number of analyzed symbols, and scaling factors. For example, the KSZ9131 uses different scaling for MSE and pMSE. To make this visible to callers, scale limits and timing information are returned via get_mse_capability(). Some PHYs sample very few symbols at high frequency (e.g. 2 us update rate). To cover such cases and allow for future high-speed PHYs with even shorter intervals, the refresh rate is reported as u64 in picoseconds. This patch introduces the internal PHY API for Mean Square Error diagnostics. It defines new kernel-side data types and driver hooks: - struct phy_mse_capability: describes supported metrics, scale limits, update interval, and sampling length. - struct phy_mse_snapshot: holds one correlated measurement set. - New phy_driver ops: `get_mse_capability()` and `get_mse_snapshot()`. These definitions form the core kernel API. No user-visible interfaces are added in this commit. 
Standardization notes: OPEN Alliance defines presence and interpretation of some metrics but does not fix numeric scales or sampling internals: - SQI (3-bit, 0..7) is mandatory; correlation to SNR/BER is informative (OA 100BASE-T1 TC1 v1.0 6.1.2; OA 1000BASE-T1 TC12 v2.2 6.1.2). - MSE is optional; OA recommends 2^16 symbols and scaling to 0..511, with a worst-case latch since last read (OA 100BASE-T1 TC1 v1.0 6.1.1; OA 1000BASE-T1 TC12 v2.2 6.1.1). Refresh is recommended (~0.8-2.0 ms for 100BASE-T1; ~80-200 us for 1000BASE-T1). Exact scaling/time windows are vendor-specific. - Peak MSE (pMSE) is defined only for 100BASE-T1 as optional, e.g. 128-symbol sliding window with 8-bit range and worst-case latch (OA 100BASE-T1 TC1 v1.0 6.1.3). Therefore this API exposes which measures and selectors a PHY supports, and documents where behavior is standard-referenced vs vendor-specific. [1] Signed-off-by: Oleksij Rempel Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20251027122801.982364-2-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 206 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 358dd6f0ff96..e3474f03cbc1 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -903,6 +903,165 @@ struct phy_led { #define to_phy_led(d) container_of(d, struct phy_led, led_cdev) +/* + * PHY_MSE_CAP_* - Bitmask flags for Mean Square Error (MSE) capabilities + * + * These flags describe which MSE metrics and selectors are implemented + * by the PHY for the current link mode. They are used in + * struct phy_mse_capability.supported_caps. + * + * Standardization: + * The OPEN Alliance (OA) defines the presence of MSE/SQI/pMSE but not their + * numeric scaling, update intervals, or aggregation windows. 
See: + * OA 100BASE-T1 TC1 v1.0, sections 6.1.1-6.1.3 + * OA 1000BASE-T1 TC12 v2.2, sections 6.1.1-6.1.2 + * + * Description of flags: + * + * PHY_MSE_CAP_CHANNEL_A + * Per-pair diagnostics for Channel A are supported. Mapping to the + * physical wire pair may depend on MDI/MDI-X polarity. + * + * PHY_MSE_CAP_CHANNEL_B, _C, _D + * Same as above for channels B-D. + * + * PHY_MSE_CAP_WORST_CHANNEL + * The PHY or driver can identify and report the single worst-performing + * channel without querying each one individually. + * + * PHY_MSE_CAP_LINK + * The PHY provides only a link-wide aggregate measurement or cannot map + * results to a specific pair (for example 100BASE-TX with unknown + * MDI/MDI-X). + * + * PHY_MSE_CAP_AVG + * Average MSE (mean DCQ metric) is supported. For 100/1000BASE-T1 the OA + * recommends 2^16 symbols, scaled 0..511, but the exact scaling is + * vendor-specific. + * + * PHY_MSE_CAP_PEAK + * Peak MSE (current peak within the measurement window) is supported. + * Defined as pMSE for 100BASE-T1; vendor-specific for others. + * + * PHY_MSE_CAP_WORST_PEAK + * Latched worst-case peak MSE since the last read (read-to-clear if + * implemented). Optional in OA 100BASE-T1 TC1 6.1.3. + */ +#define PHY_MSE_CAP_CHANNEL_A BIT(0) +#define PHY_MSE_CAP_CHANNEL_B BIT(1) +#define PHY_MSE_CAP_CHANNEL_C BIT(2) +#define PHY_MSE_CAP_CHANNEL_D BIT(3) +#define PHY_MSE_CAP_WORST_CHANNEL BIT(4) +#define PHY_MSE_CAP_LINK BIT(5) +#define PHY_MSE_CAP_AVG BIT(6) +#define PHY_MSE_CAP_PEAK BIT(7) +#define PHY_MSE_CAP_WORST_PEAK BIT(8) + +/* + * enum phy_mse_channel - Identifiers for selecting MSE measurement channels + * + * PHY_MSE_CHANNEL_A - PHY_MSE_CHANNEL_D + * Select per-pair measurement for the corresponding channel. + * + * PHY_MSE_CHANNEL_WORST + * Select the single worst-performing channel reported by hardware. + * + * PHY_MSE_CHANNEL_LINK + * Select link-wide aggregate data (used when per-pair results are + * unavailable). 
+ */ +enum phy_mse_channel { + PHY_MSE_CHANNEL_A, + PHY_MSE_CHANNEL_B, + PHY_MSE_CHANNEL_C, + PHY_MSE_CHANNEL_D, + PHY_MSE_CHANNEL_WORST, + PHY_MSE_CHANNEL_LINK, +}; + +/** + * struct phy_mse_capability - Capabilities of Mean Square Error (MSE) + * measurement interface + * + * Standardization notes: + * + * - Presence of MSE/SQI/pMSE is defined by OPEN Alliance specs, but numeric + * scaling, refresh/update rate and aggregation windows are not fixed and + * are vendor-/product-specific. (OA 100BASE-T1 TC1 v1.0 6.1.*; + * OA 1000BASE-T1 TC12 v2.2 6.1.*) + * + * - Typical recommendations: 2^16 symbols and 0..511 scaling for MSE; pMSE only + * defined for 100BASE-T1 (sliding window example), others are vendor + * extensions. Drivers must report actual scale/limits here. + * + * Describes the MSE measurement capabilities for the current link mode. These + * properties are dynamic and may change when link settings are modified. + * Callers should re-query this capability after any link state change to + * ensure they have the most up-to-date information. + * + * Callers should only request measurements for channels and types that are + * indicated as supported by the @supported_caps bitmask. If @supported_caps + * is 0, the device provides no MSE diagnostics, and driver operations should + * typically return -EOPNOTSUPP. + * + * Snapshot values for average and peak MSE can be normalized to a 0..1 ratio + * by dividing the raw snapshot by the corresponding @max_average_mse or + * @max_peak_mse value. + * + * @max_average_mse: The maximum value for an average MSE snapshot. This + * defines the scale for the measurement. If the PHY_MSE_CAP_AVG capability is + * supported, this value MUST be greater than 0. (vendor-specific units). + * @max_peak_mse: The maximum value for a peak MSE snapshot. If either + * PHY_MSE_CAP_PEAK or PHY_MSE_CAP_WORST_PEAK is supported, this value MUST + * be greater than 0. (vendor-specific units). 
+ * @refresh_rate_ps: The typical interval, in picoseconds, between hardware + * updates of the MSE values. This is an estimate, and callers should not + * assume synchronous sampling. (vendor-specific value). + * @num_symbols: The number of symbols aggregated per hardware sample to + * calculate the MSE. (vendor-specific value). + * @supported_caps: A bitmask of PHY_MSE_CAP_* values indicating which + * measurement types (e.g., average, peak) and channels + * (e.g., per-pair or link-wide) are supported. + */ +struct phy_mse_capability { + u64 max_average_mse; + u64 max_peak_mse; + u64 refresh_rate_ps; + u64 num_symbols; + u32 supported_caps; +}; + +/** + * struct phy_mse_snapshot - A snapshot of Mean Square Error (MSE) diagnostics + * + * Holds a set of MSE diagnostic values that were all captured from a single + * measurement window. + * + * Values are raw, device-scaled and not normalized. Use struct + * phy_mse_capability to interpret the scale and sampling window. + * + * @average_mse: The average MSE value over the measurement window. + * OPEN Alliance references MSE as a DCQ metric; recommends 2^16 symbols and + * 0..511 scaling. Exact scale and refresh are vendor-specific. + * (100BASE-T1 TC1 v1.0 6.1.1; 1000BASE-T1 TC12 v2.2 6.1.1). + * + * @peak_mse: The peak MSE value observed within the measurement window. + * For 100BASE-T1, "pMSE" is optional and may be implemented via a sliding + * 128-symbol window with periodic capture; not standardized for 1000BASE-T1. + * (100BASE-T1 TC1 v1.0 6.1.3, Table "DCQ.peakMSE"). + * + * @worst_peak_mse: A latched high-water mark of the peak MSE since last read + * (read-to-clear if implemented). OPEN Alliance shows a latched "worst case + * peak MSE" for 100BASE-T1 pMSE; availability/semantics outside that are + * vendor-specific. (100BASE-T1 TC1 v1.0 6.1.3, DCQ.peakMSE high byte; + * 1000BASE-T1 TC12 v2.2 treats DCQ details as vendor-specific.)
+ */ +struct phy_mse_snapshot { + u64 average_mse; + u64 peak_mse; + u64 worst_peak_mse; +}; + /** * struct phy_driver - Driver structure for a particular PHY type * @@ -1184,6 +1343,53 @@ struct phy_driver { /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); + /** + * @get_mse_capability: Get capabilities and scale of MSE measurement + * @dev: PHY device + * @cap: Output (filled on success) + * + * Fill @cap with the PHY's MSE capability for the current + * link mode: scale limits (max_average_mse, max_peak_mse), update + * interval (refresh_rate_ps), sample length (num_symbols) and the + * capability bitmask (supported_caps). + * + * Implementations may defer capability report until hardware has + * converged; in that case they should return -EAGAIN and allow the + * caller to retry later. + * + * Return: 0 on success. On failure, returns a negative errno code, such + * as -EOPNOTSUPP if MSE measurement is not supported by the PHY or in + * the current link mode, or -EAGAIN if the capability information is + * not yet available. + */ + int (*get_mse_capability)(struct phy_device *dev, + struct phy_mse_capability *cap); + + /** + * @get_mse_snapshot: Retrieve a snapshot of MSE diagnostic values + * @dev: PHY device + * @channel: Channel identifier (PHY_MSE_CHANNEL_*) + * @snapshot: Output (filled on success) + * + * Fill @snapshot with a correlated set of MSE values from the most + * recent measurement window. + * + * Callers must validate @channel against supported_caps returned by + * get_mse_capability(). Drivers must not coerce @channel; if the + * requested selector is not implemented by the device or current link + * mode, the operation must fail. + * + * worst_peak_mse is latched and must be treated as read-to-clear. + * + * Return: 0 on success. 
On failure, returns a negative errno code, such + * as -EOPNOTSUPP if MSE measurement is not supported by the PHY or in + * the current link mode, or -EAGAIN if measurements are not yet + * available. + */ + int (*get_mse_snapshot)(struct phy_device *dev, + enum phy_mse_channel channel, + struct phy_mse_snapshot *snapshot); + /* PLCA RS interface */ /** @get_plca_cfg: Return the current PLCA configuration */ int (*get_plca_cfg)(struct phy_device *dev, -- cgit v1.2.3 From e6e93fb01302e9b7a15d17f3b8a00eff8a601654 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 27 Oct 2025 13:27:59 +0100 Subject: ethtool: netlink: add ETHTOOL_MSG_MSE_GET and wire up PHY MSE access Introduce the userspace entry point for PHY MSE diagnostics via ethtool netlink. This exposes the core API added previously and returns both capability information and one or more snapshots. Userspace sends ETHTOOL_MSG_MSE_GET. The reply carries: - ETHTOOL_A_MSE_CAPABILITIES: scale limits and timing information - ETHTOOL_A_MSE_CHANNEL_* nests: one or more snapshots (per-channel if available, otherwise WORST, otherwise LINK) Link down returns -ENETDOWN. Changes: - YAML: add attribute sets (mse, mse-capabilities, mse-snapshot) and the mse-get operation - UAPI (generated): add ETHTOOL_A_MSE_* enums and message IDs, ETHTOOL_MSG_MSE_GET/REPLY - ethtool core: add net/ethtool/mse.c implementing the request, register genl op, and hook into ethnl dispatch - docs: document MSE_GET in ethtool-netlink.rst The include/uapi/linux/ethtool_netlink_generated.h is generated from Documentation/netlink/specs/ethtool.yaml. 
Signed-off-by: Oleksij Rempel Link: https://patch.msgid.link/20251027122801.982364-3-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 35 ++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 0e8ac0d974e2..b71b175df46d 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -803,6 +803,39 @@ enum { ETHTOOL_A_PSE_NTF_MAX = (__ETHTOOL_A_PSE_NTF_CNT - 1) }; +enum { + ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE = 1, + ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE, + ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS, + ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS, + + __ETHTOOL_A_MSE_CAPABILITIES_CNT, + ETHTOOL_A_MSE_CAPABILITIES_MAX = (__ETHTOOL_A_MSE_CAPABILITIES_CNT - 1) +}; + +enum { + ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE = 1, + ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE, + ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE, + + __ETHTOOL_A_MSE_SNAPSHOT_CNT, + ETHTOOL_A_MSE_SNAPSHOT_MAX = (__ETHTOOL_A_MSE_SNAPSHOT_CNT - 1) +}; + +enum { + ETHTOOL_A_MSE_HEADER = 1, + ETHTOOL_A_MSE_CAPABILITIES, + ETHTOOL_A_MSE_CHANNEL_A, + ETHTOOL_A_MSE_CHANNEL_B, + ETHTOOL_A_MSE_CHANNEL_C, + ETHTOOL_A_MSE_CHANNEL_D, + ETHTOOL_A_MSE_WORST_CHANNEL, + ETHTOOL_A_MSE_LINK, + + __ETHTOOL_A_MSE_CNT, + ETHTOOL_A_MSE_MAX = (__ETHTOOL_A_MSE_CNT - 1) +}; + enum { ETHTOOL_MSG_USER_NONE = 0, ETHTOOL_MSG_STRSET_GET = 1, @@ -855,6 +888,7 @@ enum { ETHTOOL_MSG_RSS_SET, ETHTOOL_MSG_RSS_CREATE_ACT, ETHTOOL_MSG_RSS_DELETE_ACT, + ETHTOOL_MSG_MSE_GET, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) @@ -915,6 +949,7 @@ enum { ETHTOOL_MSG_RSS_CREATE_ACT_REPLY, ETHTOOL_MSG_RSS_CREATE_NTF, ETHTOOL_MSG_RSS_DELETE_NTF, + ETHTOOL_MSG_MSE_GET_REPLY, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) -- cgit v1.2.3 From 
85d55d8cc3ef7f77b249c97e9fac6a0fc5f5daa7 Mon Sep 17 00:00:00 2001 From: Akhil P Oommen Date: Tue, 30 Sep 2025 11:18:06 +0530 Subject: soc: qcom: ubwc: Add config for Kaanapali Add the ubwc configuration for the Kaanapali chipset. This chipset brings support for UBWC v6. The rest of the configuration remains as usual. Signed-off-by: Akhil P Oommen Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250930-kaana-gpu-support-v1-1-73530b0700ed@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/ubwc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/soc/qcom/ubwc.h b/include/linux/soc/qcom/ubwc.h index 1ed8b1b16bc9..0a4edfe3d96d 100644 --- a/include/linux/soc/qcom/ubwc.h +++ b/include/linux/soc/qcom/ubwc.h @@ -52,6 +52,7 @@ struct qcom_ubwc_cfg_data { #define UBWC_4_0 0x40000000 #define UBWC_4_3 0x40030000 #define UBWC_5_0 0x50000000 +#define UBWC_6_0 0x60000000 #if IS_ENABLED(CONFIG_QCOM_UBWC_CONFIG) const struct qcom_ubwc_cfg_data *qcom_ubwc_config_get_data(void); -- cgit v1.2.3 From 603c646f001008eaf8b5a7a888043e5cc8c494a2 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:53 -0700 Subject: coco/tsm: Introduce a core device for TEE Security Managers A "TSM" is a platform component that provides an API for securely provisioning resources for a confidential guest (TVM) to consume. The name originates from the PCI specification for the platform agent that carries out operations for PCIe TDISP (TEE Device Interface Security Protocol). Instances of this core device are parented by a device representing the platform security function like CONFIG_CRYPTO_DEV_CCP or CONFIG_INTEL_TDX_HOST. This device interface is a frontend to the aspects of a TSM and TEE I/O that are cross-architecture common.
This includes mechanisms like enumerating available platform TEE I/O capabilities and provisioning connections between the platform TSM and device DSMs (Device Security Manager (TDISP)). For now this is just the scaffolding for registering a TSM device sysfs interface. Cc: Xu Yilun Reviewed-by: Jonathan Cameron Co-developed-by: Aneesh Kumar K.V (Arm) Signed-off-by: Aneesh Kumar K.V (Arm) Acked-by: Bjorn Helgaas Reviewed-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251031212902.2256310-2-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/tsm.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 431054810dca..cd97c63ffa32 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -5,6 +5,7 @@ #include #include #include +#include #define TSM_REPORT_INBLOB_MAX 64 #define TSM_REPORT_OUTBLOB_MAX SZ_32K @@ -107,6 +108,16 @@ struct tsm_report_ops { bool (*report_bin_attr_visible)(int n); }; +struct tsm_dev { + struct device dev; + int id; +}; + +DEFINE_FREE(put_tsm_dev, struct tsm_dev *, + if (!IS_ERR_OR_NULL(_T)) put_device(&_T->dev)) + int tsm_report_register(const struct tsm_report_ops *ops, void *priv); int tsm_report_unregister(const struct tsm_report_ops *ops); +struct tsm_dev *tsm_register(struct device *parent); +void tsm_unregister(struct tsm_dev *tsm_dev); #endif /* __TSM_H */ -- cgit v1.2.3 From f16469ee733ac52b2373216803699cbb05e82786 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:54 -0700 Subject: PCI/IDE: Enumerate Selective Stream IDE capabilities Link encryption is a new PCIe feature enumerated by "PCIe r7.0 section 7.9.26 IDE Extended Capability". It is both a standalone port + endpoint capability, and a building block for the security protocol defined by "PCIe r7.0 section 11 TEE Device Interface Security Protocol (TDISP)". 
That protocol coordinates device security setup between a platform TSM (TEE Security Manager) and a device DSM (Device Security Manager). While the platform TSM can allocate resources like Stream ID and manage keys, it still requires system software to manage the IDE capability register block. Add register definitions and basic enumeration in preparation for Selective IDE Stream establishment. A follow on change selects the new CONFIG_PCI_IDE symbol. Note that while the IDE specification defines both a point-to-point "Link Stream" and a Root Port to endpoint "Selective Stream", only "Selective Stream" is considered for Linux as that is the predominant mode expected by Trusted Execution Environment Security Managers (TSMs), and it is the security model that limits the number of PCI components within the TCB in a PCIe topology with switches. Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Reviewed-by: Aneesh Kumar K.V Link: https://patch.msgid.link/20251031212902.2256310-3-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci.h | 7 ++++ include/uapi/linux/pci_regs.h | 81 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..4402ca931124 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -539,6 +539,13 @@ struct pci_dev { #endif #ifdef CONFIG_PCI_NPEM struct npem *npem; /* Native PCIe Enclosure Management */ +#endif +#ifdef CONFIG_PCI_IDE + u16 ide_cap; /* Link Integrity & Data Encryption */ + u8 nr_ide_mem; /* Address association resources for streams */ + u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ + unsigned int ide_cfg:1; /* Config cycles over IDE */ + unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ #endif u16 acs_cap; /* ACS Capability 
offset */ u8 supported_speeds; /* Supported Link Speeds Vector */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 07e06aafec50..05bd22d9e352 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -754,6 +754,7 @@ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ #define PCI_EXT_CAP_ID_PL_32GT 0x2A /* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_IDE 0x30 /* Integrity and Data Encryption */ #define PCI_EXT_CAP_ID_PL_64GT 0x31 /* Physical Layer 64.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_64GT @@ -1249,4 +1250,84 @@ #define PCI_DVSEC_CXL_PORT_CTL 0x0c #define PCI_DVSEC_CXL_PORT_CTL_UNMASK_SBR 0x00000001 +/* Integrity and Data Encryption Extended Capability */ +#define PCI_IDE_CAP 0x04 +#define PCI_IDE_CAP_LINK 0x1 /* Link IDE Stream Supported */ +#define PCI_IDE_CAP_SELECTIVE 0x2 /* Selective IDE Streams Supported */ +#define PCI_IDE_CAP_FLOWTHROUGH 0x4 /* Flow-Through IDE Stream Supported */ +#define PCI_IDE_CAP_PARTIAL_HEADER_ENC 0x8 /* Partial Header Encryption Supported */ +#define PCI_IDE_CAP_AGGREGATION 0x10 /* Aggregation Supported */ +#define PCI_IDE_CAP_PCRC 0x20 /* PCRC Supported */ +#define PCI_IDE_CAP_IDE_KM 0x40 /* IDE_KM Protocol Supported */ +#define PCI_IDE_CAP_SEL_CFG 0x80 /* Selective IDE for Config Request Support */ +#define PCI_IDE_CAP_ALG __GENMASK(12, 8) /* Supported Algorithms */ +#define PCI_IDE_CAP_ALG_AES_GCM_256 0 /* AES-GCM 256 key size, 96b MAC */ +#define PCI_IDE_CAP_LINK_TC_NUM __GENMASK(15, 13) /* Link IDE TCs */ +#define PCI_IDE_CAP_SEL_NUM __GENMASK(23, 16) /* Supported Selective IDE Streams */ +#define PCI_IDE_CAP_TEE_LIMITED 0x1000000 /* TEE-Limited Stream Supported */ +#define PCI_IDE_CTL 0x08 +#define PCI_IDE_CTL_FLOWTHROUGH_IDE 0x4 /* Flow-Through IDE Stream Enabled */ + +#define PCI_IDE_LINK_STREAM_0 0xc /* First Link Stream Register Block */ +#define 
PCI_IDE_LINK_BLOCK_SIZE 8 +/* Link IDE Stream block, up to PCI_IDE_CAP_LINK_TC_NUM */ +#define PCI_IDE_LINK_CTL_0 0x00 /* First Link Control Register Offset in block */ +#define PCI_IDE_LINK_CTL_EN 0x1 /* Link IDE Stream Enable */ +#define PCI_IDE_LINK_CTL_TX_AGGR_NPR __GENMASK(3, 2) /* Tx Aggregation Mode NPR */ +#define PCI_IDE_LINK_CTL_TX_AGGR_PR __GENMASK(5, 4) /* Tx Aggregation Mode PR */ +#define PCI_IDE_LINK_CTL_TX_AGGR_CPL __GENMASK(7, 6) /* Tx Aggregation Mode CPL */ +#define PCI_IDE_LINK_CTL_PCRC_EN 0x100 /* PCRC Enable */ +#define PCI_IDE_LINK_CTL_PART_ENC __GENMASK(13, 10) /* Partial Header Encryption Mode */ +#define PCI_IDE_LINK_CTL_ALG __GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */ +#define PCI_IDE_LINK_CTL_TC __GENMASK(21, 19) /* Traffic Class */ +#define PCI_IDE_LINK_CTL_ID __GENMASK(31, 24) /* Stream ID */ +#define PCI_IDE_LINK_STS_0 0x4 /* First Link Status Register Offset in block */ +#define PCI_IDE_LINK_STS_STATE __GENMASK(3, 0) /* Link IDE Stream State */ +#define PCI_IDE_LINK_STS_IDE_FAIL 0x80000000 /* IDE fail message received */ + +/* Selective IDE Stream block, up to PCI_IDE_CAP_SEL_NUM */ +/* Selective IDE Stream Capability Register */ +#define PCI_IDE_SEL_CAP 0x00 +#define PCI_IDE_SEL_CAP_ASSOC_NUM __GENMASK(3, 0) +/* Selective IDE Stream Control Register */ +#define PCI_IDE_SEL_CTL 0x04 +#define PCI_IDE_SEL_CTL_EN 0x1 /* Selective IDE Stream Enable */ +#define PCI_IDE_SEL_CTL_TX_AGGR_NPR __GENMASK(3, 2) /* Tx Aggregation Mode NPR */ +#define PCI_IDE_SEL_CTL_TX_AGGR_PR __GENMASK(5, 4) /* Tx Aggregation Mode PR */ +#define PCI_IDE_SEL_CTL_TX_AGGR_CPL __GENMASK(7, 6) /* Tx Aggregation Mode CPL */ +#define PCI_IDE_SEL_CTL_PCRC_EN 0x100 /* PCRC Enable */ +#define PCI_IDE_SEL_CTL_CFG_EN 0x200 /* Selective IDE for Configuration Requests */ +#define PCI_IDE_SEL_CTL_PART_ENC __GENMASK(13, 10) /* Partial Header Encryption Mode */ +#define PCI_IDE_SEL_CTL_ALG __GENMASK(18, 14) /* Selection from PCI_IDE_CAP_ALG */ +#define
PCI_IDE_SEL_CTL_TC __GENMASK(21, 19) /* Traffic Class */ +#define PCI_IDE_SEL_CTL_DEFAULT 0x400000 /* Default Stream */ +#define PCI_IDE_SEL_CTL_TEE_LIMITED 0x800000 /* TEE-Limited Stream */ +#define PCI_IDE_SEL_CTL_ID __GENMASK(31, 24) /* Stream ID */ +#define PCI_IDE_SEL_CTL_ID_MAX 255 +/* Selective IDE Stream Status Register */ +#define PCI_IDE_SEL_STS 0x08 +#define PCI_IDE_SEL_STS_STATE __GENMASK(3, 0) /* Selective IDE Stream State */ +#define PCI_IDE_SEL_STS_STATE_INSECURE 0 +#define PCI_IDE_SEL_STS_STATE_SECURE 2 +#define PCI_IDE_SEL_STS_IDE_FAIL 0x80000000 /* IDE fail message received */ +/* IDE RID Association Register 1 */ +#define PCI_IDE_SEL_RID_1 0x0c +#define PCI_IDE_SEL_RID_1_LIMIT __GENMASK(23, 8) +/* IDE RID Association Register 2 */ +#define PCI_IDE_SEL_RID_2 0x10 +#define PCI_IDE_SEL_RID_2_VALID 0x1 +#define PCI_IDE_SEL_RID_2_BASE __GENMASK(23, 8) +#define PCI_IDE_SEL_RID_2_SEG __GENMASK(31, 24) +/* Selective IDE Address Association Register Block, up to PCI_IDE_SEL_CAP_ASSOC_NUM */ +#define PCI_IDE_SEL_ADDR_BLOCK_SIZE 12 +#define PCI_IDE_SEL_ADDR_1(x) (20 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +#define PCI_IDE_SEL_ADDR_1_VALID 0x1 +#define PCI_IDE_SEL_ADDR_1_BASE_LOW __GENMASK(19, 8) +#define PCI_IDE_SEL_ADDR_1_LIMIT_LOW __GENMASK(31, 20) +/* IDE Address Association Register 2 is "Memory Limit Upper" */ +#define PCI_IDE_SEL_ADDR_2(x) (24 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +/* IDE Address Association Register 3 is "Memory Base Upper" */ +#define PCI_IDE_SEL_ADDR_3(x) (28 + (x) * PCI_IDE_SEL_ADDR_BLOCK_SIZE) +#define PCI_IDE_SEL_BLOCK_SIZE(nr_assoc) (20 + PCI_IDE_SEL_ADDR_BLOCK_SIZE * (nr_assoc)) + #endif /* LINUX_PCI_REGS_H */ -- cgit v1.2.3 From 215afa89d249bb095126cf00f8be719e421c75e9 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:55 -0700 Subject: PCI: Introduce pci_walk_bus_reverse(), for_each_pci_dev_reverse() PCI/TSM, the PCI core functionality for the PCIe TEE Device Interface Security Protocol (TDISP), has a need 
to walk all subordinate functions of a Device Security Manager (DSM) to set up a device security context. A DSM is physical function 0 of a multi-function or SR-IOV device endpoint, or it is an upstream switch port. In error scenarios or when a TEE Security Manager (TSM) device is removed, it needs to unwind all established DSM contexts. Introduce reverse versions of PCI device iteration helpers to mirror the setup path and ensure that dependent children are handled before parents. Cc: Greg Kroah-Hartman Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-4-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/device/bus.h | 3 +++ include/linux/pci.h | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/device/bus.h b/include/linux/device/bus.h index f5a56efd2bd6..99b1002b3e31 100644 --- a/include/linux/device/bus.h +++ b/include/linux/device/bus.h @@ -150,6 +150,9 @@ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, device_iter_t fn); struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match); +struct device *bus_find_device_reverse(const struct bus_type *bus, + struct device *start, const void *data, + device_match_t match); /** * bus_find_device_by_name - device iterator for locating a particular device * of a specific name.
diff --git a/include/linux/pci.h b/include/linux/pci.h index 4402ca931124..b6a12a82be12 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -582,6 +582,8 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus); #define to_pci_dev(n) container_of(n, struct pci_dev, dev) #define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) +#define for_each_pci_dev_reverse(d) \ + while ((d = pci_get_device_reverse(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL) static inline int pci_channel_offline(struct pci_dev *pdev) { @@ -1242,6 +1244,8 @@ u64 pci_get_dsn(struct pci_dev *dev); struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from); +struct pci_dev *pci_get_device_reverse(unsigned int vendor, unsigned int device, + struct pci_dev *from); struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from); @@ -1661,6 +1665,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *), void *userdata); +void pci_walk_bus_reverse(struct pci_bus *top, + int (*cb)(struct pci_dev *, void *), void *userdata); int pci_cfg_space_size(struct pci_dev *dev); unsigned char pci_bus_max_busnr(struct pci_bus *bus); resource_size_t pcibios_window_alignment(struct pci_bus *bus, @@ -2049,6 +2055,11 @@ static inline struct pci_dev *pci_get_device(unsigned int vendor, struct pci_dev *from) { return NULL; } +static inline struct pci_dev *pci_get_device_reverse(unsigned int vendor, + unsigned int device, + struct pci_dev *from) +{ return NULL; } + static inline struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, -- cgit v1.2.3 From 3225f52cde56f46789a4972d3c54df8a4d75f022 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:56 -0700 Subject: PCI/TSM: Establish Secure Sessions and Link Encryption The PCIe 
7.0 specification, section 11, defines the Trusted Execution Environment (TEE) Device Interface Security Protocol (TDISP). This protocol definition builds upon Component Measurement and Authentication (CMA), and link Integrity and Data Encryption (IDE). It adds support for assigning devices (PCI physical or virtual function) to a confidential VM such that the assigned device is enabled to access guest private memory protected by technologies like Intel TDX, AMD SEV-SNP, RISCV COVE, or ARM CCA. The "TSM" (TEE Security Manager) is a concept in the TDISP specification of an agent that mediates between a "DSM" (Device Security Manager) and system software in both a VMM and a confidential VM. A VMM uses TSM ABIs to setup link security and assign devices. A confidential VM uses TSM ABIs to transition an assigned device into the TDISP "RUN" state and validate its configuration. From a Linux perspective the TSM abstracts many of the details of TDISP, IDE, and CMA. Some of those details leak through at times, but for the most part TDISP is an internal implementation detail of the TSM. CONFIG_PCI_TSM adds an "authenticated" attribute and "tsm/" subdirectory to pci-sysfs. Consider that the TSM driver may itself be a PCI driver. Userspace can watch for the arrival of a "TSM" device, /sys/class/tsm/tsm0/uevent KOBJ_CHANGE, to know when the PCI core has initialized TSM services. The operations that can be executed against a PCI device are split into two mutually exclusive operation sets, "Link" and "Security" (struct pci_tsm_{link,security}_ops). The "Link" operations manage physical link security properties and communication with the device's Device Security Manager firmware. These are the host side operations in TDISP. The "Security" operations coordinate the security state of the assigned virtual device (TDI). These are the guest side operations in TDISP. Only "link" (Secure Session and physical Link Encryption) operations are defined at this stage. 
There are placeholders for the device security (Trusted Computing Base entry / exit) operations. The locking allows for multiple devices to be executing commands simultaneously, one outstanding command per-device and an rwsem synchronizes the implementation relative to TSM registration/unregistration events. Thanks to Wu Hao for his work on an early draft of this support. Cc: Lukas Wunner Cc: Samuel Ortiz Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Link: https://patch.msgid.link/20251031212902.2256310-5-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-doe.h | 4 ++ include/linux/pci-tsm.h | 157 ++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 3 + include/linux/tsm.h | 5 +- include/uapi/linux/pci_regs.h | 1 + 5 files changed, 169 insertions(+), 1 deletion(-) create mode 100644 include/linux/pci-tsm.h (limited to 'include') diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h index 1f14aed4354b..bd4346a7c4e7 100644 --- a/include/linux/pci-doe.h +++ b/include/linux/pci-doe.h @@ -15,6 +15,10 @@ struct pci_doe_mb; +#define PCI_DOE_FEATURE_DISCOVERY 0 +#define PCI_DOE_FEATURE_CMA 1 +#define PCI_DOE_FEATURE_SSESSION 2 + struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor, u8 type); diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h new file mode 100644 index 000000000000..e921d30f9b6c --- /dev/null +++ b/include/linux/pci-tsm.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PCI_TSM_H +#define __PCI_TSM_H +#include +#include + +struct pci_tsm; +struct tsm_dev; + +/* + * struct pci_tsm_ops - manage confidential links and security state + * @link_ops: Coordinate PCIe SPDM and IDE establishment via a platform TSM. + * Provide a secure session transport for TDISP state management + * (typically bare metal physical function operations). 
+ * @devsec_ops: Lock, unlock, and interrogate the security state of the + * function via the platform TSM (typically virtual function + * operations). + * + * These operations are mutually exclusive: either a tsm_dev instance + * manages physical link properties or it manages function security + * states like TDISP lock/unlock. + */ +struct pci_tsm_ops { + /* + * struct pci_tsm_link_ops - Manage physical link and the TSM/DSM session + * @probe: establish context with the TSM (allocate / wrap 'struct + * pci_tsm') for follow-on link operations + * @remove: destroy link operations context + * @connect: establish / validate a secure connection (e.g. IDE) + * with the device + * @disconnect: tear down the secure link + * + * Context: @probe, @remove, @connect, and @disconnect run under + * pci_tsm_rwsem held for write to sync with TSM unregistration and + * mutual exclusion of @connect and @disconnect. @connect and + * @disconnect additionally run under the DSM lock (struct + * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions.
+ */ + struct_group_tagged(pci_tsm_link_ops, link_ops, + struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, + struct pci_dev *pdev); + void (*remove)(struct pci_tsm *tsm); + int (*connect)(struct pci_dev *pdev); + void (*disconnect)(struct pci_dev *pdev); + ); + + /* + * struct pci_tsm_devsec_ops - Manage the security state of the function + * @lock: establish context with the TSM (allocate / wrap 'struct + * pci_tsm') for follow-on security state transitions from the + * LOCKED state + * @unlock: destroy TSM context and return device to UNLOCKED state + * + * Context: @lock and @unlock run under pci_tsm_rwsem held for write to + * sync with TSM unregistration and each other + */ + struct_group_tagged(pci_tsm_devsec_ops, devsec_ops, + struct pci_tsm *(*lock)(struct tsm_dev *tsm_dev, + struct pci_dev *pdev); + void (*unlock)(struct pci_tsm *tsm); + ); +}; + +/** + * struct pci_tsm - Core TSM context for a given PCIe endpoint + * @pdev: Back ref to device function, distinguishes type of pci_tsm context + * @dsm_dev: PCI Device Security Manager for link operations on @pdev + * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device + * Function Security operations + * + * This structure is wrapped by low level TSM driver data and returned by + * probe()/lock(), it is freed by the corresponding remove()/unlock(). + * + * For link operations it serves to cache the association between a Device + * Security Manager (DSM) and the functions that manager can assign to a TVM. + * That can be "self", for assigning function0 of a TEE I/O device, a + * sub-function (SR-IOV virtual function, or non-function0 + * multifunction-device), or a downstream endpoint (PCIe upstream switch-port as + * DSM). 
+ */ +struct pci_tsm { + struct pci_dev *pdev; + struct pci_dev *dsm_dev; + struct tsm_dev *tsm_dev; +}; + +/** + * struct pci_tsm_pf0 - Physical Function 0 TDISP link context + * @base_tsm: generic core "tsm" context + * @lock: mutual exclusion for pci_tsm_ops invocation + * @doe_mb: PCIe Data Object Exchange mailbox + */ +struct pci_tsm_pf0 { + struct pci_tsm base_tsm; + struct mutex lock; + struct pci_doe_mb *doe_mb; +}; + +/* physical function0 and capable of 'connect' */ +static inline bool is_pci_tsm_pf0(struct pci_dev *pdev) +{ + if (!pdev) + return false; + + if (!pci_is_pcie(pdev)) + return false; + + if (pdev->is_virtfn) + return false; + + /* + * Allow for a Device Security Manager (DSM) associated with function0 + * of an Endpoint to coordinate TDISP requests for other functions + * (physical or virtual) of the device, or allow for an Upstream Port + * DSM to accept TDISP requests for the Endpoints downstream of the + * switch. + */ + switch (pci_pcie_type(pdev)) { + case PCI_EXP_TYPE_ENDPOINT: + case PCI_EXP_TYPE_UPSTREAM: + case PCI_EXP_TYPE_RC_END: + if (pdev->ide_cap || (pdev->devcap & PCI_EXP_DEVCAP_TEE)) + break; + fallthrough; + default: + return false; + } + + return PCI_FUNC(pdev->devfn) == 0; +} + +#ifdef CONFIG_PCI_TSM +int pci_tsm_register(struct tsm_dev *tsm_dev); +void pci_tsm_unregister(struct tsm_dev *tsm_dev); +int pci_tsm_link_constructor(struct pci_dev *pdev, struct pci_tsm *tsm, + struct tsm_dev *tsm_dev); +int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, + struct tsm_dev *tsm_dev); +void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm); +int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, + size_t req_sz, void *resp, size_t resp_sz); +#else +static inline int pci_tsm_register(struct tsm_dev *tsm_dev) +{ + return 0; +} +static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) +{ +} +static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, + const void *req, size_t
req_sz, + void *resp, size_t resp_sz) +{ + return -ENXIO; +} +#endif +#endif /*__PCI_TSM_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index b6a12a82be12..2f9c0cb6a50a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -546,6 +546,9 @@ struct pci_dev { u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ unsigned int ide_cfg:1; /* Config cycles over IDE */ unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ +#endif +#ifdef CONFIG_PCI_TSM + struct pci_tsm *tsm; /* TSM operation state */ #endif u16 acs_cap; /* ACS Capability offset */ u8 supported_speeds; /* Supported Link Speeds Vector */ diff --git a/include/linux/tsm.h b/include/linux/tsm.h index cd97c63ffa32..22e05b2aac69 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -108,9 +108,11 @@ struct tsm_report_ops { bool (*report_bin_attr_visible)(int n); }; +struct pci_tsm_ops; struct tsm_dev { struct device dev; int id; + const struct pci_tsm_ops *pci_ops; }; DEFINE_FREE(put_tsm_dev, struct tsm_dev *, @@ -118,6 +120,7 @@ DEFINE_FREE(put_tsm_dev, struct tsm_dev *, int tsm_report_register(const struct tsm_report_ops *ops, void *priv); int tsm_report_unregister(const struct tsm_report_ops *ops); -struct tsm_dev *tsm_register(struct device *parent); +struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); +struct tsm_dev *find_tsm_dev(int id); #endif /* __TSM_H */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 05bd22d9e352..f2759c1097bc 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -503,6 +503,7 @@ #define PCI_EXP_DEVCAP_PWR_VAL 0x03fc0000 /* Slot Power Limit Value */ #define PCI_EXP_DEVCAP_PWR_SCL 0x0c000000 /* Slot Power Limit Scale */ #define PCI_EXP_DEVCAP_FLR 0x10000000 /* Function Level Reset */ +#define PCI_EXP_DEVCAP_TEE 0x40000000 /* TEE I/O (TDISP) Support */ #define PCI_EXP_DEVCTL 0x08 /* Device Control */ 
#define PCI_EXP_DEVCTL_CERE 0x0001 /* Correctable Error Reporting En. */ #define PCI_EXP_DEVCTL_NFERE 0x0002 /* Non-Fatal Error Reporting Enable */ -- cgit v1.2.3 From c0c1262fbfbafe943dbccd5f97b500b72dbd2205 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:57 -0700 Subject: PCI: Add PCIe Device 3 Extended Capability enumeration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r7.0 Section 7.7.9 Device 3 Extended Capability Structure, defines the canonical location for determining the Flit Mode of a device. This status is a dependency for PCIe IDE enabling. Add a new fm_enabled flag to 'struct pci_dev'. Cc: Lukas Wunner Cc: Ilpo Järvinen Cc: Bjorn Helgaas Cc: Samuel Ortiz Cc: Alexey Kardashevskiy Cc: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-6-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci.h | 1 + include/uapi/linux/pci_regs.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2f9c0cb6a50a..ea94799c81b0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -450,6 +450,7 @@ struct pci_dev { unsigned int pasid_enabled:1; /* Process Address Space ID */ unsigned int pri_enabled:1; /* Page Request Interface */ unsigned int tph_enabled:1; /* TLP Processing Hints */ + unsigned int fm_enabled:1; /* Flit Mode (segment captured) */ unsigned int is_managed:1; /* Managed via devres */ unsigned int is_msi_managed:1; /* MSI release via devres installed */ unsigned int needs_freset:1; /* Requires fundamental reset */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index f2759c1097bc..3add74ae2594 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -755,6 +755,7 @@ #define PCI_EXT_CAP_ID_NPEM 0x29 /* Native PCIe Enclosure Management */ #define PCI_EXT_CAP_ID_PL_32GT 0x2A 
/* Physical Layer 32.0 GT/s */ #define PCI_EXT_CAP_ID_DOE 0x2E /* Data Object Exchange */ +#define PCI_EXT_CAP_ID_DEV3 0x2F /* Device 3 Capability/Control/Status */ #define PCI_EXT_CAP_ID_IDE 0x30 /* Integrity and Data Encryption */ #define PCI_EXT_CAP_ID_PL_64GT 0x31 /* Physical Layer 64.0 GT/s */ #define PCI_EXT_CAP_ID_MAX PCI_EXT_CAP_ID_PL_64GT @@ -1246,6 +1247,12 @@ /* Deprecated old name, replaced with PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE */ #define PCI_DOE_DATA_OBJECT_DISC_RSP_3_PROTOCOL PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE +/* Device 3 Extended Capability */ +#define PCI_DEV3_CAP 0x04 /* Device 3 Capabilities Register */ +#define PCI_DEV3_CTL 0x08 /* Device 3 Control Register */ +#define PCI_DEV3_STA 0x0c /* Device 3 Status Register */ +#define PCI_DEV3_STA_SEGMENT 0x8 /* Segment Captured (end-to-end flit-mode detected) */ + /* Compute Express Link (CXL r3.1, sec 8.1.5) */ #define PCI_DVSEC_CXL_PORT 3 #define PCI_DVSEC_CXL_PORT_CTL 0x0c -- cgit v1.2.3 From 1e4d2ff3ae450dab37b5b5726c3f7df3e60d6e89 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:28:59 -0700 Subject: PCI/IDE: Add IDE establishment helpers There are two components to establishing an encrypted link, provisioning the stream in Partner Port config-space, and programming the keys into the link layer via IDE_KM (IDE Key Management). This new library, drivers/pci/ide.c, enables the former. IDE_KM, via a TSM low-level driver, is saved for later. With the platform TSM implementations of SEV-TIO and TDX Connect in mind this library abstracts small differences in those implementations. For example, TDX Connect handles Root Port register setup while SEV-TIO expects System Software to update the Root Port registers. This is the rationale for fine-grained 'setup' + 'enable' verbs. The other design detail for TSM-coordinated IDE establishment is that the TSM may manage allocation of Stream IDs, this is why the Stream ID value is passed in to pci_ide_stream_setup(). 
The flow is: pci_ide_stream_alloc(): Allocate a Selective IDE Stream Register Block in each Partner Port (Endpoint + Root Port), and reserve a host bridge / platform stream slot. Gather Partner Port specific stream settings like Requester ID. pci_ide_stream_register(): Publish the stream in sysfs after allocating a Stream ID. In the TSM case the TSM allocates the Stream ID for the Partner Port pair. pci_ide_stream_setup(): Program the stream settings to a Partner Port. Caller is responsible for optionally calling this for the Root Port as well if the TSM implementation requires it. pci_ide_stream_enable(): Enable the stream after IDE_KM. In support of system administrators auditing where platform, Root Port, and Endpoint IDE stream resources are being spent, the allocated stream is reflected as a symlink from the host bridge to the endpoint with the name: stream%d.%d.%d Where the tuple of integers reflects the allocated platform, Root Port, and Endpoint stream index (Selective IDE Stream Register Block) values. Thanks to Wu Hao for a draft implementation of this infrastructure. Cc: Bjorn Helgaas Cc: Lukas Wunner Cc: Samuel Ortiz Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-8-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/pci.h | 6 ++++ 2 files changed, 84 insertions(+) create mode 100644 include/linux/pci-ide.h (limited to 'include') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h new file mode 100644 index 000000000000..e638f9429bf9 --- /dev/null +++ b/include/linux/pci-ide.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common helpers for drivers (e.g. 
low-level PCI/TSM drivers) implementing the + * IDE key management protocol (IDE_KM) as defined by: + * PCIe r7.0 section 6.33 Integrity & Data Encryption (IDE) + * + * Copyright(c) 2024-2025 Intel Corporation. All rights reserved. + */ + +#ifndef __PCI_IDE_H__ +#define __PCI_IDE_H__ + +enum pci_ide_partner_select { + PCI_IDE_EP, + PCI_IDE_RP, + PCI_IDE_PARTNER_MAX, + /* + * In addition to the resources in each partner port the + * platform / host-bridge additionally has a Stream ID pool that + * it shares across root ports. Let pci_ide_stream_alloc() use + * the alloc_stream_index() helper as endpoints and root ports. + */ + PCI_IDE_HB = PCI_IDE_PARTNER_MAX, +}; + +/** + * struct pci_ide_partner - Per port pair Selective IDE Stream settings + * @rid_start: Partner Port Requester ID range start + * @rid_end: Partner Port Requester ID range end + * @stream_index: Selective IDE Stream Register Block selection + * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of + * address and RID association registers + * @setup: flag to track whether to run pci_ide_stream_teardown() for this + * partner slot + * @enable: flag whether to run pci_ide_stream_disable() for this partner slot + */ +struct pci_ide_partner { + u16 rid_start; + u16 rid_end; + u8 stream_index; + unsigned int default_stream:1; + unsigned int setup:1; + unsigned int enable:1; +}; + +/** + * struct pci_ide - PCIe Selective IDE Stream descriptor + * @pdev: PCIe Endpoint in the pci_ide_partner pair + * @partner: per-partner settings + * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool + * @stream_id: unique Stream ID (within Partner Port pairing) + * @name: name of the established Selective IDE Stream in sysfs + * + * Negative @stream_id values indicate "uninitialized" on the + * expectation that with TSM established IDE the TSM owns the stream_id + * allocation. 
+ */ +struct pci_ide { + struct pci_dev *pdev; + struct pci_ide_partner partner[PCI_IDE_PARTNER_MAX]; + u8 host_bridge_stream; + int stream_id; + const char *name; +}; + +struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, + struct pci_ide *ide); +struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev); +void pci_ide_stream_free(struct pci_ide *ide); +int pci_ide_stream_register(struct pci_ide *ide); +void pci_ide_stream_unregister(struct pci_ide *ide); +void pci_ide_stream_setup(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_teardown(struct pci_dev *pdev, struct pci_ide *ide); +int pci_ide_stream_enable(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_disable(struct pci_dev *pdev, struct pci_ide *ide); +void pci_ide_stream_release(struct pci_ide *ide); +DEFINE_FREE(pci_ide_stream_release, struct pci_ide *, if (_T) pci_ide_stream_release(_T)) +#endif /* __PCI_IDE_H__ */ diff --git a/include/linux/pci.h b/include/linux/pci.h index ea94799c81b0..2c8dbae4916c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -545,6 +545,8 @@ struct pci_dev { u16 ide_cap; /* Link Integrity & Data Encryption */ u8 nr_ide_mem; /* Address association resources for streams */ u8 nr_link_ide; /* Link Stream count (Selective Stream offset) */ + u16 nr_sel_ide; /* Selective Stream count (register block allocator) */ + struct ida ide_stream_ida; unsigned int ide_cfg:1; /* Config cycles over IDE */ unsigned int ide_tee_limit:1; /* Disallow T=0 traffic over IDE */ #endif @@ -614,6 +616,10 @@ struct pci_host_bridge { int domain_nr; struct list_head windows; /* resource_entry */ struct list_head dma_ranges; /* dma ranges resource list */ +#ifdef CONFIG_PCI_IDE + u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */ + struct ida ide_stream_ida; +#endif u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); void (*release_fn)(struct pci_host_bridge *); -- 
cgit v1.2.3 From 9ddaf9c3ed007cd03c1335fb40920ad76f72a3d5 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:29:00 -0700 Subject: PCI/IDE: Report available IDE streams The limited number of link-encryption (IDE) streams that a given set of host bridges supports is a platform specific detail. Provide pci_ide_init_nr_streams() as a generic facility for either platform TSM drivers, or PCI core native IDE, to report the number available streams. After invoking pci_ide_init_nr_streams() an "available_secure_streams" attribute appears in PCI host bridge sysfs to convey that count. Introduce a device-type, @pci_host_bridge_type, now that both a release method and sysfs attribute groups are being specified for all 'struct pci_host_bridge' instances. Cc: Bjorn Helgaas Cc: Lukas Wunner Cc: Samuel Ortiz Cc: Alexey Kardashevskiy Cc: Xu Yilun Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251031212902.2256310-9-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index e638f9429bf9..85645b0a8620 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -63,6 +63,7 @@ struct pci_ide { const char *name; }; +void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide); struct pci_ide *pci_ide_stream_alloc(struct pci_dev *pdev); -- cgit v1.2.3 From a4438f06b1db15ce3d831ce82b8767665638aa2a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Oct 2025 14:29:01 -0700 Subject: PCI/TSM: Report active IDE streams Given that the platform TSM owns IDE Stream ID allocation, report the active streams via the TSM class device. Establish a symlink from the class device to the PCI endpoint device consuming the stream, named by the Stream ID. 
Acked-by: Bjorn Helgaas Reviewed-by: Jonathan Cameron Reviewed-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251031212902.2256310-10-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 2 ++ include/linux/tsm.h | 3 +++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 85645b0a8620..d0f10f3c89fc 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -50,6 +50,7 @@ struct pci_ide_partner { * @host_bridge_stream: allocated from host bridge @ide_stream_ida pool * @stream_id: unique Stream ID (within Partner Port pairing) * @name: name of the established Selective IDE Stream in sysfs + * @tsm_dev: For TSM established IDE, the TSM device context * * Negative @stream_id values indicate "uninitialized" on the * expectation that with TSM established IDE the TSM owns the stream_id @@ -61,6 +62,7 @@ struct pci_ide { u8 host_bridge_stream; int stream_id; const char *name; + struct tsm_dev *tsm_dev; }; void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); diff --git a/include/linux/tsm.h b/include/linux/tsm.h index 22e05b2aac69..a3b7ab668eff 100644 --- a/include/linux/tsm.h +++ b/include/linux/tsm.h @@ -123,4 +123,7 @@ int tsm_report_unregister(const struct tsm_report_ops *ops); struct tsm_dev *tsm_register(struct device *parent, struct pci_tsm_ops *ops); void tsm_unregister(struct tsm_dev *tsm_dev); struct tsm_dev *find_tsm_dev(int id); +struct pci_ide; +int tsm_ide_stream_register(struct pci_ide *ide); +void tsm_ide_stream_unregister(struct pci_ide *ide); #endif /* __TSM_H */ -- cgit v1.2.3 From e497310b4ffb559e1149ee89470d5c518d234ddf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:43:55 +0100 Subject: uaccess: Provide scoped user access regions User space access regions are tedious and require similar code patterns all over the place: if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; 
unsafe_get_user(val, from, Efault); user_read_access_end(); return 0; Efault: user_read_access_end(); return -EFAULT; This got worse with the recent addition of masked user access, which optimizes the speculation prevention: if (can_do_masked_user_access()) from = masked_user_read_access_begin((from)); else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); user_read_access_end(); return 0; Efault: user_read_access_end(); return -EFAULT; There have been issues with using the wrong user_*_access_end() variant in the error path and other typical Copy&Pasta problems, e.g. using the wrong fault label in the user accessor which ends up using the wrong access end variant. These patterns beg for scopes with automatic cleanup. The resulting outcome is: scoped_user_read_access(from, Efault) unsafe_get_user(val, from, Efault); return 0; Efault: return -EFAULT; The scope guarantees the proper cleanup for the access mode is invoked both in the success and the failure (fault) path. The scoped_user_$MODE_access() macros are implemented as self terminating nested for() loops. Thanks to Andrew Cooper for pointing me at them. The scope can therefore be left with 'break', 'goto' and 'return'. Even 'continue' "works" due to the self termination mechanism. Both GCC and clang optimize all the convoluted macro maze out and the above results with clang in: b80: f3 0f 1e fa endbr64 b84: 48 b8 ef cd ab 89 67 45 23 01 movabs $0x123456789abcdef,%rax b8e: 48 39 c7 cmp %rax,%rdi b91: 48 0f 47 f8 cmova %rax,%rdi b95: 90 nop b96: 90 nop b97: 90 nop b98: 31 c9 xor %ecx,%ecx b9a: 8b 07 mov (%rdi),%eax b9c: 89 06 mov %eax,(%rsi) b9e: 85 c9 test %ecx,%ecx ba0: 0f 94 c0 sete %al ba3: 90 nop ba4: 90 nop ba5: 90 nop ba6: c3 ret Which looks as compact as it gets. The NOPs are placeholders for STAC/CLAC.
GCC emits the fault path separately: bf0: f3 0f 1e fa endbr64 bf4: 48 b8 ef cd ab 89 67 45 23 01 movabs $0x123456789abcdef,%rax bfe: 48 39 c7 cmp %rax,%rdi c01: 48 0f 47 f8 cmova %rax,%rdi c05: 90 nop c06: 90 nop c07: 90 nop c08: 31 d2 xor %edx,%edx c0a: 8b 07 mov (%rdi),%eax c0c: 89 06 mov %eax,(%rsi) c0e: 85 d2 test %edx,%edx c10: 75 09 jne c1b c12: 90 nop c13: 90 nop c14: 90 nop c15: b8 01 00 00 00 mov $0x1,%eax c1a: c3 ret c1b: 90 nop c1c: 90 nop c1d: 90 nop c1e: 31 c0 xor %eax,%eax c20: c3 ret The fault labels for the scoped*() macros and the fault labels for the actual user space accessors can be shared and must be placed outside of the scope. If masked user access is enabled on an architecture, then the pointer handed in to scoped_user_$MODE_access() can be modified to point to a guaranteed faulting user address. This modification is only scope local as the pointer is aliased inside the scope. When the scope is left the alias is no longer in effect. IOW the original pointer value is preserved so it can be used e.g. for fixup or diagnostic purposes in the fault path.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027083745.546420421@linutronix.de --- include/linux/uaccess.h | 192 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 8aa82b1d6013..5f142c05b0dc 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,6 +2,7 @@ #ifndef __LINUX_UACCESS_H__ #define __LINUX_UACCESS_H__ +#include #include #include #include @@ -35,9 +36,17 @@ #ifdef masked_user_access_begin #define can_do_masked_user_access() 1 +# ifndef masked_user_write_access_begin +# define masked_user_write_access_begin masked_user_access_begin +# endif +# ifndef masked_user_read_access_begin +# define masked_user_read_access_begin masked_user_access_begin +#endif #else #define can_do_masked_user_access() 0 #define masked_user_access_begin(src) NULL + #define masked_user_read_access_begin(src) NULL + #define masked_user_write_access_begin(src) NULL #define mask_user_address(src) (src) #endif @@ -633,6 +642,189 @@ static inline void user_access_restore(unsigned long flags) { } #define user_read_access_end user_access_end #endif +/* Define RW variant so the below _mode macro expansion works */ +#define masked_user_rw_access_begin(u) masked_user_access_begin(u) +#define user_rw_access_begin(u, s) user_access_begin(u, s) +#define user_rw_access_end() user_access_end() + +/* Scoped user access */ +#define USER_ACCESS_GUARD(_mode) \ +static __always_inline void __user * \ +class_user_##_mode##_begin(void __user *ptr) \ +{ \ + return ptr; \ +} \ + \ +static __always_inline void \ +class_user_##_mode##_end(void __user *ptr) \ +{ \ + user_##_mode##_access_end(); \ +} \ + \ +DEFINE_CLASS(user_ ##_mode## _access, void __user *, \ + class_user_##_mode##_end(_T), \ + class_user_##_mode##_begin(ptr), void __user *ptr) \ + \ 
+static __always_inline class_user_##_mode##_access_t \ +class_user_##_mode##_access_ptr(void __user *scope) \ +{ \ + return scope; \ +} + +USER_ACCESS_GUARD(read) +USER_ACCESS_GUARD(write) +USER_ACCESS_GUARD(rw) +#undef USER_ACCESS_GUARD + +/** + * __scoped_user_access_begin - Start a scoped user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected + * + * Internal helper for __scoped_user_access(). Don't use directly. + */ +#define __scoped_user_access_begin(mode, uptr, size, elbl) \ +({ \ + typeof(uptr) __retptr; \ + \ + if (can_do_masked_user_access()) { \ + __retptr = masked_user_##mode##_access_begin(uptr); \ + } else { \ + __retptr = uptr; \ + if (!user_##mode##_access_begin(uptr, size)) \ + goto elbl; \ + } \ + __retptr; \ +}) + +/** + * __scoped_user_access - Open a scope for user access + * @mode: The mode of the access class (read, write, rw) + * @uptr: The pointer to access user space memory + * @size: Size of the access + * @elbl: Error label to goto when the access region is rejected. It + * must be placed outside the scope + * + * If the user access function inside the scope requires a fault label, it + * can use @elbl or a different label outside the scope, which requires + * that user access which is implemented with ASM GOTO has been properly + * wrapped. See unsafe_get_user() for reference. + * + * scoped_user_rw_access(ptr, efault) { + * unsafe_get_user(rval, &ptr->rval, efault); + * unsafe_put_user(wval, &ptr->wval, efault); + * } + * return 0; + * efault: + * return -EFAULT; + * + * The scope is internally implemented as an autoterminating nested for() + * loop, which can be left with 'return', 'break' and 'goto' at any + * point. + * + * When the scope is left user_##@mode##_access_end() is automatically + * invoked. 
+ * + * When the architecture supports masked user access and the access region + * which is determined by @uptr and @size is not a valid user space + * address, i.e. < TASK_SIZE, the scope sets the pointer to a faulting user + * space address and does not terminate early. This optimizes for the good + * case and lets the performance uncritical bad case go through the fault. + * + * The eventual modification of the pointer is limited to the scope. + * Outside of the scope the original pointer value is unmodified, so that + * the original pointer value is available for diagnostic purposes in an + * out of scope fault path. + * + * Nesting scoped user access into a user access scope is invalid and fails + * the build. Nesting into other guards, e.g. pagefault is safe. + * + * The masked variant does not check the size of the access and relies on a + * mapping hole (e.g. guard page) to catch an out of range pointer, the + * first access to user memory inside the scope has to be within + * @uptr ... @uptr + PAGE_SIZE - 1 + * + * Don't use directly. Use scoped_user_$MODE_access() instead. + */ +#define __scoped_user_access(mode, uptr, size, elbl) \ +for (bool done = false; !done; done = true) \ + for (void __user *_tmpptr = __scoped_user_access_begin(mode, uptr, size, elbl); \ + !done; done = true) \ + for (CLASS(user_##mode##_access, scope)(_tmpptr); !done; done = true) \ + /* Force modified pointer usage within the scope */ \ + for (const typeof(uptr) uptr = _tmpptr; !done; done = true) + +/** + * scoped_user_read_access_size - Start a scoped user read access with given size + * @usrc: Pointer to the user space address to read from + * @size: Size of the access starting from @usrc + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. 
+ */ +#define scoped_user_read_access_size(usrc, size, elbl) \ + __scoped_user_access(read, usrc, size, elbl) + +/** + * scoped_user_read_access - Start a scoped user read access + * @usrc: Pointer to the user space address to read from + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @usrc is determined via sizeof(*@usrc). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_read_access(usrc, elbl) \ + scoped_user_read_access_size(usrc, sizeof(*(usrc)), elbl) + +/** + * scoped_user_write_access_size - Start a scoped user write access with given size + * @udst: Pointer to the user space address to write to + * @size: Size of the access starting from @udst + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access_size(udst, size, elbl) \ + __scoped_user_access(write, udst, size, elbl) + +/** + * scoped_user_write_access - Start a scoped user write access + * @udst: Pointer to the user space address to write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @udst is determined via sizeof(*@udst). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_write_access(udst, elbl) \ + scoped_user_write_access_size(udst, sizeof(*(udst)), elbl) + +/** + * scoped_user_rw_access_size - Start a scoped user read/write access with given size + * @uptr: Pointer to the user space address to read from and write to + * @size: Size of the access starting from @uptr + * @elbl: Error label to goto when the access region is rejected + * + * For further information see __scoped_user_access() above. 
+ */ +#define scoped_user_rw_access_size(uptr, size, elbl) \ + __scoped_user_access(rw, uptr, size, elbl) + +/** + * scoped_user_rw_access - Start a scoped user read/write access + * @uptr: Pointer to the user space address to read from and write to + * @elbl: Error label to goto when the access region is rejected + * + * The size of the access starting from @uptr is determined via sizeof(*@uptr). + * + * For further information see __scoped_user_access() above. + */ +#define scoped_user_rw_access(uptr, elbl) \ + scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) + #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, -- cgit v1.2.3 From b2cfc0cd68b830dde80fce2406580e258a1e976d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:43:56 +0100 Subject: uaccess: Provide put/get_user_inline() Provide convenience wrappers around scoped user access similar to put/get_user(), which reduce the usage sites to: if (get_user_inline(val, ptr)) return -EFAULT; Should only be used if there is a demonstrable performance benefit.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027083745.609031602@linutronix.de --- include/linux/uaccess.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 5f142c05b0dc..be395f5f7ee3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -825,6 +825,56 @@ for (bool done = false; !done; done = true) \ #define scoped_user_rw_access(uptr, elbl) \ scoped_user_rw_access_size(uptr, sizeof(*(uptr)), elbl) +/** + * get_user_inline - Read user data inlined + * @val: The variable to store the value read from user memory + * @usrc: Pointer to the user space memory to read from + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of get_user(). Only use when there is a demonstrable + * performance reason. + */ +#define get_user_inline(val, usrc) \ +({ \ + __label__ efault; \ + typeof(usrc) _tmpsrc = usrc; \ + int _ret = 0; \ + \ + scoped_user_read_access(_tmpsrc, efault) \ + unsafe_get_user(val, _tmpsrc, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + +/** + * put_user_inline - Write to user memory inlined + * @val: The value to write + * @udst: Pointer to the user space memory to write to + * + * Return: 0 if successful, -EFAULT when faulted + * + * Inlined variant of put_user(). Only use when there is a demonstrable + * performance reason. 
+ */ +#define put_user_inline(val, udst) \ +({ \ + __label__ efault; \ + typeof(udst) _tmpdst = udst; \ + int _ret = 0; \ + \ + scoped_user_write_access(_tmpdst, efault) \ + unsafe_put_user(val, _tmpdst, efault); \ + if (0) { \ + efault: \ + _ret = -EFAULT; \ + } \ + _ret; \ +}) + #ifdef CONFIG_HARDENED_USERCOPY void __noreturn usercopy_abort(const char *name, const char *detail, bool to_user, unsigned long offset, -- cgit v1.2.3 From 3ca59da7aa5c7f569b04a511dc8670861d58b509 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:16 +0100 Subject: rseq: Avoid pointless evaluation in __rseq_notify_resume() The RSEQ critical section mechanism only clears the event mask when a critical section is registered, otherwise it is stale and collects bits. That means once a critical section is installed the first invocation of that code when TIF_NOTIFY_RESUME is set will abort the critical section, even when the TIF bit was not raised by the rseq preempt/migrate/signal helpers. This also has a performance implication because TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by quite some infrastructure. That means every invocation of __rseq_notify_resume() goes unconditionally through the heavy lifting of user space access and consistency checks even if there is no reason to do so. Keeping the stale event mask around when exiting to user space also prevents it from being utilized by the upcoming time slice extension mechanism. Avoid this by reading and clearing the event mask before doing the user space critical section access with interrupts or preemption disabled, which ensures that the read and clear operation is CPU local atomic versus scheduling and the membarrier IPI. This is correct as after re-enabling interrupts/preemption any relevant event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the user space exit code take another round of TIF bit clearing. If the event mask was non-zero, invoke the slow path. 
On debug kernels the slow path is invoked unconditionally and the result of the event mask evaluation is handed in. Add an exit path check after the TIF bit loop, which validates on debug kernels that the event mask is zero before exiting to user space. While at it reword the convoluted comment why the pt_regs pointer can be NULL under certain circumstances. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.022571576@linutronix.de --- include/linux/irq-entry-common.h | 7 +++++-- include/linux/rseq.h | 10 +++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..e5941df13901 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -2,11 +2,12 @@ #ifndef __LINUX_IRQENTRYCOMMON_H #define __LINUX_IRQENTRYCOMMON_H +#include +#include +#include #include #include -#include #include -#include #include #include @@ -226,6 +227,8 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) arch_exit_to_user_mode_prepare(regs, ti_work); + rseq_exit_to_user_mode(); + /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 69553e7c14c1..7622b733a508 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -66,6 +66,14 @@ static inline void rseq_migrate(struct task_struct *t) rseq_set_notify_resume(t); } +static __always_inline void rseq_exit_to_user_mode(void) +{ + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { + if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) + current->rseq_event_mask = 0; + } +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. 
@@ -118,7 +126,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_execve(struct task_struct *t) { } - +static inline void rseq_exit_to_user_mode(void) { } #endif #ifdef CONFIG_DEBUG_RSEQ -- cgit v1.2.3 From fdc0f39d289ebcf46ef44f43460207ef24c94ed7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:18 +0100 Subject: rseq: Condense the inline stubs Scrolling over tons of pointless { } lines to find the actual code is annoying at best. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.085971048@linutronix.de --- include/linux/rseq.h | 47 ++++++++++++----------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7622b733a508..21f875af0e96 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -101,44 +101,21 @@ static inline void rseq_execve(struct task_struct *t) t->rseq_event_mask = 0; } -#else - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ -} -static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) -{ -} -static inline void rseq_preempt(struct task_struct *t) -{ -} -static inline void rseq_migrate(struct task_struct *t) -{ -} -static inline void rseq_fork(struct task_struct *t, u64 clone_flags) -{ -} -static inline void rseq_execve(struct task_struct *t) -{ -} +#else /* CONFIG_RSEQ */ +static inline void rseq_set_notify_resume(struct task_struct *t) { } +static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_preempt(struct task_struct *t) { } +static inline void 
rseq_migrate(struct task_struct *t) { } +static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } +static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } -#endif +#endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ - void rseq_syscall(struct pt_regs *regs); - -#else - -static inline void rseq_syscall(struct pt_regs *regs) -{ -} - -#endif +#else /* CONFIG_DEBUG_RSEQ */ +static inline void rseq_syscall(struct pt_regs *regs) { } +#endif /* !CONFIG_DEBUG_RSEQ */ #endif /* _LINUX_RSEQ_H */ -- cgit v1.2.3 From 41b43a6ba3848be8ceec77b8b2a56ddeca6167ed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:22 +0100 Subject: rseq: Remove the ksig argument from rseq_handle_notify_resume() There is no point for this being visible in the resume_to_user_mode() handling. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.211520245@linutronix.de --- include/linux/resume_user_mode.h | 2 +- include/linux/rseq.h | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index e0135e0adae0..dd3bf7da90a8 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(NULL, regs); + rseq_handle_notify_resume(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 21f875af0e96..d72ddf7ce903 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -37,19 +37,20 @@ static inline void rseq_set_notify_resume(struct task_struct *t) void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); 
-static inline void rseq_handle_notify_resume(struct ksignal *ksig, - struct pt_regs *regs) +static inline void rseq_handle_notify_resume(struct pt_regs *regs) { if (current->rseq) - __rseq_handle_notify_resume(ksig, regs); + __rseq_handle_notify_resume(NULL, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); - rseq_handle_notify_resume(ksig, regs); + if (current->rseq) { + scoped_guard(RSEQ_EVENT_GUARD) + __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); + __rseq_handle_notify_resume(ksig, regs); + } } /* rseq_preempt() requires preemption to be disabled. */ @@ -103,7 +104,7 @@ static inline void rseq_execve(struct task_struct *t) #else /* CONFIG_RSEQ */ static inline void rseq_set_notify_resume(struct task_struct *t) { } -static inline void rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { } +static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_preempt(struct task_struct *t) { } static inline void rseq_migrate(struct task_struct *t) { } -- cgit v1.2.3 From d923739e2e356424cc566143a3323c62cd6ed067 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:26 +0100 Subject: rseq: Simplify the event notification Since commit 0190e4198e47 ("rseq: Deprecate RSEQ_CS_FLAG_NO_RESTART_ON_* flags") the bits in task::rseq_event_mask are meaningless and just extra work in terms of setting them individually. Aside of that the only relevant point where an event has to be raised is context switch. Neither the CPU nor MM CID can change without going through a context switch. Collapse them all into a single boolean which simplifies the code a lot and remove the pointless invocations which have been sprinkled all over the place for no value.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.336978188@linutronix.de --- include/linux/rseq.h | 66 ++++++++++------------------------------------- include/linux/sched.h | 10 +++---- include/uapi/linux/rseq.h | 21 +++++---------- 3 files changed, 25 insertions(+), 72 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index d72ddf7ce903..241067bf20db 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -3,38 +3,8 @@ #define _LINUX_RSEQ_H #ifdef CONFIG_RSEQ - -#include #include -#ifdef CONFIG_MEMBARRIER -# define RSEQ_EVENT_GUARD irq -#else -# define RSEQ_EVENT_GUARD preempt -#endif - -/* - * Map the event mask on the user-space ABI enum rseq_cs_flags - * for direct mask checks. - */ -enum rseq_event_mask_bits { - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, -}; - -enum rseq_event_mask { - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), -}; - -static inline void rseq_set_notify_resume(struct task_struct *t) -{ - if (t->rseq) - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); -} - void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) @@ -43,35 +13,27 @@ static inline void rseq_handle_notify_resume(struct pt_regs *regs) __rseq_handle_notify_resume(NULL, regs); } -static inline void rseq_signal_deliver(struct ksignal *ksig, - struct pt_regs *regs) +static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { if (current->rseq) { - scoped_guard(RSEQ_EVENT_GUARD) - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); +
current->rseq_event_pending = true; __rseq_handle_notify_resume(ksig, regs); } } -/* rseq_preempt() requires preemption to be disabled. */ -static inline void rseq_preempt(struct task_struct *t) +static inline void rseq_sched_switch_event(struct task_struct *t) { - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); -} - -/* rseq_migrate() requires preemption to be disabled. */ -static inline void rseq_migrate(struct task_struct *t) -{ - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); - rseq_set_notify_resume(t); + if (t->rseq) { + t->rseq_event_pending = true; + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + } } static __always_inline void rseq_exit_to_user_mode(void) { if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) - current->rseq_event_mask = 0; + if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) + current->rseq_event_pending = false; } } @@ -85,12 +47,12 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq_event_pending = false; } else { t->rseq = current->rseq; t->rseq_len = current->rseq_len; t->rseq_sig = current->rseq_sig; - t->rseq_event_mask = current->rseq_event_mask; + t->rseq_event_pending = current->rseq_event_pending; } } @@ -99,15 +61,13 @@ static inline void rseq_execve(struct task_struct *t) t->rseq = NULL; t->rseq_len = 0; t->rseq_sig = 0; - t->rseq_event_mask = 0; + t->rseq_event_pending = false; } #else /* CONFIG_RSEQ */ -static inline void rseq_set_notify_resume(struct task_struct *t) { } static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } -static inline void rseq_preempt(struct task_struct *t) { } -static inline void rseq_migrate(struct task_struct *t) { } +static inline void rseq_sched_switch_event(struct task_struct *t) { } 
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index b469878de25c..6627c527c2c7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,14 +1407,14 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; + struct rseq __user *rseq; + u32 rseq_len; + u32 rseq_sig; /* - * RmW on rseq_event_mask must be performed atomically + * RmW on rseq_event_pending must be performed atomically * with respect to preemption. */ - unsigned long rseq_event_mask; + bool rseq_event_pending; # ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index c233aae5eac9..1b76d508400c 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -114,20 +114,13 @@ struct rseq { /* * Restartable sequences flags field. * - * This field should only be updated by the thread which - * registered this data structure. Read by the kernel. - * Mainly used for single-stepping through rseq critical sections - * with debuggers. - * - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT - * Inhibit instruction sequence block restart on preemption - * for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL - * Inhibit instruction sequence block restart on signal - * delivery for this thread. - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE - * Inhibit instruction sequence block restart on migration for - * this thread. + * This field was initially intended to allow event masking for + * single-stepping through rseq critical sections with debuggers. 
+ * The kernel does not support this anymore and the relevant bits + * are checked for being always false: + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE */ __u32 flags; -- cgit v1.2.3 From 83409986f49f17b14a675f9c598ad50d4c60191b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:28 +0100 Subject: rseq, virt: Retrigger RSEQ after vcpu_run() Hypervisors invoke resume_user_mode_work() before entering the guest, which clears TIF_NOTIFY_RESUME. The @regs argument is NULL as there is no user space context available to them, so the rseq notify handler skips inspecting the critical section, but updates the CPU/MM CID values unconditionally so that the eventual pending rseq event is not lost on the way to user space. This is a pointless exercise as the task might be rescheduled before actually returning to user space and it creates unnecessary work in the vcpu_run() loops. It's way more efficient to ignore that invocation based on @regs == NULL and let the hypervisors re-raise TIF_NOTIFY_RESUME after returning from the vcpu_run() loop before returning from the ioctl(). This ensures that a pending RSEQ update is not lost and the IDs are updated before returning to user space. Once the RSEQ handling is decoupled from TIF_NOTIFY_RESUME, this turns into a NOOP. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Acked-by: Sean Christopherson Link: https://patch.msgid.link/20251027084306.399495855@linutronix.de --- include/linux/rseq.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 241067bf20db..c6267f70c746 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -37,6 +37,22 @@ static __always_inline void rseq_exit_to_user_mode(void) } } +/* + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, + * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in + * that case just to do it eventually again before returning to user space, + * the entry resume_user_mode_work() invocation is ignored as the register + * argument is NULL. + * + * After returning from guest mode, they have to invoke this function to + * re-raise TIF_NOTIFY_RESUME if necessary. + */ +static inline void rseq_virt_userspace_exit(void) +{ + if (current->rseq_event_pending) + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. 
@@ -68,6 +84,7 @@ static inline void rseq_execve(struct task_struct *t) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } static inline void rseq_exit_to_user_mode(void) { } -- cgit v1.2.3 From faba9d250eaec7afa248bba71531a08ccc497aab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:33 +0100 Subject: rseq: Introduce struct rseq_data In preparation for a major rewrite of this code, provide a data structure for rseq management. Put all the rseq related data into it (except for the debug part), which allows to simplify fork/execve by using memset() and memcpy() instead of adding new fields to initialize over and over. Create a storage struct for event management as well and put the sched_switch event and an indicator for RSEQ on a task into it as a start. That uses a union, which allows to mask and clear the whole lot efficiently. The indicators are explicitly not a bit field. Bit fields generate abysmal code. The boolean members are defined as u8 as that actually guarantees that it fits. There seem to be strange architecture ABIs which need more than 8 bits for a boolean. The has_rseq member is redundant vs. task::rseq, but it turns out that boolean operations and quick checks on the union generate better code than fiddling with separate entities and data types. This struct will be extended over time to carry more information.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.527086690@linutronix.de --- include/linux/rseq.h | 48 ++++++++++++++++++++----------------------- include/linux/rseq_types.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 14 +++---------- 3 files changed, 76 insertions(+), 37 deletions(-) create mode 100644 include/linux/rseq_types.h (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index c6267f70c746..ab91b1e6bb4a 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -9,22 +9,22 @@ void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { - if (current->rseq) + if (current->rseq.event.has_rseq) __rseq_handle_notify_resume(NULL, regs); } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (current->rseq) { - current->rseq_event_pending = true; + if (current->rseq.event.has_rseq) { + current->rseq.event.sched_switch = true; __rseq_handle_notify_resume(ksig, regs); } } static inline void rseq_sched_switch_event(struct task_struct *t) { - if (t->rseq) { - t->rseq_event_pending = true; + if (t->rseq.event.has_rseq) { + t->rseq.event.sched_switch = true; set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); } } @@ -32,8 +32,9 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static __always_inline void rseq_exit_to_user_mode(void) { if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq && current->rseq_event_pending)) - current->rseq_event_pending = false; + if (WARN_ON_ONCE(current->rseq.event.has_rseq && + current->rseq.event.events)) + current->rseq.event.events = 0; } } @@ -49,35 +50,30 @@ static __always_inline void rseq_exit_to_user_mode(void) */ static inline void rseq_virt_userspace_exit(void) { - if 
(current->rseq_event_pending) + if (current->rseq.event.sched_switch) set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); } +static inline void rseq_reset(struct task_struct *t) +{ + memset(&t->rseq, 0, sizeof(t->rseq)); +} + +static inline void rseq_execve(struct task_struct *t) +{ + rseq_reset(t); +} + /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) { - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_pending = false; - } else { + if (clone_flags & CLONE_VM) + rseq_reset(t); + else t->rseq = current->rseq; - t->rseq_len = current->rseq_len; - t->rseq_sig = current->rseq_sig; - t->rseq_event_pending = current->rseq_event_pending; - } -} - -static inline void rseq_execve(struct task_struct *t) -{ - t->rseq = NULL; - t->rseq_len = 0; - t->rseq_sig = 0; - t->rseq_event_pending = false; } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h new file mode 100644 index 000000000000..f7a60c8eddc9 --- /dev/null +++ b/include/linux/rseq_types.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_TYPES_H +#define _LINUX_RSEQ_TYPES_H + +#include + +#ifdef CONFIG_RSEQ +struct rseq; + +/** + * struct rseq_event - Storage for rseq related event management + * @all: Compound to initialize and clear the data efficiently + * @events: Compound to access events with a single load/store + * @sched_switch: True if the task was scheduled out + * @has_rseq: True if the task has a rseq pointer installed + */ +struct rseq_event { + union { + u32 all; + struct { + union { + u16 events; + struct { + u8 sched_switch; + }; + }; + + u8 has_rseq; + }; + }; +}; + +/** + * struct rseq_data - Storage for all rseq related data + * @usrptr: Pointer to the registered user space RSEQ memory + * @len: Length of the RSEQ region + * 
@sig: Signature of critial section abort IPs + * @event: Storage for event management + */ +struct rseq_data { + struct rseq __user *usrptr; + u32 len; + u32 sig; + struct rseq_event event; +}; + +#else /* CONFIG_RSEQ */ +struct rseq_data { }; +#endif /* !CONFIG_RSEQ */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 6627c527c2c7..15627769409d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1406,16 +1407,8 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_RSEQ - struct rseq __user *rseq; - u32 rseq_len; - u32 rseq_sig; - /* - * RmW on rseq_event_pending must be performed atomically - * with respect to preemption. - */ - bool rseq_event_pending; -# ifdef CONFIG_DEBUG_RSEQ + struct rseq_data rseq; +#ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for * validation of read-only fields. The struct rseq has a @@ -1423,7 +1416,6 @@ struct task_struct { * directly. Reserve a size large enough for the known fields. */ char rseq_fields[sizeof(struct rseq)]; -# endif #endif #ifdef CONFIG_SCHED_MM_CID -- cgit v1.2.3 From 5204be16790f305febbf331d0ec2cead7978b3c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:36 +0100 Subject: entry: Clean up header Clean up the include ordering, kernel-doc and other trivialities before making further changes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.590338411@linutronix.de --- include/linux/entry-common.h | 8 ++++---- include/linux/irq-entry-common.h | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 7177436f0f9e..c585221ff16b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -3,11 +3,11 @@ #define __LINUX_ENTRYCOMMON_H #include +#include #include +#include #include #include -#include -#include #include #include @@ -37,6 +37,7 @@ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) + #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ @@ -61,8 +62,7 @@ */ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); -long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long work); +long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** * syscall_enter_from_user_mode_work - Check and handle work before invoking diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index e5941df13901..9b1f386ffeb1 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -68,6 +68,7 @@ static __always_inline bool arch_in_rcu_eqs(void) { return false; } /** * enter_from_user_mode - Establish state when coming from user mode + * @regs: Pointer to currents pt_regs * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -357,6 +358,7 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * Conditional reschedule with additional sanity checks. 
*/ void raw_irqentry_exit_cond_resched(void); + #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched -- cgit v1.2.3 From 54a5ab56242f96555999aaa41228f77b4a76e386 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:38 +0100 Subject: entry: Remove syscall_enter_from_user_mode_prepare() Open code the only user in the x86 syscall code and reduce the zoo of functions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.652839989@linutronix.de --- include/linux/entry-common.h | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index c585221ff16b..75b194c34e18 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -45,23 +45,6 @@ SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) -/** - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts - * @regs: Pointer to currents pt_regs - * - * Invoked from architecture specific syscall entry code with interrupts - * disabled. The calling code has to be non-instrumentable. When the - * function returns all state is correct, interrupts are enabled and the - * subsequent functions can be instrumented. - * - * This handles lockdep, RCU (context tracking) and tracing state, i.e. - * the functionality provided by enter_from_user_mode(). - * - * This is invoked when there is extra architecture specific functionality - * to be done between establishing state and handling user mode entry work. 
- */ -void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); - long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); /** @@ -71,8 +54,8 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) * @syscall: The syscall number * * Invoked from architecture specific syscall entry code with interrupts - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra - * architecture specific work. + * enabled after invoking enter_from_user_mode(), enabling interrupts and + * extra architecture specific work. * * Returns: The original or a modified syscall number * @@ -108,8 +91,9 @@ static __always_inline long syscall_enter_from_user_mode_work(struct pt_regs *re * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This is combination of syscall_enter_from_user_mode_prepare() and - * syscall_enter_from_user_mode_work(). + * This is the combination of enter_from_user_mode() and + * syscall_enter_from_user_mode_work() to be used when there is no + * architecture specific work to be done between the two. * * Returns: The original or a modified syscall number. See * syscall_enter_from_user_mode_work() for further explanation. -- cgit v1.2.3 From 7702a9c2856794b6bf961b408eba3bacb753bd5b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:40 +0100 Subject: entry: Inline irqentry_enter/exit_from/to_user_mode() There is no point to have this as a function which just inlines enter_from_user_mode(). The function call overhead is larger than the function itself. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.715309918@linutronix.de --- include/linux/irq-entry-common.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 9b1f386ffeb1..83c9d841d9e1 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -278,7 +278,10 @@ static __always_inline void exit_to_user_mode(void) * * The function establishes state (lockdep, RCU (context tracking), tracing) */ -void irqentry_enter_from_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) +{ + enter_from_user_mode(regs); +} /** * irqentry_exit_to_user_mode - Interrupt exit work @@ -293,7 +296,13 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); * Interrupt exit is not invoking #1 which is the syscall specific one time * work. */ -void irqentry_exit_to_user_mode(struct pt_regs *regs); +static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + exit_to_user_mode_prepare(regs); + instrumentation_end(); + exit_to_user_mode(); +} #ifndef irqentry_state /** -- cgit v1.2.3 From 4fc9225d19ad6289c03340a520d35e3a6d1aebed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:42 +0100 Subject: sched: Move MM CID related functions to sched.h There is nothing mm specific in that and including mm.h can cause header recursion hell. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.778457951@linutronix.de --- include/linux/mm.h | 25 ------------------------- include/linux/sched.h | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..17cfbba9914c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2401,31 +2401,6 @@ struct zap_details { /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) -#ifdef CONFIG_SCHED_MM_CID -void sched_mm_cid_before_execve(struct task_struct *t); -void sched_mm_cid_after_execve(struct task_struct *t); -void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) -{ - return t->mm_cid; -} -#else -static inline void sched_mm_cid_before_execve(struct task_struct *t) { } -static inline void sched_mm_cid_after_execve(struct task_struct *t) { } -static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) -{ - /* - * Use the processor id as a fall-back when the mm cid feature is - * disabled. This provides functional per-cpu data structure accesses - * in user-space, althrough it won't provide the memory usage benefits. 
- */ - return raw_smp_processor_id(); -} -#endif - #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 15627769409d..24a9da7ca3e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2310,6 +2310,32 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo #define alloc_tag_restore(_tag, _old) do {} while (0) #endif +/* Avoids recursive inclusion hell */ +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_before_execve(struct task_struct *t); +void sched_mm_cid_after_execve(struct task_struct *t); +void sched_mm_cid_fork(struct task_struct *t); +void sched_mm_cid_exit_signals(struct task_struct *t); +static inline int task_mm_cid(struct task_struct *t) +{ + return t->mm_cid; +} +#else +static inline void sched_mm_cid_before_execve(struct task_struct *t) { } +static inline void sched_mm_cid_after_execve(struct task_struct *t) { } +static inline void sched_mm_cid_fork(struct task_struct *t) { } +static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline int task_mm_cid(struct task_struct *t) +{ + /* + * Use the processor id as a fall-back when the mm cid feature is + * disabled. This provides functional per-cpu data structure accesses + * in user-space, althrough it won't provide the memory usage benefits. + */ + return task_cpu(t); +} +#endif + #ifndef MODULE #ifndef COMPILE_OFFSETS -- cgit v1.2.3 From 4b7de6df20d43dd651031aef8d818fa5da981dbf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:45 +0100 Subject: rseq: Cache CPU ID and MM CID values In preparation for rewriting RSEQ exit to user space handling provide storage to cache the CPU ID and MM CID values which were written to user space. That prepares for a quick check, which avoids the update when nothing changed. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.841964081@linutronix.de --- include/linux/rseq.h | 7 +++++-- include/linux/rseq_types.h | 21 +++++++++++++++++++++ include/trace/events/rseq.h | 4 ++-- 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index ab91b1e6bb4a..d315a92afb36 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -57,6 +57,7 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { memset(&t->rseq, 0, sizeof(t->rseq)); + t->rseq.ids.cpu_cid = ~0ULL; } static inline void rseq_execve(struct task_struct *t) @@ -70,10 +71,12 @@ static inline void rseq_execve(struct task_struct *t) */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) + if (clone_flags & CLONE_VM) { rseq_reset(t); - else + } else { t->rseq = current->rseq; + t->rseq.ids.cpu_cid = ~0ULL; + } } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index f7a60c8eddc9..40901b033b92 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -30,18 +30,39 @@ struct rseq_event { }; }; +/** + * struct rseq_ids - Cache for ids, which need to be updated + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the + * compiler emit a single compare on 64-bit + * @cpu_id: The CPU ID which was written last to user space + * @mm_cid: The MM CID which was written last to user space + * + * @cpu_id and @mm_cid are updated when the data is written to user space. 
+ */ +struct rseq_ids { + union { + u64 cpu_cid; + struct { + u32 cpu_id; + u32 mm_cid; + }; + }; +}; + /** * struct rseq_data - Storage for all rseq related data * @usrptr: Pointer to the registered user space RSEQ memory * @len: Length of the RSEQ region * @sig: Signature of critial section abort IPs * @event: Storage for event management + * @ids: Storage for cached CPU ID and MM CID */ struct rseq_data { struct rseq __user *usrptr; u32 len; u32 sig; struct rseq_event event; + struct rseq_ids ids; }; #else /* CONFIG_RSEQ */ diff --git a/include/trace/events/rseq.h b/include/trace/events/rseq.h index 823b47d1ba1e..ce85d650bf4b 100644 --- a/include/trace/events/rseq.h +++ b/include/trace/events/rseq.h @@ -21,9 +21,9 @@ TRACE_EVENT(rseq_update, ), TP_fast_assign( - __entry->cpu_id = raw_smp_processor_id(); + __entry->cpu_id = t->rseq.ids.cpu_id; __entry->node_id = cpu_to_node(__entry->cpu_id); - __entry->mm_cid = task_mm_cid(t); + __entry->mm_cid = t->rseq.ids.mm_cid; ), TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id, -- cgit v1.2.3 From 2fc0e4b4126caadfa5772ba69276b350609584dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:48 +0100 Subject: rseq: Record interrupt from user space For RSEQ the only relevant reason to inspect and eventually fixup (abort) user space critical sections is when user space was interrupted and the task was scheduled out. If the user to kernel entry was from a syscall no fixup is required. If user space invokes a syscall from a critical section it can keep the pieces as documented. This is only supported on architectures which utilize the generic entry code. If your architecture does not use it, bad luck. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.905067101@linutronix.de --- include/linux/irq-entry-common.h | 3 ++- include/linux/rseq.h | 16 +++++++++++----- include/linux/rseq_entry.h | 18 ++++++++++++++++++ include/linux/rseq_types.h | 2 ++ 4 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 include/linux/rseq_entry.h (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 83c9d841d9e1..cb31fb84d7b4 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include @@ -281,6 +281,7 @@ static __always_inline void exit_to_user_mode(void) static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) { enter_from_user_mode(regs); + rseq_note_user_irq_entry(); } /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index d315a92afb36..a200836a6fe3 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -31,11 +31,17 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static __always_inline void rseq_exit_to_user_mode(void) { - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { - if (WARN_ON_ONCE(current->rseq.event.has_rseq && - current->rseq.event.events)) - current->rseq.event.events = 0; - } + struct rseq_event *ev = &current->rseq.event; + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it.
+ */ + ev->events = 0; } /* diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h new file mode 100644 index 000000000000..ce30e87ce1f5 --- /dev/null +++ b/include/linux/rseq_entry.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RSEQ_ENTRY_H +#define _LINUX_RSEQ_ENTRY_H + +#ifdef CONFIG_RSEQ +#include + +static __always_inline void rseq_note_user_irq_entry(void) +{ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) + current->rseq.event.user_irq = true; +} + +#else /* CONFIG_RSEQ */ +static inline void rseq_note_user_irq_entry(void) { } +#endif /* !CONFIG_RSEQ */ + +#endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 40901b033b92..80f6c398ef0f 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -12,6 +12,7 @@ struct rseq; * @all: Compound to initialize and clear the data efficiently * @events: Compound to access events with a single load/store * @sched_switch: True if the task was scheduled out + * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed */ struct rseq_event { @@ -22,6 +23,7 @@ struct rseq_event { u16 events; struct { u8 sched_switch; + u8 user_irq; }; }; -- cgit v1.2.3 From dab344753e021fe84c24f9d8b0b63cb5bcf463d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:50 +0100 Subject: rseq: Provide tracepoint wrappers for inline code Provide tracepoint wrappers for the upcoming RSEQ exit to user space inline fast path, so that the header can be safely included by code which defines actual trace points. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084306.967114316@linutronix.de --- include/linux/rseq_entry.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ce30e87ce1f5..5be507a127eb 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -5,6 +5,34 @@ #ifdef CONFIG_RSEQ #include +#include + +#ifdef CONFIG_TRACEPOINTS +DECLARE_TRACEPOINT(rseq_update); +DECLARE_TRACEPOINT(rseq_ip_fixup); +void __rseq_trace_update(struct task_struct *t); +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip); + +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) +{ + if (tracepoint_enabled(rseq_update) && ids) + __rseq_trace_update(t); +} + +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + if (tracepoint_enabled(rseq_ip_fixup)) + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); +} + +#else /* CONFIG_TRACEPOINT */ +static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } +static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) { } +#endif /* !CONFIG_TRACEPOINT */ + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) -- cgit v1.2.3 From 5412910487d0839111e4f2f3a6f33f6c9af9b007 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:52 +0100 Subject: rseq: Expose lightweight statistics in debugfs Analyzing the call frequency without actually using tracing is helpful for analysis of this infrastructure. 
The overhead is minimal as it just increments a per CPU counter associated to each operation. The debugfs readout provides a racy sum of all counters. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de --- include/linux/rseq.h | 16 --------------- include/linux/rseq_entry.h | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index a200836a6fe3..7f347c3a4af8 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -29,21 +29,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) } } -static __always_inline void rseq_exit_to_user_mode(void) -{ - struct rseq_event *ev = &current->rseq.event; - - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) - WARN_ON_ONCE(ev->sched_switch); - - /* - * Ensure that event (especially user_irq) is cleared when the - * interrupt did not result in a schedule and therefore the - * rseq processing did not clear it. - */ - ev->events = 0; -} - /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, * which clears TIF_NOTIFY_RESUME.
To avoid updating user space RSEQ in @@ -92,7 +77,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } -static inline void rseq_exit_to_user_mode(void) { } #endif /* !CONFIG_RSEQ */ #ifdef CONFIG_DEBUG_RSEQ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 5be507a127eb..ff9080b89be3 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -2,6 +2,37 @@ #ifndef _LINUX_RSEQ_ENTRY_H #define _LINUX_RSEQ_ENTRY_H +/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ +#ifdef CONFIG_RSEQ_STATS +#include + +struct rseq_stats { + unsigned long exit; + unsigned long signal; + unsigned long slowpath; + unsigned long ids; + unsigned long cs; + unsigned long clear; + unsigned long fixup; +}; + +DECLARE_PER_CPU(struct rseq_stats, rseq_stats); + +/* + * Slow path has interrupts and preemption enabled, but the fast path + * runs with interrupts disabled so there is no point in having the + * preemption checks implied in __this_cpu_inc() for every operation. 
+ */ +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_stat_inc(which) this_cpu_inc((which)) +#else +#define rseq_stat_inc(which) raw_cpu_inc((which)) +#endif + +#else /* CONFIG_RSEQ_STATS */ +#define rseq_stat_inc(x) do { } while (0) +#endif /* !CONFIG_RSEQ_STATS */ + #ifdef CONFIG_RSEQ #include @@ -39,8 +70,26 @@ static __always_inline void rseq_note_user_irq_entry(void) current->rseq.event.user_irq = true; } +static __always_inline void rseq_exit_to_user_mode(void) +{ + struct rseq_event *ev = &current->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + WARN_ON_ONCE(ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing did not clear it. + */ + ev->events = 0; +} + #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } +static inline void rseq_exit_to_user_mode(void) { } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ -- cgit v1.2.3 From 9c37cb6e80b8fcdddc1236ba42ffd438f511192b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:55 +0100 Subject: rseq: Provide static branch for runtime debugging Config based debug is rarely turned on and is not available easily when things go wrong. Provide a static branch to allow permanent integration of debug mechanisms along with the usual toggles in Kconfig, command line and debugfs.
Requested-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.089270547@linutronix.de --- include/linux/rseq_entry.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ff9080b89be3..ed8e5f89499b 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -34,6 +34,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include @@ -64,6 +65,8 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, unsigned long offset, unsigned long abort_ip) { } #endif /* !CONFIG_TRACEPOINT */ +DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) -- cgit v1.2.3 From abc850e7616c91ebaa3f5ba3617ab0a104d45039 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:44:57 +0100 Subject: rseq: Provide and use rseq_update_user_cs() Provide a straight forward implementation to check for and eventually clear/fixup critical sections in user space. The non-debug version does only the minimal sanity checks and aims for efficiency. There are two attack vectors, which are checked for: 1) An abort IP which is in the kernel address space. That would cause at least x86 to return to kernel space via IRET. 2) A rogue critical section descriptor with an abort IP pointing to some arbitrary address, which is not preceded by the RSEQ signature. If the section descriptors are invalid then the resulting misbehaviour of the user space application is not the kernels problem. 
The kernel provides a run-time switchable debug slow path, which implements the full zoo of checks including termination of the task when one of the gazillion conditions is not met. Replace the zoo in rseq.c with it and invoke it from the TIF_NOTIFY_RESUME handler. Move the remainders into the CONFIG_DEBUG_RSEQ section, which will be replaced and removed in a subsequent step. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.151465632@linutronix.de --- include/linux/rseq_entry.h | 206 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/rseq_types.h | 11 ++- 2 files changed, 216 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index ed8e5f89499b..f9510ce72211 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -36,6 +36,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #ifdef CONFIG_RSEQ #include #include +#include #include @@ -67,12 +68,217 @@ static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); +#ifdef RSEQ_BUILD_SLOW_PATH +#define rseq_inline +#else +#define rseq_inline __always_inline +#endif + +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); + static __always_inline void rseq_note_user_irq_entry(void) { if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) current->rseq.event.user_irq = true; } +/* + * Check whether there is a valid critical section and whether the + * instruction pointer in @regs is inside the critical section. + * + * - If the critical section is invalid, terminate the task. + * + * - If valid and the instruction pointer is inside, set it to the abort IP. + * + * - If valid and the instruction pointer is outside, clear the critical + * section address. 
+ * + * Returns true, if the section was valid and either fixup or clear was + * done, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when a invalid + * section was found. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. + */ + +#ifdef RSEQ_BUILD_SLOW_PATH +/* + * The debug version is put out of line, but kept here so the code stays + * together. + * + * @csaddr has already been checked by the caller to be in user space + */ +bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, + unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; + unsigned long ip = instruction_pointer(regs); + u64 __user *uc_head = (u64 __user *) ucs; + u32 usig, __user *uc_sig; + + scoped_user_rw_access(ucs, efault) { + /* + * Evaluate the user pile and exit if one of the conditions + * is not fulfilled. + */ + unsafe_get_user(start_ip, &ucs->start_ip, efault); + if (unlikely(start_ip >= tasksize)) + goto die; + /* If outside, just clear the critical section. */ + if (ip < start_ip) + goto clear; + + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + cs_end = start_ip + offset; + /* Check for overflow and wraparound */ + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) + goto die; + + /* If not inside, clear it. 
*/ + if (ip >= cs_end) + goto clear; + + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + /* Ensure it's "valid" */ + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) + goto die; + /* Validate that the abort IP is not in the critical section */ + if (unlikely(abort_ip - start_ip < offset)) + goto die; + + /* + * Check version and flags for 0. No point in emitting + * deprecated warnings before dying. That could be done in + * the slow path eventually, but *shrug*. + */ + unsafe_get_user(head, uc_head, efault); + if (unlikely(head)) + goto die; + + /* abort_ip - 4 is >= 0. See abort_ip check above */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* If not in interrupt from user context, let it die */ + if (unlikely(!t->rseq.event.user_irq)) + goto die; + } + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + +#endif /* RSEQ_BUILD_SLOW_PATH */ + +/* + * This only ensures that abort_ip is in the user address space and + * validates that it is preceded by the signature. + * + * No other sanity checks are done here, that's what the debug code is for. 
+ */ +static rseq_inline bool +rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) +{ + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; + unsigned long ip = instruction_pointer(regs); + u64 start_ip, abort_ip, offset; + u32 usig, __user *uc_sig; + + rseq_stat_inc(rseq_stats.cs); + + if (unlikely(csaddr >= TASK_SIZE)) { + t->rseq.event.fatal = true; + return false; + } + + if (static_branch_unlikely(&rseq_debug_enabled)) + return rseq_debug_update_user_cs(t, regs, csaddr); + + scoped_user_rw_access(ucs, efault) { + unsafe_get_user(start_ip, &ucs->start_ip, efault); + unsafe_get_user(offset, &ucs->post_commit_offset, efault); + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); + + /* + * No sanity checks. If user space screwed it up, it can + * keep the pieces. That's what debug code is for. + * + * If outside, just clear the critical section. + */ + if (ip - start_ip >= offset) + goto clear; + + /* + * Two requirements for @abort_ip: + * - Must be in user space as x86 IRET would happily return to + * the kernel. + * - The four bytes preceding the instruction at @abort_ip must + * contain the signature. + * + * The latter protects against the following attack vector: + * + * An attacker with limited abilities to write, creates a critical + * section descriptor, sets the abort IP to a library function or + * some other ROP gadget and stores the address of the descriptor + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP + * protection. 
+ */ + if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) + goto die; + + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); + unsafe_get_user(usig, uc_sig, efault); + if (unlikely(usig != t->rseq.sig)) + goto die; + + /* Invalidate the critical section */ + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + /* Update the instruction pointer */ + instruction_pointer_set(regs, (unsigned long)abort_ip); + rseq_stat_inc(rseq_stats.fixup); + break; + clear: + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); + rseq_stat_inc(rseq_stats.clear); + abort_ip = 0ULL; + } + + if (unlikely(abort_ip)) + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = &current->rseq.event; diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 80f6c398ef0f..7c123947bb98 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -14,10 +14,12 @@ struct rseq; * @sched_switch: True if the task was scheduled out * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed + * @error: Compound error code for the slow path to analyze + * @fatal: User space data corrupted or invalid */ struct rseq_event { union { - u32 all; + u64 all; struct { union { u16 events; @@ -28,6 +30,13 @@ struct rseq_event { }; u8 has_rseq; + u8 __pad; + union { + u16 error; + struct { + u8 fatal; + }; + }; }; }; }; -- cgit v1.2.3 From c1cbad8f99b5c73c6af6e96acbfa64eaaaeb085f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:02 +0100 Subject: rseq: Make exit debugging static branch based Disconnect it from the config switch and use the static debug branch. This is a temporary measure for validating the rework.
At the end this check needs to be hidden behind lockdep as it has nothing to do with the other debug infrastructure, which mainly aids user space debugging by enabling a zoo of checks which terminate misbehaving tasks instead of letting them keep the hard to diagnose pieces. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.272660745@linutronix.de --- include/linux/rseq_entry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index f9510ce72211..5bdcf5b5f595 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -285,7 +285,7 @@ static __always_inline void rseq_exit_to_user_mode(void) rseq_stat_inc(rseq_stats.exit); - if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) + if (static_branch_unlikely(&rseq_debug_enabled)) WARN_ON_ONCE(ev->sched_switch); /* -- cgit v1.2.3 From eaa9088d568c84afd72fa32dbe01833aef861d0d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:05 +0100 Subject: rseq: Use static branch for syscall exit debug when GENERIC_IRQ_ENTRY=y Make the syscall exit debug mechanism available via the static branch on architectures which utilize the generic entry code. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.333440475@linutronix.de --- include/linux/entry-common.h | 2 +- include/linux/rseq_entry.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 75b194c34e18..d967184ae08f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -146,7 +146,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) local_irq_enable(); } - rseq_syscall(regs); + rseq_debug_syscall_return(regs); /* * Do one-time syscall specific work. If these work items are diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 5bdcf5b5f595..fb53a6ff05d7 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -296,9 +296,18 @@ static __always_inline void rseq_exit_to_user_mode(void) ev->events = 0; } +void __rseq_debug_syscall_return(struct pt_regs *regs); + +static inline void rseq_debug_syscall_return(struct pt_regs *regs) +{ + if (static_branch_unlikely(&rseq_debug_enabled)) + __rseq_debug_syscall_return(regs); +} + #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } static inline void rseq_exit_to_user_mode(void) { } +static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ -- cgit v1.2.3 From 0f085b41880e3140efa6941ff2b8fd43bac4d659 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:08 +0100 Subject: rseq: Provide and use rseq_set_ids() Provide a new and straight forward implementation to set the IDs (CPU ID, Node ID and MM CID), which can be later inlined into the fast path. 
It does all operations in one scoped_user_rw_access() section and retrieves also the critical section member (rseq::cs_rseq) from user space to avoid another user..begin/end() pair. This is in preparation for optimizing the fast path to avoid extra work when not required. On rseq registration set the CPU ID fields to RSEQ_CPU_ID_UNINITIALIZED and node and MM CID to zero. That's the same as the kernel internal reset values. That makes the debug validation in the exit code work correctly on the first exit to user space. Use it to replace the whole related zoo in rseq.c Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.393972266@linutronix.de --- include/linux/rseq.h | 16 ++++++--- include/linux/rseq_entry.h | 89 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 10 ------ 3 files changed, 100 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 7f347c3a4af8..92f9cd49489b 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -5,6 +5,8 @@ #ifdef CONFIG_RSEQ #include +#include + void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) @@ -48,7 +50,7 @@ static inline void rseq_virt_userspace_exit(void) static inline void rseq_reset(struct task_struct *t) { memset(&t->rseq, 0, sizeof(t->rseq)); - t->rseq.ids.cpu_cid = ~0ULL; + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; } static inline void rseq_execve(struct task_struct *t) @@ -59,15 +61,19 @@ static inline void rseq_execve(struct task_struct *t) /* * If parent process has a registered restartable sequences area, the * child inherits. Unregister rseq for a clone with CLONE_VM set. 
+ * + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault + * on the COW page on exit to user space, when the child stays on the same + * CPU as the parent. That's obviously not guaranteed, but in overcommit + * scenarios it is more likely and optimizes for the fork/exec case without + * taking the fault. */ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { - if (clone_flags & CLONE_VM) { + if (clone_flags & CLONE_VM) rseq_reset(t); - } else { + else t->rseq = current->rseq; - t->rseq.ids.cpu_cid = ~0ULL; - } } #else /* CONFIG_RSEQ */ diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index fb53a6ff05d7..37444e80fd45 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -75,6 +75,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); #endif bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); +bool rseq_debug_validate_ids(struct task_struct *t); static __always_inline void rseq_note_user_irq_entry(void) { @@ -194,6 +195,43 @@ efault: return false; } +/* + * On debug kernels validate that user space did not mess with it if the + * debug branch is enabled. + */ +bool rseq_debug_validate_ids(struct task_struct *t) +{ + struct rseq __user *rseq = t->rseq.usrptr; + u32 cpu_id, uval, node_id; + + /* + * On the first exit after registering the rseq region CPU ID is + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! + */ + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? 
+ cpu_to_node(t->rseq.ids.cpu_id) : 0; + + scoped_user_read_access(rseq, efault) { + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); + if (cpu_id != t->rseq.ids.cpu_id) + goto die; + unsafe_get_user(uval, &rseq->cpu_id, efault); + if (uval != cpu_id) + goto die; + unsafe_get_user(uval, &rseq->node_id, efault); + if (uval != node_id) + goto die; + unsafe_get_user(uval, &rseq->mm_cid, efault); + if (uval != t->rseq.ids.mm_cid) + goto die; + } + return true; +die: + t->rseq.event.fatal = true; +efault: + return false; +} + #endif /* RSEQ_BUILD_SLOW_PATH */ /* @@ -279,6 +317,57 @@ efault: return false; } +/* + * Updates CPU ID, Node ID and MM CID and reads the critical section + * address, when @csaddr != NULL. This allows to put the ID update and the + * read under the same uaccess region to spare a separate begin/end. + * + * As this is either invoked from a C wrapper with @csaddr = NULL or from + * the fast path code with a valid pointer, a clever compiler should be + * able to optimize the read out. Spares a duplicate implementation. + * + * Returns true, if the operation was successful, false otherwise. + * + * In the failure case task::rseq_event::fatal is set when invalid data + * was found on debug kernels. It's clear when the failure was an unresolved page + * fault. + * + * If inlined into the exit to user path with interrupts disabled, the + * caller has to protect against page faults with pagefault_disable(). + * + * In preemptible task context this would be counterproductive as the page + * faults could not be fully resolved. As a consequence unresolved page + * faults in task context are fatal too. 
+ */ +static rseq_inline +bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, + u32 node_id, u64 *csaddr) +{ + struct rseq __user *rseq = t->rseq.usrptr; + + if (static_branch_unlikely(&rseq_debug_enabled)) { + if (!rseq_debug_validate_ids(t)) + return false; + } + + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); + unsafe_put_user(node_id, &rseq->node_id, efault); + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); + if (csaddr) + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); + } + + /* Cache the new values */ + t->rseq.ids.cpu_cid = ids->cpu_cid; + rseq_stat_inc(rseq_stats.ids); + rseq_trace_update(t, ids); + return true; +efault: + return false; +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = &current->rseq.event; diff --git a/include/linux/sched.h b/include/linux/sched.h index 24a9da7ca3e7..e47abc8685d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include @@ -1408,15 +1407,6 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; -#ifdef CONFIG_DEBUG_RSEQ - /* - * This is a place holder to save a copy of the rseq fields for - * validation of read-only fields. The struct rseq has a - * variable-length array at the end, so it cannot be used - * directly. Reserve a size large enough for the known fields. - */ - char rseq_fields[sizeof(struct rseq)]; -#endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ -- cgit v1.2.3 From 9f6ffd4cebda86841700775de3213f22bb0ea22d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:10 +0100 Subject: rseq: Separate the signal delivery path Completely separate the signal delivery path from the notify handler as they have different semantics versus the event handling.
The signal delivery only needs to ensure that the interrupted user context was not in a critical section or the section is aborted before it switches to the signal frame context. The signal frame context does not have the original instruction pointer anymore, so that can't be handled on exit to user space. No point in updating the CPU/CID ids as they might change again before the task returns to user space for real. The fast path optimization, which checks for the 'entry from user via interrupt' condition is only available for architectures which use the generic entry code. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.455429038@linutronix.de --- include/linux/rseq.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 92f9cd49489b..f5a43188023f 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,22 +7,33 @@ #include -void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); +void __rseq_handle_notify_resume(struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { if (current->rseq.event.has_rseq) - __rseq_handle_notify_resume(NULL, regs); + __rseq_handle_notify_resume(regs); } +void __rseq_signal_deliver(int sig, struct pt_regs *regs); + +/* + * Invoked from signal delivery to fixup based on the register context before + * switching to the signal delivery context. 
+ */ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { - if (current->rseq.event.has_rseq) { - current->rseq.event.sched_switch = true; - __rseq_handle_notify_resume(ksig, regs); + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) + __rseq_signal_deliver(ksig->sig, regs); + } else { + if (current->rseq.event.has_rseq) + __rseq_signal_deliver(ksig->sig, regs); } } +/* Raised from context switch and exevce to force evaluation on exit to user */ static inline void rseq_sched_switch_event(struct task_struct *t) { if (t->rseq.event.has_rseq) { -- cgit v1.2.3 From e2d4f42271155045a49b89530f2c06ad8e9f1a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:12 +0100 Subject: rseq: Rework the TIF_NOTIFY handler Replace the whole logic with a new implementation, which is shared with signal delivery and the upcoming exit fast path. Contrary to the original implementation, this ignores invocations from KVM/IO-uring, which invoke resume_user_mode_work() with the @regs argument set to NULL. The original implementation updated the CPU/Node/MM CID fields, but that was just a side effect, which was addressing the problem that this invocation cleared TIF_NOTIFY_RESUME, which in turn could cause an update on return to user space to be lost. This problem has been addressed differently, so that it's no longer required to do that update before entering the guest. That might be considered a user visible change, when the host's thread TLS memory is mapped into the guest, but as this was never intentionally supported, this abuse of kernel internal implementation details is not considered an ABI break. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.517640811@linutronix.de --- include/linux/rseq_entry.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 37444e80fd45..aa1c0464a16c 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -368,6 +368,35 @@ efault: return false; } +/* + * Update user space with new IDs and conditionally check whether the task + * is in a critical section. + */ +static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, + struct rseq_ids *ids, u32 node_id) +{ + u64 csaddr; + + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) + return false; + + /* + * On architectures which utilize the generic entry code this + * allows to skip the critical section when the entry was not from + * a user space interrupt, unless debug mode is enabled. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + if (!static_branch_unlikely(&rseq_debug_enabled)) { + if (likely(!t->rseq.event.user_irq)) + return true; + } + } + if (likely(!csaddr)) + return true; + /* Sigh, this really needs to do work */ + return rseq_update_user_cs(t, regs, csaddr); +} + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; -- cgit v1.2.3 From 39a167560a61f913560ba803a96dbe6c15239f5c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:14 +0100 Subject: rseq: Optimize event setting After removing the various condition bits earlier it turns out that one extra information is needed to avoid setting event::sched_switch and TIF_NOTIFY_RESUME unconditionally on every context switch. 
The update of the RSEQ user space memory is only required, when either the task was interrupted in user space and schedules or the CPU or MM CID changes in schedule() independent of the entry mode Right now only the interrupt from user information is available. Add an event flag, which is set when the CPU or MM CID or both change. Evaluate this event in the scheduler to decide whether the sched_switch event and the TIF bit need to be set. It's an extra conditional in context_switch(), but the downside of unconditionally handling RSEQ after a context switch to user is way more significant. The utilized boolean logic minimizes this to a single conditional branch. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.578058898@linutronix.de --- include/linux/rseq.h | 81 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/rseq_types.h | 11 +++++-- 2 files changed, 83 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index f5a43188023f..abfbeb42d1a2 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -11,7 +11,8 @@ void __rseq_handle_notify_resume(struct pt_regs *regs); static inline void rseq_handle_notify_resume(struct pt_regs *regs) { - if (current->rseq.event.has_rseq) + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) __rseq_handle_notify_resume(regs); } @@ -33,12 +34,75 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg } } -/* Raised from context switch and exevce to force evaluation on exit to user */ -static inline void rseq_sched_switch_event(struct task_struct *t) +static inline void rseq_raise_notify_resume(struct task_struct *t) { - if (t->rseq.event.has_rseq) { - t->rseq.event.sched_switch = true; - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + 
set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); +} + +/* Invoked from context switch to force evaluation on exit to user */ +static __always_inline void rseq_sched_switch_event(struct task_struct *t) +{ + struct rseq_event *ev = &t->rseq.event; + + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { + /* + * Avoid a boat load of conditionals by using simple logic + * to determine whether NOTIFY_RESUME needs to be raised. + * + * It's required when the CPU or MM CID has changed or + * the entry was from user space. + */ + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; + + if (raise) { + ev->sched_switch = true; + rseq_raise_notify_resume(t); + } + } else { + if (ev->has_rseq) { + t->rseq.event.sched_switch = true; + rseq_raise_notify_resume(t); + } + } +} + +/* + * Invoked from __set_task_cpu() when a task migrates to enforce an IDs + * update. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) +{ + t->rseq.event.ids_changed = true; +} + +/* + * Invoked from switch_mm_cid() in context switch when the task gets a MM + * CID assigned. + * + * This does not raise TIF_NOTIFY_RESUME as that happens in + * rseq_sched_switch_event(). + */ +static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) +{ + /* + * Requires a comparison as the switch_mm_cid() code does not + * provide a conditional for it readily. So avoid excessive updates + * when nothing changes. 
+ */ + if (t->rseq.ids.mm_cid != cid) + t->rseq.event.ids_changed = true; +} + +/* Enforce a full update after RSEQ registration and when execve() failed */ +static inline void rseq_force_update(void) +{ + if (current->rseq.event.has_rseq) { + current->rseq.event.ids_changed = true; + current->rseq.event.sched_switch = true; + rseq_raise_notify_resume(current); } } @@ -55,7 +119,7 @@ static inline void rseq_sched_switch_event(struct task_struct *t) static inline void rseq_virt_userspace_exit(void) { if (current->rseq.event.sched_switch) - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + rseq_raise_notify_resume(current); } static inline void rseq_reset(struct task_struct *t) @@ -91,6 +155,9 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } +static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } +static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } +static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } static inline void rseq_execve(struct task_struct *t) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 7c123947bb98..a1389fff4fca 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -11,20 +11,27 @@ struct rseq; * struct rseq_event - Storage for rseq related event management * @all: Compound to initialize and clear the data efficiently * @events: Compound to access events with a single load/store - * @sched_switch: True if the task was scheduled out + * @sched_switch: True if the task was scheduled and needs update on + * exit to user + * @ids_changed: Indicator that IDs need to 
be updated * @user_irq: True on interrupt entry from user mode * @has_rseq: True if the task has a rseq pointer installed * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid + * + * @sched_switch and @ids_changed must be adjacent and the combo must be + * 16bit aligned to allow a single store, when both are set at the same + * time in the scheduler. */ struct rseq_event { union { u64 all; struct { union { - u16 events; + u32 events; struct { u8 sched_switch; + u8 ids_changed; u8 user_irq; }; }; -- cgit v1.2.3 From 05b44aef709cae5e4274590f050cf35049dcc24e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:17 +0100 Subject: rseq: Implement fast path for exit to user Implement the actual logic for handling RSEQ updates in a fast path after handling the TIF work and at the point where the task is actually returning to user space. This is the right point to do that because at this point the CPU and the MM CID are stable and cannot longer change due to yet another reschedule. That happens when the task is handling it via TIF_NOTIFY_RESUME in resume_user_mode_work(), which is invoked from the exit to user mode work loop. The function is invoked after the TIF work is handled and runs with interrupts disabled, which means it cannot resolve page faults. It therefore disables page faults and in case the access to the user space memory faults, it: - notes the fail in the event struct - raises TIF_NOTIFY_RESUME - returns false to the caller The caller has to go back to the TIF work, which runs with interrupts enabled and therefore can resolve the page faults. This happens mostly on fork() when the memory is marked COW. If the user memory inspection finds invalid data, the function returns false as well and sets the fatal flag in the event struct along with TIF_NOTIFY_RESUME. The slow path notify handler has to evaluate that flag and terminate the task with SIGSEGV as documented. 
The initial decision to invoke any of this is based on one flag in the event struct: @sched_switch. The decision is in pseudo ASM: load tsk::event::sched_switch jnz inspect_user_space mov $0, tsk::event::events ... leave So for the common case where the task was not scheduled out, this really boils down to three instructions before going out if the compiler is not completely stupid (and yes, some of them are). If the condition is true, then it checks whether CPU ID or MM CID have changed. If so, then the CPU/MM IDs have to be updated and are thereby cached for the next round. The update unconditionally retrieves the user space critical section address to spare another user*begin/end() pair. If that's not zero and tsk::event::user_irq is set, then the critical section is analyzed and acted upon. If either zero or the entry came via syscall the critical section analysis is skipped. If the comparison is false then the critical section has to be analyzed because the event flag is then only true when entry from user was by interrupt. This is provided without the actual hookup to let reviewers focus on the implementation details. The hookup happens in the next step. Note: As with quite some other optimizations this depends on the generic entry infrastructure and is not enabled to be sucked into random architecture implementations. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.638929615@linutronix.de --- include/linux/rseq_entry.h | 133 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/rseq_types.h | 3 + 2 files changed, 133 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index aa1c0464a16c..3f13be7301fa 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -10,6 +10,7 @@ struct rseq_stats { unsigned long exit; unsigned long signal; unsigned long slowpath; + unsigned long fastpath; unsigned long ids; unsigned long cs; unsigned long clear; @@ -245,12 +246,13 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c { struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; unsigned long ip = instruction_pointer(regs); + unsigned long tasksize = TASK_SIZE; u64 start_ip, abort_ip, offset; u32 usig, __user *uc_sig; rseq_stat_inc(rseq_stats.cs); - if (unlikely(csaddr >= TASK_SIZE)) { + if (unlikely(csaddr >= tasksize)) { t->rseq.event.fatal = true; return false; } @@ -287,7 +289,7 @@ rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long c * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP * protection. */ - if (abort_ip >= TASK_SIZE || abort_ip < sizeof(*uc_sig)) + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) goto die; /* The address is guaranteed to be >= 0 and < TASK_SIZE */ @@ -397,6 +399,128 @@ static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *r return rseq_update_user_cs(t, regs, csaddr); } +/* + * If you want to use this then convert your architecture to the generic + * entry code. I'm tired of building workarounds for people who can't be + * bothered to make the maintenance of generic infrastructure less + * burdensome. 
Just sucking everything into the architecture code and + * thereby making others chase the horrible hacks and keep them working is + * neither acceptable nor sustainable. + */ +#ifdef CONFIG_GENERIC_ENTRY + +/* + * This is inlined into the exit path because: + * + * 1) It's a one time comparison in the fast path when there is no event to + * handle + * + * 2) The access to the user space rseq memory (TLS) is unlikely to fault + * so the straight inline operation is: + * + * - Four 32-bit stores only if CPU ID/ MM CID need to be updated + * - One 64-bit load to retrieve the critical section address + * + * 3) In the unlikely case that the critical section address is != NULL: + * + * - One 64-bit load to retrieve the start IP + * - One 64-bit load to retrieve the offset for calculating the end + * - One 64-bit load to retrieve the abort IP + * - One 64-bit load to retrieve the signature + * - One store to clear the critical section address + * + * The non-debug case implements only the minimal required checking. It + * provides protection against a rogue abort IP in kernel space, which + * would be exploitable at least on x86, and also against a rogue CS + * descriptor by checking the signature at the abort IP. Any fallout from + * invalid critical section descriptors is a user space problem. The debug + * case provides the full set of checks and terminates the task if a + * condition is not met. + * + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq + * slow path there will handle the failure. 
+ */ +static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) +{ + /* + * Page faults need to be disabled as this is called with + * interrupts disabled + */ + guard(pagefault)(); + if (likely(!t->rseq.event.ids_changed)) { + struct rseq __user *rseq = t->rseq.usrptr; + /* + * If IDs have not changed rseq_event::user_irq must be true + * See rseq_sched_switch_event(). + */ + u64 csaddr; + + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) + return false; + + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) + return false; + } + return true; + } + + struct rseq_ids ids = { + .cpu_id = task_cpu(t), + .mm_cid = task_mm_cid(t), + }; + u32 node_id = cpu_to_node(ids.cpu_id); + + return rseq_update_usr(t, regs, &ids, node_id); +} + +static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + struct task_struct *t = current; + + /* + * If the task did not go through schedule or got the flag enforced + * by the rseq syscall or execve, then nothing to do here. + * + * CPU ID and MM CID can only change when going through a context + * switch. + * + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit + * only when rseq_event::has_rseq is true. That conditional is + * required to avoid setting the TIF bit if RSEQ is not registered + * for a task. rseq_event::sched_switch is cleared when RSEQ is + * unregistered by a task so it's sufficient to check for the + * sched_switch bit alone. + * + * A sane compiler requires three instructions for the nothing to do + * case including clearing the events, but your mileage might vary. 
+ */ + if (unlikely((t->rseq.event.sched_switch))) { + rseq_stat_inc(rseq_stats.fastpath); + + if (unlikely(!rseq_exit_user_update(regs, t))) + return true; + } + /* Clear state so next entry starts from a clean slate */ + t->rseq.event.events = 0; + return false; +} + +static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { + current->rseq.event.slowpath = true; + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + return true; + } + return false; +} + +#else /* CONFIG_GENERIC_ENTRY */ +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } +#endif /* !CONFIG_GENERIC_ENTRY */ + static __always_inline void rseq_exit_to_user_mode(void) { struct rseq_event *ev = ¤t->rseq.event; @@ -421,9 +545,12 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs) if (static_branch_unlikely(&rseq_debug_enabled)) __rseq_debug_syscall_return(regs); } - #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +{ + return false; +} static inline void rseq_exit_to_user_mode(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a1389fff4fca..9c7a34154de8 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -18,6 +18,8 @@ struct rseq; * @has_rseq: True if the task has a rseq pointer installed * @error: Compound error code for the slow path to analyze * @fatal: User space data corrupted or invalid + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME + * is required * * @sched_switch and @ids_changed must be adjacent and the combo must be * 16bit aligned to allow a single store, when both are set at the same @@ -42,6 +44,7 @@ struct rseq_event { u16 error; struct { u8 fatal; + u8 slowpath; }; }; }; 
-- cgit v1.2.3 From 3db6b38dfe640207da706b286d4181237391f5bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:19 +0100 Subject: rseq: Switch to fast path processing on exit to user Now that all bits and pieces are in place, hook the RSEQ handling fast path function into exit_to_user_mode_prepare() after the TIF work bits have been handled. In case of fast path failure, TIF_NOTIFY_RESUME has been raised and the caller needs to take another turn through the TIF handling slow path. This only works for architectures which use the generic entry code. Architectures which still have their own incomplete hacks are not supported and won't be. This results in the following improvements: Kernel build Before After Reduction exit to user 80692981 80514451 signal checks: 32581 121 99% slowpath runs: 1201408 1.49% 198 0.00% 100% fastpath runs: 675941 0.84% N/A id updates: 1233989 1.53% 50541 0.06% 96% cs checks: 1125366 1.39% 0 0.00% 100% cs cleared: 1125366 100% 0 100% cs fixup: 0 0% 0 RSEQ selftests Before After Reduction exit to user: 386281778 387373750 signal checks: 35661203 0 100% slowpath runs: 140542396 36.38% 100 0.00% 100% fastpath runs: 9509789 2.51% N/A id updates: 176203599 45.62% 9087994 2.35% 95% cs checks: 175587856 45.46% 4728394 1.22% 98% cs cleared: 172359544 98.16% 1319307 27.90% 99% cs fixup: 3228312 1.84% 3409087 72.10% The 'cs cleared' and 'cs fixup' percentages are not relative to the exit to user invocations, they are relative to the actual 'cs check' invocations. While some of this could have been avoided in the original code, like the obvious clearing of CS when it's already clear, the main problem of going through TIF_NOTIFY_RESUME cannot be solved. In some workloads the RSEQ notify handler is invoked more than once before going out to user space. Doing this once when everything has stabilized is the only solution to avoid this. 
The initial attempt to completely decouple it from the TIF work turned out to be suboptimal for workloads, which do a lot of quick and short system calls. Even if the fast path decision is only 4 instructions (including a conditional branch), this adds up quickly and becomes measurable when the rate for actually having to handle rseq is in the low single digit percentage range of user/kernel transitions. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.701201365@linutronix.de --- include/linux/irq-entry-common.h | 7 ++----- include/linux/resume_user_mode.h | 2 +- include/linux/rseq.h | 18 ++++++++++++------ 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index cb31fb84d7b4..8f5ceeaaaea5 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -197,11 +197,8 @@ static __always_inline void arch_exit_to_user_mode(void) { } */ void arch_do_signal_or_restart(struct pt_regs *regs); -/** - * exit_to_user_mode_loop - do any pending work before leaving to user space - */ -unsigned long exit_to_user_mode_loop(struct pt_regs *regs, - unsigned long ti_work); +/* Handle pending TIF work */ +unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index dd3bf7da90a8..bf92227c78d0 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -59,7 +59,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); - rseq_handle_notify_resume(regs); + rseq_handle_slowpath(regs); } #endif /* LINUX_RESUME_USER_MODE_H */ diff --git 
a/include/linux/rseq.h b/include/linux/rseq.h index abfbeb42d1a2..ded4baa34586 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -7,13 +7,19 @@ #include -void __rseq_handle_notify_resume(struct pt_regs *regs); +void __rseq_handle_slowpath(struct pt_regs *regs); -static inline void rseq_handle_notify_resume(struct pt_regs *regs) +/* Invoked from resume_user_mode_work() */ +static inline void rseq_handle_slowpath(struct pt_regs *regs) { - /* '&' is intentional to spare one conditional branch */ - if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) - __rseq_handle_notify_resume(regs); + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (current->rseq.event.slowpath) + __rseq_handle_slowpath(regs); + } else { + /* '&' is intentional to spare one conditional branch */ + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) + __rseq_handle_slowpath(regs); + } } void __rseq_signal_deliver(int sig, struct pt_regs *regs); @@ -152,7 +158,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) } #else /* CONFIG_RSEQ */ -static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } +static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } -- cgit v1.2.3 From 70fe25a3bc53a891f0e6184c12bd55cc524cb13b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:21 +0100 Subject: entry: Split up exit_to_user_mode_prepare() exit_to_user_mode_prepare() is used for both interrupts and syscalls, but there is extra rseq work, which is only required in the interrupt exit case. Split up the function and provide wrappers for syscalls and interrupts, which allows to separate the rseq exit work in the next step. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.782234789@linutronix.de --- include/linux/entry-common.h | 2 +- include/linux/irq-entry-common.h | 49 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d967184ae08f..87efb38b7081 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -156,7 +156,7 @@ static __always_inline void syscall_exit_to_user_mode_work(struct pt_regs *regs) if (unlikely(work & SYSCALL_WORK_EXIT)) syscall_exit_work(regs, work); local_irq_disable_exit_to_user(); - exit_to_user_mode_prepare(regs); + syscall_exit_to_user_mode_prepare(regs); } /** diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 8f5ceeaaaea5..5ea61722bb70 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -201,7 +201,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs); unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); /** - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack * * 1) check that interrupts are disabled @@ -209,8 +209,10 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work * 3) call exit_to_user_mode_loop() if any flags from * EXIT_TO_USER_MODE_WORK are set * 4) check that interrupts are still disabled + * + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long ti_work; @@ -224,15 +226,52 @@ static 
__always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) ti_work = exit_to_user_mode_loop(regs, ti_work); arch_exit_to_user_mode_prepare(regs, ti_work); +} - rseq_exit_to_user_mode(); - +static __always_inline void __exit_to_user_mode_validate(void) +{ /* Ensure that kernel state is sane for a return to userspace */ kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } +/* Temporary workaround to keep ARM64 alive */ +static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. + */ +static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + +/** + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required + * @regs: Pointer to pt_regs on entry stack + * + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for + * syscalls and interrupts. 
+ */ +static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) +{ + __exit_to_user_mode_prepare(regs); + rseq_exit_to_user_mode(); + __exit_to_user_mode_validate(); +} + /** * exit_to_user_mode - Fixup state when exiting to user mode * @@ -297,7 +336,7 @@ static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) { instrumentation_begin(); - exit_to_user_mode_prepare(regs); + irqentry_exit_to_user_mode_prepare(regs); instrumentation_end(); exit_to_user_mode(); } -- cgit v1.2.3 From 7a5201ea1907534efe3a6e9c001ef4c0257cb3f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:24 +0100 Subject: rseq: Split up rseq_exit_to_user_mode() Separate the interrupt and syscall exit handling. Syscall exit does not require to clear the user_irq bit as it can't be set. On interrupt exit it can be set when the interrupt did not result in a scheduling event and therefore the return path did not invoke the TIF work handling, which would have cleared it. The debug check for the event state is also not really required even when debug mode is enabled via the static key. Debug mode is largely aiding user space by enabling a larger amount of validation checks, which cause a segfault when a malformed critical section is detected. In production mode the critical section handling takes the content mostly as is and lets user space keep the pieces when it screwed up. On kernel changes in that area the state check is useful, but that can be done when lockdep is enabled, which is anyway a required test scenario for fundamental changes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.842785700@linutronix.de --- include/linux/irq-entry-common.h | 6 +++--- include/linux/rseq_entry.h | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index 5ea61722bb70..bc5d178e0b91 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -240,7 +240,7 @@ static __always_inline void __exit_to_user_mode_validate(void) static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -254,7 +254,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -268,7 +268,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { __exit_to_user_mode_prepare(regs); - rseq_exit_to_user_mode(); + rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 3f13be7301fa..958a63eeb2d3 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -521,7 +521,37 @@ static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } #endif /* !CONFIG_GENERIC_ENTRY */ -static __always_inline void rseq_exit_to_user_mode(void) +static 
__always_inline void rseq_syscall_exit_to_user_mode(void) +{ + struct rseq_event *ev = &current->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + /* Needed to remove the store for the !lockdep case */ + if (IS_ENABLED(CONFIG_LOCKDEP)) { + WARN_ON_ONCE(ev->sched_switch); + ev->events = 0; + } +} + +static __always_inline void rseq_irqentry_exit_to_user_mode(void) +{ + struct rseq_event *ev = &current->rseq.event; + + rseq_stat_inc(rseq_stats.exit); + + lockdep_assert_once(!ev->sched_switch); + + /* + * Ensure that event (especially user_irq) is cleared when the + * interrupt did not result in a schedule and therefore the + * rseq processing could not clear it. + */ + ev->events = 0; +} + +/* Required to keep ARM64 working */ +static __always_inline void rseq_exit_to_user_mode_legacy(void) { struct rseq_event *ev = &current->rseq.event; @@ -551,7 +581,9 @@ static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } -static inline void rseq_exit_to_user_mode(void) { } +static inline void rseq_syscall_exit_to_user_mode(void) { } +static inline void rseq_irqentry_exit_to_user_mode(void) { } +static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } #endif /* !CONFIG_RSEQ */ -- cgit v1.2.3 From 32034df66b5f49626aa450ceaf1849a08d87906e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 27 Oct 2025 09:45:26 +0100 Subject: rseq: Switch to TIF_RSEQ if supported TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is suboptimal especially with the RSEQ fast path depending on it, but not really handling it. Define a separate TIF_RSEQ in the generic TIF space and enable the full separation of fast and slow path for architectures which utilize that. That avoids the hassle with invocations of resume_user_mode_work() from hypervisors, which clear TIF_NOTIFY_RESUME.
It makes the therefore required re-evaluation at the end of vcpu_run() a NOOP on architectures which utilize the generic TIF space and have a separate TIF_RSEQ. The hypervisor TIF handling does not include the separate TIF_RSEQ as there is no point in doing so. The guest does neither know nor care about the VMM host applications RSEQ state. That state is only relevant when the ioctl() returns to user space. The fastpath implementation still utilizes TIF_NOTIFY_RESUME for failure handling, but this only happens within exit_to_user_mode_loop(), so arguably the hypervisor ioctl() code is long done when this happens. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251027084307.903622031@linutronix.de --- include/asm-generic/thread_info_tif.h | 3 +++ include/linux/irq-entry-common.h | 2 +- include/linux/rseq.h | 22 +++++++++++++++------- include/linux/rseq_entry.h | 32 +++++++++++++++++++++++++++++--- include/linux/thread_info.h | 5 +++++ 5 files changed, 53 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index ee3793e9b1a4..da1610a78f92 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -45,4 +45,7 @@ # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif +#define TIF_RSEQ 11 // Run RSEQ fast path +#define _TIF_RSEQ BIT(TIF_RSEQ) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index bc5d178e0b91..72e3f7a59469 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -30,7 +30,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | 
_TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/rseq.h b/include/linux/rseq.h index ded4baa34586..b5e4803c4ebe 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -42,7 +42,7 @@ static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *reg static inline void rseq_raise_notify_resume(struct task_struct *t) { - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + set_tsk_thread_flag(t, TIF_RSEQ); } /* Invoked from context switch to force evaluation on exit to user */ @@ -114,17 +114,25 @@ static inline void rseq_force_update(void) /* * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, - * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in - * that case just to do it eventually again before returning to user space, - * the entry resume_user_mode_work() invocation is ignored as the register - * argument is NULL. + * which clears TIF_NOTIFY_RESUME on architectures that don't use the + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. * - * After returning from guest mode, they have to invoke this function to - * re-raise TIF_NOTIFY_RESUME if necessary. + * To avoid updating user space RSEQ in that case just to do it eventually + * again before returning to user space, because __rseq_handle_slowpath() + * does nothing when invoked with NULL register state. + * + * After returning from guest mode, before exiting to userspace, hypervisors + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. */ static inline void rseq_virt_userspace_exit(void) { if (current->rseq.event.sched_switch) + /* + * The generic optimization for deferring RSEQ updates until the next + * exit relies on having a dedicated TIF_RSEQ. 
+ */ + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && + current->rseq.event.sched_switch) rseq_raise_notify_resume(current); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index 958a63eeb2d3..c92167ff8a7f 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -507,18 +507,44 @@ static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *reg return false; } -static __always_inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ +#ifdef CONFIG_HAVE_GENERIC_TIF_BITS +static __always_inline bool test_tif_rseq(unsigned long ti_work) { + return ti_work & _TIF_RSEQ; +} + +static __always_inline void clear_tif_rseq(void) +{ + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); + clear_thread_flag(TIF_RSEQ); +} +#else +static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } +static __always_inline void clear_tif_rseq(void) { } +#endif + +static __always_inline bool +rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + if (likely(!test_tif_rseq(ti_work))) + return false; + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { current->rseq.event.slowpath = true; set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); return true; } + + clear_tif_rseq(); return false; } #else /* CONFIG_GENERIC_ENTRY */ -static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) { return false; } +static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) +{ + return false; +} #endif /* !CONFIG_GENERIC_ENTRY */ static __always_inline void rseq_syscall_exit_to_user_mode(void) @@ -577,7 +603,7 @@ static inline void rseq_debug_syscall_return(struct pt_regs *regs) } #else /* CONFIG_RSEQ */ static inline void rseq_note_user_irq_entry(void) { } -static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs) +static inline bool 
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) { return false; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index dd925d84fa46..b40de9bab4b7 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -67,6 +67,11 @@ enum syscall_work_bit { #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED #endif +#ifndef TIF_RSEQ +# define TIF_RSEQ TIF_NOTIFY_RESUME +# define _TIF_RSEQ _TIF_NOTIFY_RESUME +#endif + #ifdef __KERNEL__ #ifndef arch_set_restart_data -- cgit v1.2.3 From 323d93f0432edb5415c79bd35e15e5754a76e486 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2025 12:02:12 +0100 Subject: cleanup: Always inline everything KASAN bloat caused cleanup helper functions to not get inlined: vmlinux.o: error: objtool: irqentry_exit+0x323: call to class_user_rw_access_destructor() with UACCESS enabled Force inline all the cleanup helpers like they already are on normal builds. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251031105435.GU4068168@noisy.programming.kicks-ass.net --- include/linux/cleanup.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..d1806ac5342c 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -208,7 +208,7 @@ */ #define DEFINE_FREE(_name, _type, _free) \ - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } #define __free(_name) __cleanup(__free_##_name) @@ -220,7 +220,7 @@ __val; \ }) -static inline __must_check +static __always_inline __must_check const volatile void * __must_check_fn(const volatile void *val) { return val; } @@ -274,16 +274,16 @@ const volatile void * __must_check_fn(const volatile void *val) #define DEFINE_CLASS(_name, _type, 
_exit, _init, _init_args...) \ typedef _type class_##_name##_t; \ -static inline void class_##_name##_destructor(_type *p) \ +static __always_inline void class_##_name##_destructor(_type *p) \ { _type _T = *p; _exit; } \ -static inline _type class_##_name##_constructor(_init_args) \ +static __always_inline _type class_##_name##_constructor(_init_args) \ { _type t = _init; return t; } #define EXTEND_CLASS(_name, ext, _init, _init_args...) \ typedef class_##_name##_t class_##_name##ext##_t; \ -static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ +static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ { class_##_name##_destructor(p); } \ -static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ +static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ { class_##_name##_t t = _init; return t; } #define CLASS(_name, var) \ @@ -347,7 +347,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond }) #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ { \ void *_ptr = (void *)(__force unsigned long)*(_exp); \ if (IS_ERR(_ptr)) { \ @@ -355,7 +355,7 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond } \ return _ptr; \ } \ - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ { \ long _rc = (__force unsigned long)*(_exp); \ if (!_rc) { \ @@ -384,9 +384,9 @@ static __maybe_unused const bool class_##_name##_is_conditional = _is_cond EXTEND_CLASS(_name, _ext, \ ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ class_##_name##_t _T) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * 
class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } /* @@ -466,7 +466,7 @@ typedef struct { \ __VA_ARGS__; \ } class_##_name##_t; \ \ -static inline void class_##_name##_destructor(class_##_name##_t *_T) \ +static __always_inline void class_##_name##_destructor(class_##_name##_t *_T) \ { \ if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ } \ @@ -474,7 +474,7 @@ static inline void class_##_name##_destructor(class_##_name##_t *_T) \ __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(_type *l) \ +static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ { \ class_##_name##_t _t = { .lock = l }, *_T = &_t; \ _lock; \ @@ -482,7 +482,7 @@ static inline class_##_name##_t class_##_name##_constructor(_type *l) \ } #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ -static inline class_##_name##_t class_##_name##_constructor(void) \ +static __always_inline class_##_name##_t class_##_name##_constructor(void) \ { \ class_##_name##_t _t = { .lock = (void*)1 }, \ *_T __maybe_unused = &_t; \ @@ -508,9 +508,9 @@ __DEFINE_LOCK_GUARD_0(_name, _lock) if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ _t; }), \ typeof_member(class_##_name##_t, lock) l) \ - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ { return class_##_name##_lock_ptr(_T); } \ - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ { return class_##_name##_lock_err(_T); } #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \ -- 
cgit v1.2.3 From 27cb3de7f43ac0263474d87a2c84d96f904d73e2 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Tue, 28 Oct 2025 12:32:44 +0800 Subject: net: add net cookie for net device trace events In a multi-network card or container environment, this is needed in order to differentiate between trace events relating to net devices that exist in different network namespaces and share the same name. for xmit_timeout trace events: [002] ..s1. 1838.311662: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3 [007] ..s1. 1839.335650: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=4100 [007] ..s1. 1844.455659: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3 [002] ..s1. 1850.087647: net_dev_xmit_timeout: dev=eth0 driver=virtio_net queue=10 net_cookie=3 Cc: Eran Ben Elisha Cc: Jiri Pirko Cc: Cong Wang Cc: Jakub Kicinski Cc: Eric Dumazet Cc: Simon Horman Cc: Paolo Abeni Suggested-by: Ido Schimmel Signed-off-by: Tonghao Zhang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20251028043244.82288-1-tonghao@bamaicloud.com Signed-off-by: Paolo Abeni --- include/trace/events/net.h | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/trace/events/net.h b/include/trace/events/net.h index d55162c12f90..fdd9ad474ce3 100644 --- a/include/trace/events/net.h +++ b/include/trace/events/net.h @@ -35,6 +35,7 @@ TRACE_EVENT(net_dev_start_xmit, __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) + __field( u64, net_cookie ) ), TP_fast_assign( @@ -57,16 +58,18 @@ TRACE_EVENT(net_dev_start_xmit, __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->net_cookie = dev_net(dev)->net_cookie; ), - TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u 
data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", + TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x net_cookie=%llu", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, - __entry->gso_size, __entry->gso_segs, __entry->gso_type) + __entry->gso_size, __entry->gso_segs, + __entry->gso_type, __entry->net_cookie) ); TRACE_EVENT(net_dev_xmit, @@ -83,17 +86,21 @@ TRACE_EVENT(net_dev_xmit, __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) + __field( u64, net_cookie ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; + __entry->net_cookie = dev_net(dev)->net_cookie; __assign_str(name); ), - TP_printk("dev=%s skbaddr=%p len=%u rc=%d", - __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) + TP_printk("dev=%s skbaddr=%p len=%u rc=%d net_cookie=%llu", + __get_str(name), __entry->skbaddr, + __entry->len, __entry->rc, + __entry->net_cookie) ); TRACE_EVENT(net_dev_xmit_timeout, @@ -107,16 +114,19 @@ TRACE_EVENT(net_dev_xmit_timeout, __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) + __field( u64, net_cookie ) ), TP_fast_assign( __assign_str(name); __assign_str(driver); __entry->queue_index = queue_index; + __entry->net_cookie = dev_net(dev)->net_cookie; ), - TP_printk("dev=%s driver=%s queue=%d", - __get_str(name), __get_str(driver), __entry->queue_index) + TP_printk("dev=%s driver=%s queue=%d net_cookie=%llu", + __get_str(name), 
__get_str(driver), + __entry->queue_index, __entry->net_cookie) ); DECLARE_EVENT_CLASS(net_dev_template, @@ -129,16 +139,20 @@ DECLARE_EVENT_CLASS(net_dev_template, __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) + __field( u64, net_cookie ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; + __entry->net_cookie = dev_net(skb->dev)->net_cookie; __assign_str(name); ), - TP_printk("dev=%s skbaddr=%p len=%u", - __get_str(name), __entry->skbaddr, __entry->len) + TP_printk("dev=%s skbaddr=%p len=%u net_cookie=%llu", + __get_str(name), __entry->skbaddr, + __entry->len, + __entry->net_cookie) ) DEFINE_EVENT(net_dev_template, net_dev_queue, @@ -188,6 +202,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) + __field( u64, net_cookie ) ), TP_fast_assign( @@ -214,16 +229,18 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->net_cookie = dev_net(skb->dev)->net_cookie; ), - TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", + TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x net_cookie=%llu", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, 
__entry->mac_header_valid, __entry->mac_header, - __entry->nr_frags, __entry->gso_size, __entry->gso_type) + __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->net_cookie) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, -- cgit v1.2.3 From 4e97bae1b412cd6ed8053b3d8a242122952985cc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:40 +0100 Subject: cleanup: fix scoped_class() This is a class, not a guard so why on earth is it checking for guard pointers or conditional lock acquisition? None of it makes any sense at all. I'm not sure what happened back then. Maybe I had a brief psychedelic period that I completely forgot about and spaced out into a zone where that initial macro implementation made any sense at all. Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-1-cb3ec8711a6a@kernel.org Fixes: 5c21c5f22d07 ("cleanup: add a scoped version of CLASS()") Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cleanup.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..19c7e475d3a4 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -290,15 +290,16 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor -#define scoped_class(_name, var, args) \ - for (CLASS(_name, var)(args); \ - __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \ - ({ goto _label; })) \ - if (0) { \ -_label: \ - break; \ +#define __scoped_class(_name, var, _label, args...) \ + for (CLASS(_name, var)(args); ; ({ goto _label; })) \ + if (0) { \ +_label: \ + break; \ } else +#define scoped_class(_name, var, args...) 
\ + __scoped_class(_name, var, __UNIQUE_ID(label), args) + /* * DEFINE_GUARD(name, type, lock, unlock): * trivial wrapper around DEFINE_CLASS() above specifically -- cgit v1.2.3 From 4c7ceeb62d3330b6fb2b549ae833a92c0f481f3e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:41 +0100 Subject: cred: add kernel_cred() helper Access kernel creds based off of init_task. This will let us avoid any direct access to init_cred. Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-2-cb3ec8711a6a@kernel.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 89ae50ad2ace..8ab3718184ad 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -20,6 +20,8 @@ struct cred; struct inode; +extern struct task_struct init_task; + /* * COW Supplementary groups list */ @@ -156,6 +158,11 @@ extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); +static inline const struct cred *kernel_cred(void) +{ + /* shut up sparse */ + return rcu_dereference_raw(init_task.cred); +} extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); -- cgit v1.2.3 From 40314c2818b700da695c9686348be7aef9e156a2 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:42 +0100 Subject: cred: make init_cred static There's zero need to expose struct init_cred. The very few places that need access can just go through init_task which is already exported. 
Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-3-cb3ec8711a6a@kernel.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/init_task.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index bccb3f1f6262..a6cb241ea00c 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -25,7 +25,6 @@ extern struct files_struct init_files; extern struct fs_struct init_fs; extern struct nsproxy init_nsproxy; -extern struct cred init_cred; #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #define INIT_PREV_CPUTIME(x) .prev_cputime = { \ -- cgit v1.2.3 From ae40e6c65791f47c76cc14d0cce2707fe6053f72 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 00:12:43 +0100 Subject: cred: add scoped_with_kernel_creds() Add a new cleanup class for override creds. We can make use of this in a bunch of places going forward. Based on this scoped_with_kernel_creds() that can be used to temporarily assume kernel credentials for specific tasks such as firmware loading, or coredump socket connections. At no point will the caller interact with the kernel credentials directly. 
Link: https://patch.msgid.link/20251103-work-creds-init_cred-v1-4-cb3ec8711a6a@kernel.org Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/cred.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 8ab3718184ad..be2cd07b174c 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -187,6 +187,14 @@ static inline const struct cred *revert_creds(const struct cred *revert_cred) return rcu_replace_pointer(current->cred, revert_cred, 1); } +DEFINE_CLASS(override_creds, + const struct cred *, + revert_creds(_T), + override_creds(override_cred), const struct cred *override_cred) + +#define scoped_with_kernel_creds() \ + scoped_class(override_creds, __UNIQUE_ID(cred), kernel_cred()) + /** * get_cred_many - Get references on a set of credentials * @cred: The credentials to reference -- cgit v1.2.3 From 019e52e8d324d568e71730946beb11e7b275ff08 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 12:26:49 +0100 Subject: cred: add scoped_with_creds() guards and implement scoped_with_kernel_creds() on top of it. 
Link: https://patch.msgid.link/20251103-work-creds-guards-simple-v1-1-a3e156839e7f@kernel.org Reviewed-by: Amir Goldstein Signed-off-by: Christian Brauner --- include/linux/cred.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index be2cd07b174c..6ea2d81a740b 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -192,8 +192,10 @@ DEFINE_CLASS(override_creds, revert_creds(_T), override_creds(override_cred), const struct cred *override_cred) -#define scoped_with_kernel_creds() \ - scoped_class(override_creds, __UNIQUE_ID(cred), kernel_cred()) +#define scoped_with_creds(cred) \ + scoped_class(override_creds, __UNIQUE_ID(label), cred) + +#define scoped_with_kernel_creds() scoped_with_creds(kernel_cred()) /** * get_cred_many - Get references on a set of credentials -- cgit v1.2.3 From c8ad3098e1272444b6c75910d6196a36f5c8bc17 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 15:57:27 +0100 Subject: cred: add prepare credential guard A lot of code uses the following pattern: * prepare new credentials * modify them for their use-case * drop them Support that easier with the new guard infrastructure. 
Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-1-b447b82f2c9b@kernel.org Signed-off-by: Christian Brauner --- include/linux/cred.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/cred.h b/include/linux/cred.h index 6ea2d81a740b..343a140a6ba2 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -280,6 +280,11 @@ static inline void put_cred(const struct cred *cred) put_cred_many(cred, 1); } +DEFINE_CLASS(prepare_creds, + struct cred *, + if (_T) put_cred(_T), + prepare_creds(), void) + DEFINE_FREE(put_cred, struct cred *, if (!IS_ERR_OR_NULL(_T)) put_cred(_T)) /** -- cgit v1.2.3 From ecaba8b7990d8c6d8ba097cd4499b3b92d9df6ea Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Tue, 4 Nov 2025 12:13:12 +0800 Subject: ASoC: tas2781: Add tas5822 support TAS5822 has on-chip DSP without current/voltage feedback. Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20251104041314.792-1-baojun.xu@ti.com Signed-off-by: Mark Brown --- include/sound/tas2781.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index 0fbcdb15c74b..c3b4c43dd2bf 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -122,6 +122,7 @@ enum audio_device { TAS2781, TAS5802, TAS5815, + TAS5822, TAS5825, TAS5827, TAS5828, -- cgit v1.2.3 From 30ed05adca4a05c50594384cff18910858dd1d35 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Thu, 30 Oct 2025 08:06:46 +0800 Subject: xsk: use a smaller new lock for shared pool case - Split cq_lock into two smaller locks: cq_prod_lock and cq_cached_prod_lock - Avoid disabling/enabling interrupts in the hot xmit path In either xsk_cq_cancel_locked() or xsk_cq_reserve_locked() function, the race condition is only between multiple xsks sharing the same pool. 
They are all in the process context rather than interrupt context, so now the small lock named cq_cached_prod_lock can be used without handling interrupts. While cq_cached_prod_lock ensures the exclusive modification of @cached_prod, cq_prod_lock in xsk_cq_submit_addr_locked() only cares about @producer and corresponding @desc. Neither of them necessarily has to be consistent with @cached_prod protected by cq_cached_prod_lock. That's the reason why the previous big lock can be split into two smaller ones. Please note that the SPSC rule is all about the global state of producer and consumer that can affect both layers instead of local or cached ones. Frequently disabling and enabling interrupts is very time consuming in some cases, especially at a per-descriptor granularity, which now can be avoided after this optimization, even when the pool is shared by multiple xsks. With this patch, the performance number[1] could go from 1,872,565 pps to 1,961,009 pps. It's a minor rise of around 5%. [1]: taskset -c 1 ./xdpsock -i enp2s0f1 -q 0 -t -S -s 64 Signed-off-by: Jason Xing Acked-by: Maciej Fijalkowski Link: https://patch.msgid.link/20251030000646.18859-3-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni --- include/net/xsk_buff_pool.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index cac56e6b0869..92a2358c6ce3 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -85,11 +85,16 @@ struct xsk_buff_pool { bool unaligned; bool tx_sw_csum; void *addrs; - /* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect: - * NAPI TX thread and sendmsg error paths in the SKB destructor callback and when - * sockets share a single cq when the same netdev and queue id is shared. + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: NAPI TX thread and sendmsg error paths in the SKB + * destructor callback.
*/ - spinlock_t cq_lock; + spinlock_t cq_prod_lock; + /* Mutual exclusion of the completion ring in the SKB mode. + * Protect: when sockets share a single cq when the same netdev + * and queue id is shared. + */ + spinlock_t cq_cached_prod_lock; struct xdp_buff_xsk *free_heads[]; }; -- cgit v1.2.3 From ec7f31b2a2d3bf6b9e4d4b8cd156587f1d0607d5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Nov 2025 05:16:45 -0500 Subject: block: make bio auto-integrity deadlock safe The current block layer automatic integrity protection allocates the actual integrity buffer, which has three problems: - because it happens at the bottom of the I/O stack and doesn't use a mempool it can deadlock under load - because the data size in a bio is almost unbounded when using large folios it can relatively easily exceed the maximum kmalloc size - even when it does not exceed the maximum kmalloc size, it could exceed the maximum segment size of the device Fix this by limiting the I/O size so that we can allocate at least a 2MiB integrity buffer, i.e. 128MiB for 8 byte PI and 512 byte integrity intervals, and create a mempool as a last resort for this maximum size, mirroring the scheme used for bvecs. As a nice upside none of this can fail now, so we remove the error handling and open code the trivial addition of the bip vec. The new allocation helpers sit outside of bio-integrity-auto.c because I plan to reuse them for file system based PI in the near future. Fixes: 7ba1ba12eeef ("block: Block layer data integrity support") Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Anuj Gupta Reviewed-by: Kanchan Joshi Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 6 ++++++ include/linux/blk-integrity.h | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 851254f36eb3..3d05296a5afe 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -14,6 +14,8 @@ enum bip_flags { BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ BIP_P2P_DMA = 1 << 8, /* using P2P address */ + + BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; struct bio_integrity_payload { @@ -140,4 +142,8 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page, return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ + +void bio_integrity_alloc_buf(struct bio *bio, bool zero_buffer); +void bio_integrity_free_buf(struct bio_integrity_payload *bip); + #endif /* _LINUX_BIO_INTEGRITY_H */ diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index b659373788f6..c2030fd8ba0a 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -8,6 +8,11 @@ struct request; +/* + * Maximum contiguous integrity buffer allocation. + */ +#define BLK_INTEGRITY_MAX_SIZE SZ_2M + enum blk_integrity_flags { BLK_INTEGRITY_NOVERIFY = 1 << 0, BLK_INTEGRITY_NOGENERATE = 1 << 1, -- cgit v1.2.3 From 1b6aa81c85621d6b55099906585ff09a477203b8 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 3 Nov 2025 11:50:15 +0000 Subject: net: stmmac: add support for configuring the phy_intf_sel inputs When dwmac is synthesised with support for multiple PHY interfaces, the core provides phy_intf_sel inputs, sampled on reset, to configure the PHY facing interface. 
Use stmmac_get_phy_intf_sel() in core code to determine the dwmac phy_intf_sel input value, and provide a new platform method called with this value just before we issue a soft reset to the dwmac core. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vFt4h-0000000Chos-3wxX@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 151c81c560c8..48e9f1d4e17e 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -250,6 +250,7 @@ struct plat_stmmacenet_data { struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; void (*get_interfaces)(struct stmmac_priv *priv, void *bsp_priv, unsigned long *interfaces); + int (*set_phy_intf_sel)(void *priv, u8 phy_intf_sel); int (*set_clk_tx_rate)(void *priv, struct clk *clk_tx_i, phy_interface_t interface, int speed); void (*fix_mac_speed)(void *priv, int speed, unsigned int mode); -- cgit v1.2.3 From f88191c7f3618405f1fc5c331a94ebfe601c5b08 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Sat, 1 Nov 2025 18:56:51 +0100 Subject: mptcp: pm: in-kernel: record fullmesh endp nb Instead of iterating over all endpoints, under RCU read lock, just to check if one of them has the fullmesh flag, we can keep a counter of fullmesh endpoints, similar to what is done with the other flags. This counter is now checked, before iterating over all endpoints. Similar to the other counters, this new one is also exposed. A userspace app can then know when it is being used in a fullmesh mode, with potentially (too) many subflows. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251101-net-next-mptcp-fm-endp-nb-bind-v1-1-b4166772d6bb@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index 87cfab874e24..04eea6d1d0a9 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -70,7 +70,8 @@ struct mptcp_info { __u64 mptcpi_bytes_acked; __u8 mptcpi_subflows_total; __u8 mptcpi_endp_laminar_max; - __u8 reserved[2]; + __u8 mptcpi_endp_fullmesh_max; + __u8 reserved; __u32 mptcpi_last_data_sent; __u32 mptcpi_last_data_recv; __u32 mptcpi_last_ack_recv; -- cgit v1.2.3 From 617a0dd24ef2b4e6240df48b1fbac1c3ebfa9282 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 3 Nov 2025 23:26:49 +0100 Subject: net: phy: make phy_device members pause and asym_pause bitfield bits We can reduce the size of struct phy_device a little by switching the type of members pause and asym_pause from int to a single bit. As C99 is supported now, we can use type bool for the bitfield members, which provides us with the benefit of the usual implicit bool conversions. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/764e9a31-b40b-4dc9-b808-118192a16d87@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index e3474f03cbc1..d145a200ea21 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -666,6 +666,8 @@ struct phy_device { /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; + bool pause:1; + bool asym_pause:1; /* Interrupts are enabled */ unsigned interrupts:1; @@ -690,8 +692,6 @@ struct phy_device { int speed; int duplex; int port; - int pause; - int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; -- cgit v1.2.3 From c9445e3c087656e01d0160a48f90389856baf368 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Oct 2025 22:41:19 +0100 Subject: net: phy: fixed_phy: add helper fixed_phy_register_100fd In a few places a 100FD fixed PHY is used. Create a helper so that users don't have to define the struct fixed_phy_status. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/bf564b19-e9bc-4896-aeae-9f721cc4fecd@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index d17ff750c708..08275ef64147 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -20,6 +20,7 @@ extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); void fixed_phy_add(const struct fixed_phy_status *status); struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); +struct phy_device *fixed_phy_register_100fd(void); extern void fixed_phy_unregister(struct phy_device *phydev); extern int fixed_phy_set_link_update(struct phy_device *phydev, @@ -34,6 +35,11 @@ fixed_phy_register(const struct fixed_phy_status *status, return ERR_PTR(-ENODEV); } +static inline struct phy_device *fixed_phy_register_100fd(void) +{ + return ERR_PTR(-ENODEV); +} + static inline void fixed_phy_unregister(struct phy_device *phydev) { } -- cgit v1.2.3 From 5de9ea1c50f061892625388880e83fdc50a4ef66 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 30 Oct 2025 22:46:32 +0100 Subject: net: phy: fixed_phy: remove fixed_phy_add fixed_phy_add() has a number of problems/disadvantages: - It uses phy address 0 w/o checking whether a fixed phy with this address exists already. - A subsequent call to fixed_phy_register() would also use phy address 0, because fixed_phy_add() doesn't mark it as used. - fixed_phy_add() is used from platform code, therefore requires that fixed_phy code is built-in. Now that for the only two users (coldfire/5272 and bcm47xx) fixed_phy creation has been moved to the respective ethernet driver (fec, b44), we can remove fixed_phy_add(). 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/bee046a1-1e77-4057-8b04-fdb2a1bbbd08@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 08275ef64147..8bade999831c 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -17,7 +17,6 @@ struct net_device; #if IS_ENABLED(CONFIG_FIXED_PHY) extern int fixed_phy_change_carrier(struct net_device *dev, bool new_carrier); -void fixed_phy_add(const struct fixed_phy_status *status); struct phy_device *fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np); struct phy_device *fixed_phy_register_100fd(void); @@ -27,7 +26,6 @@ extern int fixed_phy_set_link_update(struct phy_device *phydev, int (*link_update)(struct net_device *, struct fixed_phy_status *)); #else -static inline void fixed_phy_add(const struct fixed_phy_status *status) {} static inline struct phy_device * fixed_phy_register(const struct fixed_phy_status *status, struct device_node *np) -- cgit v1.2.3 From bf33247a90d3e85d53a9b55bb276b725456ff0bf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:09 -0800 Subject: net: Add struct sockaddr_unsized for sockaddr of unknown length Add flexible sockaddr structure to support addresses longer than the traditional 14-byte struct sockaddr::sa_data limitation without requiring the full 128-byte sa_data of struct sockaddr_storage. This allows the network APIs to pass around a pointer to an object that isn't lying to the compiler about how big it is, but must be accompanied by its actual size as an additional parameter. 
It's possible we may want to migrate to including the size with the struct in the future, e.g.: struct sockaddr_unsized { u16 sa_data_len; u16 sa_family; u8 sa_data[] __counted_by(sa_data_len); }; Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-1-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 3b262487ec06..7b1a01be29da 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -40,6 +40,23 @@ struct sockaddr { }; }; +/** + * struct sockaddr_unsized - Unspecified size sockaddr for callbacks + * @sa_family: Address family (AF_UNIX, AF_INET, AF_INET6, etc.) + * @sa_data: Flexible array for address data + * + * This structure is designed for callback interfaces where the + * total size is known via the sockaddr_len parameter. Unlike struct + * sockaddr which has a fixed 14-byte sa_data limit or struct + * sockaddr_storage which has a fixed 128-byte sa_data limit, this + * structure can accommodate addresses of any size, but must be used + * carefully. + */ +struct sockaddr_unsized { + __kernel_sa_family_t sa_family; /* address family, AF_xxx */ + char sa_data[]; /* flexible address data */ +}; + struct linger { int l_onoff; /* Linger active */ int l_linger; /* How long to linger for */ -- cgit v1.2.3 From 0e50474fa514822e9d990874e554bf8043a201d7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:10 -0800 Subject: net: Convert proto_ops bind() callbacks to use sockaddr_unsized Update all struct proto_ops bind() callback function prototypes from "struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the compiler about object sizes. Calls into struct proto handlers gain casts that will be removed in the struct proto conversion patch. No binary changes expected. 
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-2-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/net.h | 4 ++-- include/net/inet_common.h | 2 +- include/net/ipv6.h | 2 +- include/net/sock.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index ec09620f40f7..0e316f063113 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -163,7 +163,7 @@ struct proto_ops { struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, - struct sockaddr *myaddr, + struct sockaddr_unsized *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, struct sockaddr *vaddr, @@ -345,7 +345,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); -int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen); +int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c17a6585d0b0..1666cf6f539e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -42,7 +42,7 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. 
*/ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2ccdf85f34f1..2188bad9a687 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1208,7 +1208,7 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct socket *sock); -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/include/net/sock.h b/include/net/sock.h index c7e58b8e8a90..acbb78c96d69 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1920,7 +1920,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. */ -int sock_no_bind(struct socket *, struct sockaddr *, int); +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); -- cgit v1.2.3 From 85cb0757d7e1f9370a8b52a8b8144c37941cba0a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:11 -0800 Subject: net: Convert proto_ops connect() callbacks to use sockaddr_unsized Update all struct proto_ops connect() callback function prototypes from "struct sockaddr *" to "struct sockaddr_unsized *" to avoid lying to the compiler about object sizes. Calls into struct proto handlers gain casts that will be removed in the struct proto conversion patch. No binary changes expected. 
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-3-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 6 +++--- include/linux/net.h | 4 ++-- include/net/inet_common.h | 6 +++--- include/net/sctp/sctp.h | 2 +- include/net/sock.h | 2 +- include/net/vsock_addr.h | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index aedf573bdb42..a7fb4f46974f 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -238,7 +238,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ atype, NULL, NULL); \ __ret; \ }) @@ -248,7 +248,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ atype, t_ctx, NULL); \ release_sock(sk); \ } \ @@ -266,7 +266,7 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ atype, NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ diff --git a/include/linux/net.h b/include/linux/net.h index 0e316f063113..db6bc997ca5b 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -166,7 +166,7 @@ struct proto_ops { struct sockaddr_unsized *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, - struct sockaddr *vaddr, + struct sockaddr_unsized *vaddr, int sockaddr_len, 
int flags); int (*socketpair)(struct socket *sock1, struct socket *sock2); @@ -348,7 +348,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); -int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, +int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 1666cf6f539e..ebafd96912bb 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -23,11 +23,11 @@ struct sockaddr; struct socket; int inet_release(struct socket *sock); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg); -int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index bb4b80c12541..58242b37b47a 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -85,7 +85,7 @@ void sctp_udp_sock_stop(struct net *net); /* * sctp/socket.c */ -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int 
flags); int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb); int sctp_inet_listen(struct socket *sock, int backlog); diff --git a/include/net/sock.h b/include/net/sock.h index acbb78c96d69..589fbce77217 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1921,7 +1921,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, * does not implement a particular function. */ int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len); -int sock_no_connect(struct socket *, struct sockaddr *, int, int); +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *); int sock_no_getname(struct socket *, struct sockaddr *, int); diff --git a/include/net/vsock_addr.h b/include/net/vsock_addr.h index cf8cc140d68d..c3f4cc206198 100644 --- a/include/net/vsock_addr.h +++ b/include/net/vsock_addr.h @@ -16,7 +16,7 @@ bool vsock_addr_bound(const struct sockaddr_vm *addr); void vsock_addr_unbind(struct sockaddr_vm *addr); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other); -int vsock_addr_cast(const struct sockaddr *addr, size_t len, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr); #endif -- cgit v1.2.3 From 3d39d34146f2b38127eadf36a0513e130eaa7eec Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:12 -0800 Subject: net: Remove struct sockaddr from net.h Now that struct sockaddr is no longer used by net.h, remove it. 
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-4-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/net.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index db6bc997ca5b..f58b38ab37f8 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -148,7 +148,6 @@ typedef struct { struct vm_area_struct; struct page; -struct sockaddr; struct msghdr; struct module; struct sk_buff; -- cgit v1.2.3 From 449f68f8fffa2c41fc265730bd05a3c4947916c1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:13 -0800 Subject: net: Convert proto callbacks from sockaddr to sockaddr_unsized Convert struct proto pre_connect(), connect(), bind(), and bind_add() callback function prototypes from struct sockaddr to struct sockaddr_unsized. This does not change per-implementation use of sockaddr for passing around an arbitrarily sized sockaddr struct. Those will be addressed in future patches. Additionally removes the no longer referenced struct sockaddr from include/net/inet_common.h. No binary changes expected. 
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-5-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/net/inet_common.h | 5 ++--- include/net/ip.h | 4 ++-- include/net/ipv6.h | 8 ++++---- include/net/ipv6_stubs.h | 2 +- include/net/ping.h | 2 +- include/net/sock.h | 10 +++++----- include/net/tcp.h | 2 +- include/net/udp.h | 2 +- 8 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index ebafd96912bb..5dd2bf24449e 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -19,7 +19,6 @@ struct msghdr; struct net; struct page; struct sock; -struct sockaddr; struct socket; int inet_release(struct socket *sock); @@ -43,7 +42,7 @@ int inet_listen(struct socket *sock, int backlog); int __inet_listen_sk(struct sock *sk, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); -int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ @@ -52,7 +51,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); #define BIND_FROM_BPF (1 << 2) /* Skip CAP_NET_BIND_SERVICE check. 
*/ #define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/include/net/ip.h b/include/net/ip.h index 380afb691c41..69d5cef46004 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -261,8 +261,8 @@ static inline u8 ip_sendmsg_scope(const struct inet_sock *inet, } /* datagram.c */ -int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); +int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ip4_datagram_release_cb(struct sock *sk); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 2188bad9a687..74fbf1ad8065 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1188,10 +1188,10 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); -int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, +int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); -int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); -int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, +int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); @@ -1209,7 +1209,7 @@ void inet6_cleanup_sock(struct sock *sk); void inet6_sock_destruct(struct sock *sk); int inet6_release(struct 
socket *sock); int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); -int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index 8a3465c8c2c5..d3013e721b14 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -80,7 +80,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { - int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + int (*inet6_bind)(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags); struct sock *(*udp6_lib_lookup)(const struct net *net, const struct in6_addr *saddr, __be16 sport, diff --git a/include/net/ping.h b/include/net/ping.h index 9634b8800814..05bfd594a64c 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -58,7 +58,7 @@ void ping_unhash(struct sock *sk); int ping_init_sock(struct sock *sk); void ping_close(struct sock *sk, long timeout); -int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); void ping_err(struct sk_buff *skb, int offset, u32 info); int ping_getfrag(void *from, char *to, int offset, int fraglen, int odd, struct sk_buff *); diff --git a/include/net/sock.h b/include/net/sock.h index 589fbce77217..a5f36ea9d46f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1274,10 +1274,10 @@ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int addr_len); int (*connect)(struct sock *sk, - struct sockaddr *uaddr, + struct 
sockaddr_unsized *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); @@ -1306,9 +1306,9 @@ struct proto { size_t len, int flags, int *addr_len); void (*splice_eof)(struct socket *sock); int (*bind)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*bind_add)(struct sock *sk, - struct sockaddr *addr, int addr_len); + struct sockaddr_unsized *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -3105,7 +3105,7 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len); int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, diff --git a/include/net/tcp.h b/include/net/tcp.h index 4fd6d8d1230d..0aa1f07d036a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -535,7 +535,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, diff --git a/include/net/udp.h b/include/net/udp.h index cffedb3e40f2..a061d1b22ddc 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -424,7 +424,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, int *karg); int udp_init_sock(struct sock *sk); -int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +int udp_pre_connect(struct sock *sk, struct 
sockaddr_unsized *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); -- cgit v1.2.3 From 8116d803e7f8f20bf00ce23ff8bd0baab41e1635 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:14 -0800 Subject: bpf: Convert cgroup sockaddr filters to use sockaddr_unsized consistently Update BPF cgroup sockaddr filtering infrastructure to use sockaddr_unsized consistently throughout the call chain, removing redundant explicit casts from callers. No binary changes expected. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-6-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/bpf-cgroup.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index a7fb4f46974f..d1eb5c7729cb 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -120,7 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, - struct sockaddr *uaddr, + struct sockaddr_unsized *uaddr, int *uaddrlen, enum cgroup_bpf_attach_type atype, void *t_ctx, @@ -238,8 +238,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, ({ \ int __ret = 0; \ if (cgroup_bpf_enabled(atype)) \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ - atype, NULL, NULL); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, \ + (struct sockaddr_unsized *)uaddr, uaddrlen, \ + atype, NULL, NULL); \ __ret; \ }) @@ -248,8 +249,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ - atype, t_ctx, NULL); \ + __ret = 
__cgroup_bpf_run_filter_sock_addr(sk, \ + (struct sockaddr_unsized *)uaddr, uaddrlen, \ + atype, t_ctx, NULL); \ release_sock(sk); \ } \ __ret; \ @@ -266,8 +268,9 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, int __ret = 0; \ if (cgroup_bpf_enabled(atype)) { \ lock_sock(sk); \ - __ret = __cgroup_bpf_run_filter_sock_addr(sk, (struct sockaddr *)uaddr, uaddrlen, \ - atype, NULL, &__flags); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, \ + (struct sockaddr_unsized *)uaddr, uaddrlen, \ + atype, NULL, &__flags); \ release_sock(sk); \ if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ -- cgit v1.2.3 From c1a799eef62b8c3298a4d82753fe0f2a448e5e4f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:15 -0800 Subject: bpf: Convert bpf_sock_addr_kern "uaddr" to sockaddr_unsized Change struct bpf_sock_addr_kern to use sockaddr_unsized for the "uaddr" field instead of sockaddr. This improves type safety in the BPF cgroup socket address filtering code. The casting in __cgroup_bpf_run_filter_sock_addr() is updated to match the new type, removing an unnecessary cast in the initialization and updating the conditional assignment to use the appropriate sockaddr_unsized cast. Additionally rename the "unspec" variable to "storage" to better align with its usage. No binary changes expected. 
Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-7-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/filter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index f5c859b8131a..e116de7edc58 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1515,7 +1515,7 @@ static inline int bpf_tell_extensions(void) struct bpf_sock_addr_kern { struct sock *sk; - struct sockaddr *uaddr; + struct sockaddr_unsized *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time -- cgit v1.2.3 From 2b5e9f9b7e414c5eeb20dd7a7b80816ff55cf57b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 3 Nov 2025 16:26:16 -0800 Subject: net: Convert struct sockaddr to fixed-size "sa_data[14]" Revert struct sockaddr from flexible array to fixed 14-byte "sa_data", to solve over 36,000 -Wflex-array-member-not-at-end warnings, since struct sockaddr is embedded within many network structs. With socket/proto sockaddr-based internal APIs switched to use struct sockaddr_unsized, there should be no more uses of struct sockaddr that depend on reading beyond the end of struct sockaddr::sa_data that might trigger bounds checking. Comparing an x86_64 "allyesconfig" vmlinux build before and after this patch showed no new "ud1" instructions from CONFIG_UBSAN_BOUNDS nor any new "field-spanning" memcpy CONFIG_FORTIFY_SOURCE instrumentations. Cc: Gustavo A. R. 
Silva Signed-off-by: Kees Cook Link: https://patch.msgid.link/20251104002617.2752303-8-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 7b1a01be29da..944027f9765e 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -32,12 +32,10 @@ typedef __kernel_sa_family_t sa_family_t; * 1003.1g requires sa_family_t and that sa_data is char. */ +/* Deprecated for in-kernel use. Use struct sockaddr_unsized instead. */ struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - union { - char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ - DECLARE_FLEX_ARRAY(char, sa_data); - }; + char sa_data[14]; /* 14 bytes of protocol address */ }; /** -- cgit v1.2.3 From 7c5b184db7145fd417785377337bd15c4fe1d0f4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:29:59 -0400 Subject: genpt: Generic Page Table base API The generic API is intended to be separated from the implementation of page table algorithms. It contains only accessors for walking and manipulating the table and helpers that are useful for building an implementation. Memory management is not in the generic API, but part of the implementation. Using a multi-compilation approach the implementation module would include headers in this order: common.h defs_FMT.h pt_defs.h FMT.h pt_common.h IMPLEMENTATION.h Where each compilation unit would have a combination of FMT and IMPLEMENTATION to produce a per-format per-implementation module. The API is designed so that the format headers have minimal logic, and default implementations are provided if the format doesn't include one. Generally formats provide their code via an inline function using the pattern: static inline FMTpt_XX(..) 
{} #define pt_XX FMTpt_XX The common code then enforces a function signature so that there is no drift in function arguments, or accidental polymorphic functions (as has been slightly troublesome in mm). Use of function-like #defines is avoided in the format even though many of the functions are small enough. Provide kdocs for the API surface. This is enough to implement the 8 initial format variations with all of their features: * Entries comprised of contiguous blocks of IO PTEs for larger page sizes (AMDv1, ARMv8) * Multi-level tables, up to 6 levels. Runtime selected top level * The size of the top table level can be selected at runtime (ARM's concatenated tables) * The number of levels in the table can optionally increase dynamically during map (AMDv1) * Optional leaf entries at any level * 32 bit/64 bit virtual and output addresses, using every bit * Sign extended addressing (x86) * Dirty tracking A basic simple format takes about 200 lines to declare the required inline functions. 
Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 135 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 include/linux/generic_pt/common.h (limited to 'include') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h new file mode 100644 index 000000000000..e69a75511313 --- /dev/null +++ b/include/linux/generic_pt/common.h @@ -0,0 +1,135 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_COMMON_H +#define __GENERIC_PT_COMMON_H + +#include +#include +#include + +/** + * DOC: Generic Radix Page Table + * + * Generic Radix Page Table is a set of functions and helpers to efficiently + * parse radix style page tables typically seen in HW implementations. The + * interface is built to deliver similar code generation as the mm's pte/pmd/etc + * system by fully inlining the exact code required to handle each table level. + * + * Like the mm subsystem each format contributes its parsing implementation + * under common names and the common code implements the required algorithms. + * + * The system is divided into three logical levels: + * + * - The page table format and its manipulation functions + * - Generic helpers to give a consistent API regardless of underlying format + * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM) + * + * Multiple implementations are supported. The intention is to have the generic + * format code be re-usable for whatever specialized implementation is required. + * The generic code is solely about the format of the radix tree; it does not + * include memory allocation or higher level decisions that are left for the + * implementation. 
+ * + * The generic framework supports a superset of functions across many HW + * implementations: + * + * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes + * - Multi-level tables, up to 6 levels. Runtime selected top level + * - Runtime variable table level size (ARM's concatenated tables) + * - Expandable top level allowing dynamic sizing of table levels + * - Optional leaf entries at any level + * - 32-bit/64-bit virtual and output addresses, using every address bit + * - Dirty tracking + * - Sign extended addressing + */ + +/** + * struct pt_common - struct for all page table implementations + */ +struct pt_common { + /** + * @top_of_table: Encodes the table top pointer and the top level in a + * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower + * bits of the aligned table pointer are used for the level. + */ + uintptr_t top_of_table; + /** + * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits + * must be zero. This may be less than what the page table format + * supports, but must not be more. + */ + u8 max_oasz_lg2; + /** + * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits + * are 0 or 1 depending on pt_full_va_prefix(). This may be less than + * what the page table format supports, but must not be more. When + * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability. + */ + u8 max_vasz_lg2; + /** + * @features: Bitmap of `enum pt_features` + */ + unsigned int features; +}; + +/* Encoding parameters for top_of_table */ +enum { + PT_TOP_LEVEL_BITS = 3, + PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0), +}; + +/** + * enum pt_features - Features turned on in the table. Each symbol is a bit + * position. + */ +enum pt_features { + /** + * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to + * PT_VADDR_MAX. + */ + PT_FEAT_FULL_VA, + /** + * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased + * dynamically during map. 
This requires HW support for atomically + * setting both the table top pointer and the starting table level. + */ + PT_FEAT_DYNAMIC_TOP, + /** + * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign + * extends up to the full pt_vaddr_t. This divides the page table into + * three VA ranges:: + * + * 0 -> 2^N - 1 Lower + * 2^N -> (MAX - 2^N - 1) Non-Canonical + * MAX - 2^N -> MAX Upper + * + * In this mode pt_common::max_vasz_lg2 includes the sign bit and the + * upper bits that don't fall within the translation are just validated. + * + * If not set there is no sign extension and valid VA goes from 0 to 2^N + * - 1. + */ + PT_FEAT_SIGN_EXTEND, + /** + * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA + * ranges which will clean out any walk cache or any IOPTE fully + * contained by the range. The optimization objective is to minimize the + * number of flushes even if ranges include IOVA gaps that do not need + * to be flushed. + */ + PT_FEAT_FLUSH_RANGE, + /** + * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that + * the optimization objective is to only flush IOVA that has been + * changed. This mode is suitable for cases like hypervisor shadowing + * where flushing unchanged ranges may cause the hypervisor to reparse + * significant amount of page table. + */ + PT_FEAT_FLUSH_RANGE_NO_GAPS, + /* private: */ + PT_FEAT_FMT_START, +}; + +#endif -- cgit v1.2.3 From cdb39d9185795b744dab4d4d782f2fe3f5eca10c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:01 -0400 Subject: iommupt: Add the basic structure of the iommu implementation The existing IOMMU page table implementations duplicate all of the working algorithms for each format. By using the generic page table API a single C version of the IOMMU algorithms can be created and re-used for all of the different formats used in the drivers. 
The implementation will provide a single C version of the iommu domain operations: iova_to_phys, map, unmap, and read_and_clear_dirty. Further, adding new algorithms and techniques becomes easy to do across the entire fleet of drivers and formats. The C functions are drop in compatible with the existing iommu_domain_ops using the IOMMU_PT_DOMAIN_OPS() macro. Each per-format implementation compilation unit will produce exported symbols following the pattern pt_iommu_FMT_map_pages() which the macro directly maps to the iommu_domain_ops members. This avoids the additional function pointer indirection like io-pgtable has. The top level struct used by the drivers is pt_iommu_table_FMT. It contains the other structs to allow container_of() to move between the driver, iommu page table, generic page table, and generic format layers. struct pt_iommu_table_amdv1 { struct pt_iommu { struct iommu_domain domain; } iommu; struct pt_amdv1 { struct pt_common common; } amdpt; }; The driver is expected to union the pt_iommu_table_FMT with its own existing domain struct: struct driver_domain { union { struct iommu_domain domain; struct pt_iommu_table_amdv1 amdv1; }; }; PT_IOMMU_CHECK_DOMAIN(struct driver_domain, amdv1, domain); To create an alias to avoid renaming 'domain' in a lot of driver code. This allows all the layers to access all the necessary functions to implement their different roles with no change to any of the existing iommu core code. Implement the basic starting point: pt_iommu_init(), get_info() and deinit(). 
Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 150 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 include/linux/generic_pt/iommu.h (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h new file mode 100644 index 000000000000..defa96abc497 --- /dev/null +++ b/include/linux/generic_pt/iommu.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __GENERIC_PT_IOMMU_H +#define __GENERIC_PT_IOMMU_H + +#include +#include +#include + +struct pt_iommu_ops; + +/** + * DOC: IOMMU Radix Page Table + * + * The IOMMU implementation of the Generic Page Table provides an ops struct + * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and + * the generic map/unmap interface. + * + * This interface uses a caller provided locking approach. The caller must have + * a VA range lock concept that prevents concurrent threads from calling ops on + * the same VA. Generally the range lock must be at least as large as a single + * map call. + */ + +/** + * struct pt_iommu - Base structure for IOMMU page tables + * + * The format-specific struct will include this as the first member. + */ +struct pt_iommu { + /** + * @domain: The core IOMMU domain. The driver should use a union to + * overlay this memory with its previously existing domain struct to + * create an alias. + */ + struct iommu_domain domain; + + /** + * @ops: Function pointers to access the API + */ + const struct pt_iommu_ops *ops; + + /** + * @nid: Node ID to use for table memory allocations. The IOMMU driver + * may want to set the NID to the device's NID, if there are multiple + * table walkers. 
+ */ + int nid; +}; + +/** + * struct pt_iommu_info - Details about the IOMMU page table + * + * Returned from pt_iommu_ops->get_info() + */ +struct pt_iommu_info { + /** + * @pgsize_bitmap: A bitmask where each set bit indicates + * a page size that can be natively stored in the page table. + */ + u64 pgsize_bitmap; +}; + +struct pt_iommu_ops { + /** + * @get_info: Return the pt_iommu_info structure + * @iommu_table: Table to query + * + * Return some basic static information about the page table. + */ + void (*get_info)(struct pt_iommu *iommu_table, + struct pt_iommu_info *info); + + /** + * @deinit: Undo a format specific init operation + * @iommu_table: Table to destroy + * + * Release all of the memory. The caller must have already removed the + * table from all HW access and all caches. + */ + void (*deinit)(struct pt_iommu *iommu_table); +}; + +static inline void pt_iommu_deinit(struct pt_iommu *iommu_table) +{ + /* + * It is safe to call pt_iommu_deinit() before an init, or if init + * fails. The ops pointer will only become non-NULL if deinit needs to be + * run. + */ + if (iommu_table->ops) + iommu_table->ops->deinit(iommu_table); +} + +/** + * struct pt_iommu_cfg - Common configuration values for all formats + */ +struct pt_iommu_cfg { + /** + * @features: Features required. Only these features will be turned on. + * The feature list should reflect what the IOMMU HW is capable of. + */ + unsigned int features; + /** + * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will + * imply the top level of the table. + */ + u8 hw_max_vasz_lg2; + /** + * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format + * might select a lower maximum OA. 
+ */ + u8 hw_max_oasz_lg2; +}; + +/* Generate the exported function signatures from iommu_pt.h */ +#define IOMMU_PROTOTYPES(fmt) \ + int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ + const struct pt_iommu_##fmt##_cfg *cfg, \ + gfp_t gfp); \ + void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ + struct pt_iommu_##fmt##_hw_info *info) +#define IOMMU_FORMAT(fmt, member) \ + struct pt_iommu_##fmt { \ + struct pt_iommu iommu; \ + struct pt_##fmt member; \ + }; \ + IOMMU_PROTOTYPES(fmt) + +/* + * The driver should setup its domain struct like + * union { + * struct iommu_domain domain; + * struct pt_iommu_xxx xx; + * }; + * PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, xx.iommu, domain); + * + * Which creates an alias between driver_domain.domain and + * driver_domain.xx.iommu.domain. This is to avoid a mass rename of existing + * driver_domain.domain users. + */ +#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb) \ + static_assert(offsetof(s, pt_iommu_memb.domain) == \ + offsetof(s, domain_memb)) + +#undef IOMMU_PROTOTYPES +#undef IOMMU_FORMAT +#endif -- cgit v1.2.3 From 879ced2bab1ba95e98fac56c9503791183bc7cbb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:02 -0400 Subject: iommupt: Add the AMD IOMMU v1 page table format AMD IOMMU v1 is unique in supporting contiguous pages with a variable size and it can decode the full 64 bit VA space. Unlike other x86 page tables this explicitly does not do sign extension as part of allowing the entire 64 bit VA space to be supported. The general design is quite similar to the x86 PAE format, except with a 6th level and quite different PTE encoding. This format is the only one that uses the PT_FEAT_DYNAMIC_TOP feature in the existing code as the existing AMDv1 code starts out with a 3 level table and adds levels on the fly if more IOVA is needed. 
Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 65,64 , 62,61 , -1.01 2^13, 70,66 , 67,62 , -8.08 2^14, 73,69 , 71,65 , -9.09 2^15, 78,75 , 75,71 , -5.05 2^16, 89,89 , 86,84 , -2.02 2^17, 128,121 , 124,112 , -10.10 2^18, 175,175 , 170,163 , -4.04 2^19, 264,306 , 261,279 , 6.06 2^20, 444,525 , 438,489 , 10.10 2^21, 60,62 , 58,59 , 1.01 256*2^12, 381,1833 , 367,1795 , 79.79 256*2^21, 375,1623 , 356,1555 , 77.77 256*2^30, 356,1338 , 349,1277 , 72.72 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 76,89 , 71,86 , 17.17 2^13, 79,89 , 75,86 , 12.12 2^14, 78,90 , 74,86 , 13.13 2^15, 82,89 , 74,86 , 13.13 2^16, 79,89 , 74,86 , 13.13 2^17, 81,89 , 77,87 , 11.11 2^18, 90,92 , 87,89 , 2.02 2^19, 91,93 , 88,90 , 2.02 2^20, 96,95 , 91,92 , 1.01 2^21, 72,88 , 68,85 , 20.20 256*2^12, 372,6583 , 364,6251 , 94.94 256*2^21, 398,6032 , 392,5758 , 93.93 256*2^30, 396,5665 , 389,5258 , 92.92 The ~5-17x speedup when working with multi-PTE map/unmaps is because the AMD implementation rewalks the entire table on every new PTE while this version retains its position. The same speedup will be seen with dirtys as well. The old implementation triggers a compiler optimization that ends up generating a "rep stos" memset for contiguous PTEs. Since AMD can have contiguous PTEs that span 2Kbytes of table this is a huge win compared to a normal movq loop. It is why the unmap side has a fairly flat runtime as the contiguous PTE size increases. This version makes it explicit with a memset64() call. 
Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 19 +++++++++++++++++++ include/linux/generic_pt/iommu.h | 12 ++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index e69a75511313..21e33489cbf2 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -132,4 +132,23 @@ enum pt_features { PT_FEAT_FMT_START, }; +struct pt_amdv1 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted. Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. + */ + PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START, + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. + */ + PT_FEAT_AMDV1_FORCE_COHERENCE, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index defa96abc497..dc731fe003d1 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -145,6 +145,18 @@ struct pt_iommu_cfg { static_assert(offsetof(s, pt_iommu_memb.domain) == \ offsetof(s, domain_memb)) +struct pt_iommu_amdv1_cfg { + struct pt_iommu_cfg common; + unsigned int starting_level; +}; + +struct pt_iommu_amdv1_hw_info { + u64 host_pt_root; + u8 mode; +}; + +IOMMU_FORMAT(amdv1, amdpt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From 9d4c274cd7d5e1b6b9e116e155f16bcd208237d8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:03 -0400 Subject: iommupt: Add iova_to_phys op iova_to_phys is a performance path for the DMA API and iommufd, implement it using an unrolled get_user_pages() like function waterfall scheme. 
The implementation itself is fairly trivial. Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index dc731fe003d1..5622856e1998 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -116,11 +116,13 @@ struct pt_iommu_cfg { }; /* Generate the exported function signatures from iommu_pt.h */ -#define IOMMU_PROTOTYPES(fmt) \ - int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ - const struct pt_iommu_##fmt##_cfg *cfg, \ - gfp_t gfp); \ - void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ +#define IOMMU_PROTOTYPES(fmt) \ + phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ + dma_addr_t iova); \ + int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ + const struct pt_iommu_##fmt##_cfg *cfg, \ + gfp_t gfp); \ + void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table, \ struct pt_iommu_##fmt##_hw_info *info) #define IOMMU_FORMAT(fmt, member) \ struct pt_iommu_##fmt { \ @@ -129,6 +131,13 @@ struct pt_iommu_cfg { }; \ IOMMU_PROTOTYPES(fmt) +/* + * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the + * iommu_pt + */ +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, + /* * The driver should setup its domain struct like * union { -- cgit v1.2.3 From 7c53f4238aa8bfb476e177263133ead2eeb8d55d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:04 -0400 Subject: iommupt: Add unmap_pages op unmap_pages removes mappings and any fully contained interior tables from the given range. 
This follows the now-standard iommu_domain API definition where it does not split up larger page sizes into smaller. The caller must perform unmap only on ranges created by map or it must have somehow otherwise determined safe cut points (e.g. iommufd/vfio use iova_to_phys to scan for them). A future work will provide 'cut' which explicitly does the page size split if the HW can support it. unmap is implemented with a recursive descent of the tree. If the caller provides a VA range that spans an entire table item then the table memory can be freed as well. If an entire table item can be freed then this version will also check the leaf-only level of the tree to ensure that all entries are present to generate -EINVAL. Many of the existing drivers don't do this extra check. This version sits under the iommu_domain_ops as unmap_pages() but does not require the external page size calculation. The implementation is actually unmap_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangement of page sizes. A future series can optimize __iommu_unmap() to take advantage of this. Freed page table memory is batched up in the gather and will be freed in the driver's iotlb_sync() callback after the IOTLB flush completes. 
Reviewed-by: Kevin Tian Reviewed-by: Pasha Tatashin Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 5622856e1998..ceb6bc9cea37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -9,6 +9,7 @@ #include #include +struct iommu_iotlb_gather; struct pt_iommu_ops; /** @@ -119,6 +120,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + size_t pt_iommu_##fmt##_unmap_pages( \ + struct iommu_domain *domain, unsigned long iova, \ + size_t pgsize, size_t pgcount, \ + struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -135,8 +140,9 @@ struct pt_iommu_cfg { * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the * iommu_pt */ -#define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, +#define IOMMU_PT_DOMAIN_OPS(fmt) \ + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .unmap_pages = &pt_iommu_##fmt##_unmap_pages /* * The driver should setup its domain struct like -- cgit v1.2.3 From dcd6a011a8d523a114af2360a8753de5bd60c139 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:05 -0400 Subject: iommupt: Add map_pages op map is slightly complicated because it has to handle a number of special edge cases: - Overmapping a previously shared, but now empty, table level with an OA. 
Requires validating and freeing the possibly empty tables - Doing the above across an entire to-be-created contiguous entry - Installing a new shared table level concurrently with another thread - Expanding the table by adding more top levels Table expansion is a unique feature of AMDv1, this version is quite similar except we handle racing concurrent lockless map. The table top pointer and starting level are encoded in a single uintptr_t which ensures we can READ_ONCE() without tearing. Any op will do the READ_ONCE() and use that fixed point as its starting point. Concurrent expansion is handled with a table global spinlock. When inserting a new table entry map checks that the entire portion of the table is empty. This includes freeing any empty lower tables that will be overwritten by an OA. A separate free list is used while checking and collecting all the empty lower tables so that writing the new entry is uninterrupted, either the new entry fully writes or nothing changes. A special fast path for PAGE_SIZE is implemented that does a direct walk to the leaf level and installs a single entry. This gives ~15% improvement for iommu_map() when mapping lists of single pages. This version sits under the iommu_domain_ops as map_pages() but does not require the external page size calculation. The implementation is actually map_range() and can do arbitrary ranges, internally handling all the validation and supporting any arrangement of page sizes. A future series can optimize iommu_map() to take advantage of this. 
Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 59 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index ceb6bc9cea37..0d59423024d5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -11,6 +11,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; +struct pt_iommu_driver_ops; /** * DOC: IOMMU Radix Page Table @@ -43,6 +44,12 @@ struct pt_iommu { */ const struct pt_iommu_ops *ops; + /** + * @driver_ops: Function pointers provided by the HW driver to help + * manage HW details like caches. + */ + const struct pt_iommu_driver_ops *driver_ops; + /** * @nid: Node ID to use for table memory allocations. The IOMMU driver * may want to set the NID to the device's NID, if there are multiple @@ -84,6 +91,53 @@ struct pt_iommu_ops { void (*deinit)(struct pt_iommu *iommu_table); }; +/** + * struct pt_iommu_driver_ops - HW IOTLB cache flushing operations + * + * The IOMMU driver should implement these using container_of(iommu_table) to + * get to it's iommu_domain derived structure. All ops can be called in atomic + * contexts as they are buried under DMA API calls. + */ +struct pt_iommu_driver_ops { + /** + * @change_top: Update the top of table pointer + * @iommu_table: Table to operate on + * @top_paddr: New CPU physical address of the top pointer + * @top_level: IOMMU PT level of the new top + * + * Called under the get_top_lock() spinlock. The driver must update all + * HW references to this domain with a new top address and + * configuration. On return mappings placed in the new top must be + * reachable by the HW. + * + * top_level encodes the level in IOMMU PT format, level 0 is the + * smallest page size increasing from there. 
This has to be translated + * to any HW specific format. During this call the new top will not be + * visible to any other API. + * + * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if + * enabled. + */ + void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr, + unsigned int top_level); + + /** + * @get_top_lock: lock to hold when changing the table top + * @iommu_table: Table to operate on + * + * Return a lock to hold when changing the table top page table from + * being stored in HW. The lock will be held prior to calling + * change_top() and released once the top is fully visible. + * + * Typically this would be a lock that protects the iommu_domain's + * attachment list. + * + * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if + * enabled. + */ + spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table); +}; + static inline void pt_iommu_deinit(struct pt_iommu *iommu_table) { /* @@ -120,6 +174,10 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ + int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ + unsigned long iova, phys_addr_t paddr, \ + size_t pgsize, size_t pgcount, \ + int prot, gfp_t gfp, size_t *mapped); \ size_t pt_iommu_##fmt##_unmap_pages( \ struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ @@ -142,6 +200,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ + .map_pages = &pt_iommu_##fmt##_map_pages, \ .unmap_pages = &pt_iommu_##fmt##_unmap_pages /* -- cgit v1.2.3 From 4a00f943489103b4b9edff9f39bd484efbfb15fa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:06 -0400 Subject: iommupt: Add read_and_clear_dirty op IOMMU HW now supports updating a dirty bit in an entry when a DMA writes to the entry's VA range. iommufd has a uAPI to read and clear the dirty bits from the tables. 
This is a trivial recursive descent algorithm to read and optionally clear the dirty bits. The format needs a function to tell if a contiguous entry is dirty, and a function to clear a contiguous entry back to clean. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0d59423024d5..03a906fbe12a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -12,6 +12,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; struct pt_iommu_driver_ops; +struct iommu_dirty_bitmap; /** * DOC: IOMMU Radix Page Table @@ -182,6 +183,9 @@ struct pt_iommu_cfg { struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ struct iommu_iotlb_gather *iotlb_gather); \ + int pt_iommu_##fmt##_read_and_clear_dirty( \ + struct iommu_domain *domain, unsigned long iova, size_t size, \ + unsigned long flags, struct iommu_dirty_bitmap *dirty); \ int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table, \ const struct pt_iommu_##fmt##_cfg *cfg, \ gfp_t gfp); \ @@ -202,6 +206,8 @@ struct pt_iommu_cfg { .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ .map_pages = &pt_iommu_##fmt##_map_pages, \ .unmap_pages = &pt_iommu_##fmt##_unmap_pages +#define IOMMU_PT_DIRTY_OPS(fmt) \ + .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty /* * The driver should setup its domain struct like -- cgit v1.2.3 From e5359dcc617a2174d834bab4083340196615d8bd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:08 -0400 Subject: iommupt: Add a mock pagetable format for iommufd selftest to use The iommufd self test uses an xarray to store the pfns and their orders to emulate a page table. 
Slightly modify the amdv1 page table to create a real page table that has similar properties: - 2k base granule to simulate something like a 4k page table on a 64K PAGE_SIZE ARM system - Contiguous page support for every PFN order - Dirty tracking AMDv1 is the closest format, as it is the only one that already supports every page size. Tweak it to have only 5 levels and an 11 bit base granule and compile it separately as a format variant. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 03a906fbe12a..848a5fb76272 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -237,6 +237,12 @@ struct pt_iommu_amdv1_hw_info { IOMMU_FORMAT(amdv1, amdpt); +/* amdv1_mock is used by the iommufd selftest */ +#define pt_iommu_amdv1_mock pt_iommu_amdv1 +#define pt_iommu_amdv1_mock_cfg pt_iommu_amdv1_cfg +struct pt_iommu_amdv1_mock_hw_info; +IOMMU_PROTOTYPES(amdv1_mock); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From e93d5945ed5bb086431e83eed7ab98b6c058cc0b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:09 -0400 Subject: iommufd: Change the selftest to use iommupt instead of xarray The iommufd self test uses an xarray to store the pfns and their orders to emulate a page table. Make it act more like a real iommu driver by replacing the xarray with an iommupt based page table. The new AMDv1 mock format behaves similarly to the xarray. Add set_dirty() as an iommu_pt operation to allow the test suite to simulate HW dirty. Userspace can select between several formats including the normal AMDv1 format and a special MOCK_IOMMUPT_HUGE variation for testing huge page dirty tracking.
To make the dirty tracking test work the page table must only store exactly 2M huge pages otherwise the logic the test uses fails. They cannot be broken up or combined. Aside from aligning the selftest with a real page table implementation, this helps test the iommupt code itself. Reviewed-by: Kevin Tian Reviewed-by: Samiullah Khawaja Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 848a5fb76272..f2a763aba088 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -73,6 +73,18 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @set_dirty: Make the iova write dirty + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * + * This is only used by iommufd testing. It makes the iova dirty so that + * read_and_clear_dirty() will see it as dirty. Unlike all the other ops + * this one is safe to call without holding any locking. It may return + * -EAGAIN if there is a race. + */ + int (*set_dirty)(struct pt_iommu *iommu_table, dma_addr_t iova); + /** * @get_info: Return the pt_iommu_info structure * @iommu_table: Table to query -- cgit v1.2.3 From aef5de756ea871ab44e3a1a87be6c944e6587c51 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:10 -0400 Subject: iommupt: Add the x86 64 bit page table format This is used by x86 CPUs and can be used in AMD/VT-d x86 IOMMUs. When a x86 IOMMU is running SVA the MM will be using this format. This implementation follows the AMD v2 io-pgtable version. There is nothing remarkable here, the format can have 4 or 5 levels and limited support for different page sizes. No contiguous pages support. x86 uses a sign extension mechanism where the top bits of the VA must match the sign bit. 
The core code supports this through PT_FEAT_SIGN_EXTEND which creates an upper and lower VA range. All the new operations will work correctly in both spaces, however currently there is no way to report the upper space to other layers. Future patches can improve that. In principle this can support 3 page table levels matching the 32 bit PAE table format, but no iommu driver needs this. The focus is on the modern 64 bit 4 and 5 level formats. Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 71,61 , 66,58 , -13.13 2^21, 66,60 , 61,55 , -10.10 2^30, 59,56 , 56,54 , -3.03 256*2^12, 392,1360 , 345,1289 , 73.73 256*2^21, 383,1159 , 335,1145 , 70.70 256*2^30, 378,965 , 331,892 , 62.62 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 77,71 , 73,68 , -7.07 2^21, 76,70 , 70,66 , -6.06 2^30, 69,66 , 66,63 , -4.04 256*2^12, 225,899 , 210,870 , 75.75 256*2^21, 262,722 , 248,710 , 65.65 256*2^30, 251,643 , 244,634 , 61.61 The small -ve values in the iommu_unmap() are due to the core code calling iommu_pgsize() before invoking the domain op. This is unnecessary with this implementation. Future work optimizes this and gets to 2%, 4%, 3%. Reviewed-by: Kevin Tian Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 13 +++++++++++++ include/linux/generic_pt/iommu.h | 11 +++++++++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 21e33489cbf2..96f8a6a7d60e 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -151,4 +151,17 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_x86_64 { + struct pt_common common; +}; + +enum { + /* + * The memory backing the tables is encrypted.
Use __sme_set() to adjust + * the page table pointers in the tree. This only works with + * CONFIG_AMD_MEM_ENCRYPT. + */ + PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START, +}; + #endif diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index f2a763aba088..fde7ccf007c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -255,6 +255,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_x86_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_x86_64_hw_info { + u64 gcr3_pt; + u8 levels; +}; + +IOMMU_FORMAT(x86_64, x86_64_pt); + #undef IOMMU_PROTOTYPES #undef IOMMU_FORMAT #endif -- cgit v1.2.3 From 2fdf6db436e3071a8e4c9c3e67674448a13860d4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:12 -0400 Subject: iommu/amd: Remove AMD io_pgtable support None of this is used anymore, delete it. Reviewed-by: Alejandro Jimenez Reviewed-by: Vasant Hegde Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/io-pgtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 8a823c6f2b4a..7a1516011ccf 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -15,8 +15,6 @@ enum io_pgtable_fmt { ARM_64_LPAE_S2, ARM_V7S, ARM_MALI_LPAE, - AMD_IOMMU_V1, - AMD_IOMMU_V2, APPLE_DART, APPLE_DART2, IO_PGTABLE_NUM_FMTS, -- cgit v1.2.3 From bc5233c0904eb116a4bd94e10cd3666733216063 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 4 Nov 2025 14:30:13 -0400 Subject: iommupt: Add a kunit test for the IOMMU implementation This intends to have high coverage of the page table format functions and the IOMMU implementation itself, exercising the various corner cases. 
The kunit tests can be run in the kunit framework, using commands like: tools/testing/kunit/kunit.py run --build_dir build_kunit_arm64 --arch arm64 --make_options LLVM=-19 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_uml --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_x86_64 --arch x86_64 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_i386 --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig tools/testing/kunit/kunit.py run --build_dir build_kunit_i386pae --arch i386 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig --kconfig_add CONFIG_X86_PAE=y There are several interesting corner cases on the 32 bit platforms that need checking. Like the generic tests, these are run on the format's configuration list using kunit "params". This also checks the core iommu parts of the page table code as it enters the logic through a mock iommu_domain. The following are checked: - PT_FEAT_DYNAMIC_TOP properly adds levels one by one - Every page size can be iommu_map()'d, and mapping creates that size - iommu_iova_to_phys() works with every page size - Test converting OA -> non present -> OA when the two OAs overlap and free table levels - Test that unmap stops at holes, unmap doesn't split, and unmap returns the right values for partial unmap requests - Randomly map/unmap. Checks map with random sizes, that map fails when hitting collisions doing nothing, unmap/map with random intersections and full unmap of random sizes. 
Also checks iommu_iova_to_phys() with random sizes - Check for memory leaks by monitoring NR_SECONDARY_PAGETABLE Reviewed-by: Kevin Tian Tested-by: Alejandro Jimenez Tested-by: Pasha Tatashin Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/irqchip/riscv-imsic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/irqchip/riscv-imsic.h b/include/linux/irqchip/riscv-imsic.h index 7494952c5518..7f3ff5c5ea53 100644 --- a/include/linux/irqchip/riscv-imsic.h +++ b/include/linux/irqchip/riscv-imsic.h @@ -10,7 +10,6 @@ #include #include #include -#include #define IMSIC_MMIO_PAGE_SHIFT 12 #define IMSIC_MMIO_PAGE_SZ BIT(IMSIC_MMIO_PAGE_SHIFT) @@ -86,7 +85,7 @@ static inline const struct imsic_global_config *imsic_get_global_config(void) #endif -#ifdef CONFIG_ACPI +#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_RISCV_IMSIC) int imsic_platform_acpi_probe(struct fwnode_handle *fwnode); struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev); #else -- cgit v1.2.3 From aefd967dab6469f5b827b59e50016a760dcc1fbc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 Oct 2025 15:22:31 -0300 Subject: iommupt: Use the incoherent start/stop functions for PT_FEAT_DMA_INCOHERENT This is the first step to supporting an incoherent walker, start and stop the incoherence around the allocation and frees of the page table memory. The iommu_pages API maps this to dma_map/unmap_single(), or arch cache flushing calls. 
Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 6 ++++++ include/linux/generic_pt/iommu.h | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 96f8a6a7d60e..883069e32952 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -85,6 +85,12 @@ enum { * position. */ enum pt_features { + /** + * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before + * assuming the HW can read it. Otherwise a SMP release is sufficient + * for HW to read it. + */ + PT_FEAT_DMA_INCOHERENT, /** * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to * PT_VADDR_MAX. diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index fde7ccf007c5..21132e342a79 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -57,6 +57,13 @@ struct pt_iommu { * table walkers. */ int nid; + + /** + * @iommu_device: Device pointer used for any DMA cache flushing when + * PT_FEAT_DMA_INCOHERENT. This is the iommu device that created the + * page table which must have dma ops that perform cache flushing. + */ + struct device *iommu_device; }; /** -- cgit v1.2.3 From 5448c1558f60d4051c90938f2878c6fb20e2982a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 23 Oct 2025 15:22:33 -0300 Subject: iommupt: Add the Intel VT-d second stage page table format The VT-d second stage format is almost the same as the x86 PAE format, except the bit encodings in the PTE are different and a few new PTE features, like force coherency are present. Among all the formats it is unique in not having a designated present bit. 
Comparing the performance of several operations to the existing version: iommu_map() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 53,66 , 50,64 , 21.21 2^21, 59,70 , 56,67 , 16.16 2^30, 54,66 , 52,63 , 17.17 256*2^12, 384,524 , 337,516 , 34.34 256*2^21, 387,632 , 336,626 , 46.46 256*2^30, 376,629 , 323,623 , 48.48 iommu_unmap() pgsz ,avg new,old ns, min new,old ns , min % (+ve is better) 2^12, 67,86 , 63,84 , 25.25 2^21, 64,84 , 59,80 , 26.26 2^30, 59,78 , 56,74 , 24.24 256*2^12, 216,335 , 198,317 , 37.37 256*2^21, 245,350 , 232,344 , 32.32 256*2^30, 248,345 , 226,339 , 33.33 Cc: Tina Zhang Cc: Kevin Tian Cc: Lu Baolu Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 18 ++++++++++++++++++ include/linux/generic_pt/iommu.h | 11 +++++++++++ 2 files changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 883069e32952..6a9a1acb5aad 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -157,6 +157,24 @@ enum { PT_FEAT_AMDV1_FORCE_COHERENCE, }; +struct pt_vtdss { + struct pt_common common; +}; + +enum { + /* + * The PTEs are set to prevent cache incoherent traffic, such as PCI no + * snoop. This is set either at creation time or before the first map + * operation. + */ + PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START, + /* + * Prevent creating read-only PTEs. Used to work around HW errata + * ERRATA_772415_SPR17. 
+ */ + PT_FEAT_VTDSS_FORCE_WRITEABLE, +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 21132e342a79..cfe05a77f86b 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -262,6 +262,17 @@ IOMMU_FORMAT(amdv1, amdpt); struct pt_iommu_amdv1_mock_hw_info; IOMMU_PROTOTYPES(amdv1_mock); +struct pt_iommu_vtdss_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_vtdss_hw_info { + u64 ssptptr; + u8 aw; +}; + +IOMMU_FORMAT(vtdss, vtdss_pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; }; -- cgit v1.2.3 From 0485a18d9141775d54489997b284fe2557b5898e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 15:46:32 +0100 Subject: fs: rename fs_types.h to fs_dirent.h We will split out a bunch of types into a separate header. So free up the appropriate name for it. Link: https://patch.msgid.link/20251104-work-fs-header-v1-1-fb39a2efe39e@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- include/linux/fs_dirent.h | 78 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs_types.h | 75 --------------------------------------------- 3 files changed, 79 insertions(+), 76 deletions(-) create mode 100644 include/linux/fs_dirent.h delete mode 100644 include/linux/fs_types.h (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..3c971ddace41 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/fs_dirent.h b/include/linux/fs_dirent.h new file mode 100644 index 000000000000..92f75c5bac19 --- /dev/null +++ b/include/linux/fs_dirent.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_DIRENT_H +#define _LINUX_FS_DIRENT_H + +#include +#include + +/* + * This is a header for the common implementation of dirent + * to fs 
on-disk file type conversion. Although the fs on-disk + * bits are specific to every file system, in practice, many + * file systems use the exact same on-disk format to describe + * the lower 3 file type bits that represent the 7 POSIX file + * types. + * + * It is important to note that the definitions in this + * header MUST NOT change. This would break both the + * userspace ABI and the on-disk format of filesystems + * using this code. + * + * All those file systems can use this generic code for the + * conversions. + */ + +/* + * struct dirent file types + * exposed to user via getdents(2), readdir(3) + * + * These match bits 12..15 of stat.st_mode + * (ie "(i_mode >> 12) & 15"). + */ +#define S_DT_SHIFT 12 +#define S_DT(mode) (((mode) & S_IFMT) >> S_DT_SHIFT) +#define S_DT_MASK (S_IFMT >> S_DT_SHIFT) + +/* these are defined by POSIX and also present in glibc's dirent.h */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +#define DT_MAX (S_DT_MASK + 1) /* 16 */ + +/* + * fs on-disk file types. + * Only the low 3 bits are used for the POSIX file types. + * Other bits are reserved for fs private use. + * These definitions are shared and used by multiple filesystems, + * and MUST NOT change under any circumstances. + * + * Note that no fs currently stores the whiteout type on-disk, + * so whiteout dirents are exposed to user as DT_CHR. 
+ */ +#define FT_UNKNOWN 0 +#define FT_REG_FILE 1 +#define FT_DIR 2 +#define FT_CHRDEV 3 +#define FT_BLKDEV 4 +#define FT_FIFO 5 +#define FT_SOCK 6 +#define FT_SYMLINK 7 + +#define FT_MAX 8 + +/* + * declarations for helper functions, accompanying implementation + * is in fs/fs_dirent.c + */ +extern unsigned char fs_ftype_to_dtype(unsigned int filetype); +extern unsigned char fs_umode_to_ftype(umode_t mode); +extern unsigned char fs_umode_to_dtype(umode_t mode); + +#endif /* _LINUX_FS_DIRENT_H */ diff --git a/include/linux/fs_types.h b/include/linux/fs_types.h deleted file mode 100644 index 54816791196f..000000000000 --- a/include/linux/fs_types.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FS_TYPES_H -#define _LINUX_FS_TYPES_H - -/* - * This is a header for the common implementation of dirent - * to fs on-disk file type conversion. Although the fs on-disk - * bits are specific to every file system, in practice, many - * file systems use the exact same on-disk format to describe - * the lower 3 file type bits that represent the 7 POSIX file - * types. - * - * It is important to note that the definitions in this - * header MUST NOT change. This would break both the - * userspace ABI and the on-disk format of filesystems - * using this code. - * - * All those file systems can use this generic code for the - * conversions. - */ - -/* - * struct dirent file types - * exposed to user via getdents(2), readdir(3) - * - * These match bits 12..15 of stat.st_mode - * (ie "(i_mode >> 12) & 15"). - */ -#define S_DT_SHIFT 12 -#define S_DT(mode) (((mode) & S_IFMT) >> S_DT_SHIFT) -#define S_DT_MASK (S_IFMT >> S_DT_SHIFT) - -/* these are defined by POSIX and also present in glibc's dirent.h */ -#define DT_UNKNOWN 0 -#define DT_FIFO 1 -#define DT_CHR 2 -#define DT_DIR 4 -#define DT_BLK 6 -#define DT_REG 8 -#define DT_LNK 10 -#define DT_SOCK 12 -#define DT_WHT 14 - -#define DT_MAX (S_DT_MASK + 1) /* 16 */ - -/* - * fs on-disk file types. 
- * Only the low 3 bits are used for the POSIX file types. - * Other bits are reserved for fs private use. - * These definitions are shared and used by multiple filesystems, - * and MUST NOT change under any circumstances. - * - * Note that no fs currently stores the whiteout type on-disk, - * so whiteout dirents are exposed to user as DT_CHR. - */ -#define FT_UNKNOWN 0 -#define FT_REG_FILE 1 -#define FT_DIR 2 -#define FT_CHRDEV 3 -#define FT_BLKDEV 4 -#define FT_FIFO 5 -#define FT_SOCK 6 -#define FT_SYMLINK 7 - -#define FT_MAX 8 - -/* - * declarations for helper functions, accompanying implementation - * is in fs/fs_types.c - */ -extern unsigned char fs_ftype_to_dtype(unsigned int filetype); -extern unsigned char fs_umode_to_ftype(umode_t mode); -extern unsigned char fs_umode_to_dtype(umode_t mode); - -#endif -- cgit v1.2.3 From b2f35ac4146d32d4424aaa941bbc681f12c1b9e6 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:04 -0700 Subject: iomap: add caller-provided callbacks for read and readahead Add caller-provided callbacks for read and readahead so that it can be used generically, especially by filesystems that are not block-based. In particular, this: * Modifies the read and readahead interface to take in a struct iomap_read_folio_ctx that is publicly defined as: struct iomap_read_folio_ctx { const struct iomap_read_ops *ops; struct folio *cur_folio; struct readahead_control *rac; void *read_ctx; }; where struct iomap_read_ops is defined as: struct iomap_read_ops { int (*read_folio_range)(const struct iomap_iter *iter, struct iomap_read_folio_ctx *ctx, size_t len); void (*submit_read)(struct iomap_read_folio_ctx *ctx); }; read_folio_range() reads in the folio range and must be provided by the caller. submit_read() is optional and is used for submitting any pending read requests.
* Modifies existing filesystems that use iomap for read and readahead to use the new API, through the new statically inlined helpers iomap_bio_read_folio() and iomap_bio_readahead(). There is no change in functionality for those filesystems. Signed-off-by: Joanne Koong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 63 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 4469b2318b08..37435b912755 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -16,6 +16,7 @@ struct inode; struct iomap_iter; struct iomap_dio; struct iomap_writepage_ctx; +struct iomap_read_folio_ctx; struct iov_iter; struct kiocb; struct page; @@ -337,8 +338,10 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); -void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); +int iomap_read_folio(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); +void iomap_readahead(const struct iomap_ops *ops, + struct iomap_read_folio_ctx *ctx); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); @@ -465,6 +468,8 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio, loff_t pos, loff_t end_pos, unsigned int dirty_len); int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); +void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, + int error); void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len); 
void iomap_finish_folio_write(struct inode *inode, struct folio *folio, @@ -473,6 +478,34 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio, int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio); int iomap_writepages(struct iomap_writepage_ctx *wpc); +struct iomap_read_folio_ctx { + const struct iomap_read_ops *ops; + struct folio *cur_folio; + struct readahead_control *rac; + void *read_ctx; +}; + +struct iomap_read_ops { + /* + * Read in a folio range. + * + * The caller is responsible for calling iomap_finish_folio_read() after + * reading in the folio range. This should be done even if an error is + * encountered during the read. + * + * Returns 0 on success or a negative error on failure. + */ + int (*read_folio_range)(const struct iomap_iter *iter, + struct iomap_read_folio_ctx *ctx, size_t len); + + /* + * Submit any pending read requests. + * + * This is optional. + */ + void (*submit_read)(struct iomap_read_folio_ctx *ctx); +}; + /* * Flags for direct I/O ->end_io: */ @@ -538,4 +571,30 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, extern struct bio_set iomap_ioend_bioset; +#ifdef CONFIG_BLOCK +extern const struct iomap_read_ops iomap_bio_read_ops; + +static inline void iomap_bio_read_folio(struct folio *folio, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .cur_folio = folio, + }; + + iomap_read_folio(ops, &ctx); +} + +static inline void iomap_bio_readahead(struct readahead_control *rac, + const struct iomap_ops *ops) +{ + struct iomap_read_folio_ctx ctx = { + .ops = &iomap_bio_read_ops, + .rac = rac, + }; + + iomap_readahead(ops, &ctx); +} +#endif /* CONFIG_BLOCK */ + #endif /* LINUX_IOMAP_H */ -- cgit v1.2.3 From d4e88bb08e5f7e6eb4e9c3685894b9b57bfdfb08 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 25 Sep 2025 17:26:06 -0700 Subject: iomap: make iomap_read_folio() a void return No errors are propagated in iomap_read_folio(). 
Change iomap_read_folio() to a void return to make this clearer to callers. Signed-off-by: Joanne Koong Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 37435b912755..6d864b446b6e 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -338,7 +338,7 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter) ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -int iomap_read_folio(const struct iomap_ops *ops, +void iomap_read_folio(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); void iomap_readahead(const struct iomap_ops *ops, struct iomap_read_folio_ctx *ctx); -- cgit v1.2.3 From f8d98072feee32722086ddae4f288b6c45ae4330 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 3 Oct 2025 09:46:35 -0400 Subject: filemap: add helper to look up dirty folios in a range Add a new filemap_get_folios_dirty() helper to look up existing dirty folios in a range and add them to a folio_batch. This is to support optimization of certain iomap operations that only care about dirty folios in a target range. For example, zero range only zeroes the subset of dirty pages over unwritten mappings, seek hole/data may use similar logic in the future, etc. Note that the helper is intended for use under internal fs locks. Therefore it trylocks folios in order to filter out clean folios. This loosely follows the logic from filemap_range_has_writeback(). Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- include/linux/pagemap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09b581c1d878..7274a86b4871 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -977,6 +977,8 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch); +unsigned filemap_get_folios_dirty(struct address_space *mapping, + pgoff_t *start, pgoff_t end, struct folio_batch *fbatch); struct folio *read_cache_folio(struct address_space *, pgoff_t index, filler_t *filler, struct file *file); -- cgit v1.2.3 From 395ed1ef0012e1bb1e4050e84ba0173b3623112a Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Fri, 3 Oct 2025 09:46:37 -0400 Subject: iomap: optional zero range dirty folio processing The only way zero range can currently process unwritten mappings with dirty pagecache is to check whether the range is dirty before mapping lookup and then flush when at least one underlying mapping is unwritten. This ordering is required to prevent iomap lookup from racing with folio writeback and reclaim. Since zero range can skip ranges of unwritten mappings that are clean in cache, this operation can be improved by allowing the filesystem to provide a set of dirty folios that require zeroing. In turn, rather than flush or iterate file offsets, zero range can iterate on folios in the batch and advance over clean or uncached ranges in between. Add a folio_batch in struct iomap and provide a helper for filesystems to populate the batch at lookup time. Update the folio lookup path to return the next folio in the batch, if provided, and advance the iter if the folio starts beyond the current offset. 
Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6d864b446b6e..65d123114883 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -9,6 +9,7 @@ #include #include #include +#include struct address_space; struct fiemap_extent_info; @@ -242,6 +243,7 @@ struct iomap_iter { unsigned flags; struct iomap iomap; struct iomap srcmap; + struct folio_batch *fbatch; void *private; }; @@ -350,6 +352,8 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio); int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops); +loff_t iomap_fill_dirty_folios(struct iomap_iter *iter, loff_t offset, + loff_t length); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops, const struct iomap_write_ops *write_ops, void *private); -- cgit v1.2.3 From 001397f5ef4908ea46a63059439e8c3bf3552d9f Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 31 Oct 2025 14:10:26 +0100 Subject: iomap: add IOMAP_DIO_FSBLOCK_ALIGNED flag Btrfs requires all of its bios to be fs block aligned, normally it's totally fine but with the incoming block size larger than page size (bs > ps) support, the requirement is no longer met for direct IOs. Because iomap_dio_bio_iter() calls bio_iov_iter_get_pages(), only requiring alignment to be bdev_logical_block_size(). In the real world that value is either 512 or 4K, on 4K page sized systems it means bio_iov_iter_get_pages() can break the bio at any page boundary, breaking btrfs' requirement for bs > ps cases. To address this problem, introduce a new public iomap dio flag, IOMAP_DIO_FSBLOCK_ALIGNED. 
When calling __iomap_dio_rw() with that new flag, iomap_dio::flags will inherit that new flag, and iomap_dio_bio_iter() will take fs block size into the calculation of the alignment, and pass the alignment to bio_iov_iter_get_pages(), respecting the fs block size requirement. The initial user of this flag will be btrfs, which needs to calculate the checksum for direct read and thus requires the biovec to be fs block aligned for the incoming bs > ps support. Signed-off-by: Qu Wenruo Reviewed-by: Pankaj Raghav [hch: also align pos/len, incorporate the trace flags from Darrick] Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251031131045.1613229-2-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 65d123114883..8b1ac08c7474 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -553,6 +553,14 @@ struct iomap_dio_ops { */ #define IOMAP_DIO_PARTIAL (1 << 2) +/* + * Ensure each bio is aligned to fs block size. + * + * For filesystems which need to calculate/verify the checksum of each fs + * block. Otherwise they may not be able to handle unaligned bios. 
+ */ +#define IOMAP_DIO_FSBLOCK_ALIGNED (1 << 3) + ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags, void *private, size_t done_before); -- cgit v1.2.3 From cf76553aaa363620f58a6b6409bf544f4bcfa8de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Nov 2025 11:00:14 +0100 Subject: entry,unwind/deferred: Fix unwind_reset_info() placement Stephen reported that on KASAN builds he's seeing: vmlinux.o: warning: objtool: user_exc_vmm_communication+0x15a: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_debug_user+0x182: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: exc_int3+0x123: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: noist_exc_machine_check+0x17a: call to __kasan_check_read() leaves .noinstr.text section vmlinux.o: warning: objtool: fred_exc_machine_check+0x17e: call to __kasan_check_read() leaves .noinstr.text section This turns out to be atomic ops from unwind_reset_info() that have explicit instrumentation. Place unwind_reset_info() in the preceding instrumentation_begin() section. 
Fixes: c6439bfaabf2 ("Merge tag 'trace-deferred-unwind-v6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace") Reported-by: Stephen Rothwell Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251105100014.GY4068168@noisy.programming.kicks-ass.net --- include/linux/irq-entry-common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d643c7c87822..ba1ed42f8a1c 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -253,11 +253,11 @@ static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) static __always_inline void exit_to_user_mode(void) { instrumentation_begin(); + unwind_reset_info(); trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); instrumentation_end(); - unwind_reset_info(); user_enter_irqoff(); arch_exit_to_user_mode(); lockdep_hardirqs_on(CALLER_ADDR0); -- cgit v1.2.3 From 8637fa89e678422995301ddb20b74190dffcccee Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Tue, 4 Nov 2025 20:50:10 +0800 Subject: block: add __must_check attribute to sb_min_blocksize() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When sb_min_blocksize() returns 0 and the return value is not checked, it may lead to a situation where sb->s_blocksize is 0 when accessing the filesystem super block. After commit a64e5a596067bd ("bdev: add back PAGE_SIZE block size validation for sb_set_blocksize()"), this becomes more likely to happen when the block device’s logical_block_size is larger than PAGE_SIZE and the filesystem is unformatted. Add the __must_check attribute to ensure callers always check the return value. 
Cc: stable@vger.kernel.org # v6.15 Suggested-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Yongpeng Yang Link: https://patch.msgid.link/20251104125009.2111925-6-yangyongpeng.storage@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..3ea98c6cce81 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3423,8 +3423,8 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); +int sb_set_blocksize(struct super_block *sb, int size); +int __must_check sb_min_blocksize(struct super_block *sb, int size); int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); -- cgit v1.2.3 From ae83f3b72621bd3187eb7956c7c2993a97d4b187 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 9 Oct 2025 20:06:09 -0700 Subject: module: Add compile-time check for embedded NUL characters Long ago, the kernel module license checks were bypassed by embedding a NUL character in the MODULE_LICENSE() string[1]. By using a string like "GPL\0proprietary text", the kernel would only read "GPL" due to C string termination at the NUL byte, allowing proprietary modules to avoid kernel tainting and access GPL-only symbols. The MODULE_INFO() macro stores these strings in the .modinfo ELF section, and get_next_modinfo() uses strcmp()-family functions which stop at the first NUL. This split the embedded string into two separate .modinfo entries, with only the first part being processed by license_is_gpl_compatible(). 
Add a compile-time check using static_assert that compares the full string length (sizeof - 1) against __builtin_strlen(), which stops at the first NUL. If they differ, compilation fails with a clear error message. While this check can still be circumvented by modifying the ELF binary post-compilation, it prevents accidental embedded NULs and forces intentional abuse to require deliberate binary manipulation rather than simple source-level tricks. Build tested with test modules containing both valid and invalid license strings. The check correctly rejects: MODULE_LICENSE("GPL\0proprietary") while accepting normal declarations: MODULE_LICENSE("GPL") Link: https://lwn.net/Articles/82305/ [1] Suggested-by: Rusty Russell Signed-off-by: Kees Cook Reviewed-by: Daniel Gomez Reviewed-by: Aaron Tomlin Reviewed-by: Petr Pavlu Tested-by: Daniel Gomez Signed-off-by: Daniel Gomez --- include/linux/moduleparam.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 6907aedc4f74..915f32f7d888 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -26,6 +26,9 @@ /* Generic info of form tag = "info" */ #define MODULE_INFO(tag, info) \ + static_assert( \ + sizeof(info) - 1 == __builtin_strlen(info), \ + "MODULE_INFO(" #tag ", ...) contains embedded NUL byte"); \ static const char __UNIQUE_ID(modinfo)[] \ __used __section(".modinfo") __aligned(1) \ = __MODULE_INFO_PREFIX __stringify(tag) "=" info -- cgit v1.2.3 From 3c36965df80801344850388592e95033eceea05b Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Mon, 27 Oct 2025 12:05:22 +0100 Subject: regulator: Add support for MediaTek MT6363 SPMI PMIC Regulators Add a driver for the regulators found on the MT6363 PMIC, fully controlled by SPMI interface. This PMIC regulates voltage with an input range of 2.6-5.0V, and features 10 buck converters and 26 LDOs. 
Signed-off-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20251027110527.21002-5-angelogioacchino.delregno@collabora.com Signed-off-by: Mark Brown --- include/linux/regulator/mt6363-regulator.h | 330 +++++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 include/linux/regulator/mt6363-regulator.h (limited to 'include') diff --git a/include/linux/regulator/mt6363-regulator.h b/include/linux/regulator/mt6363-regulator.h new file mode 100644 index 000000000000..60761f01d3ad --- /dev/null +++ b/include/linux/regulator/mt6363-regulator.h @@ -0,0 +1,330 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2024 MediaTek Inc. + * Copyright (c) 2025 Collabora Ltd + */ + +#include + +#ifndef __LINUX_REGULATOR_MT6363_H +#define __LINUX_REGULATOR_MT6363_H + +/* Register */ +#define MT6363_TOP_TRAP 0x6 +#define MT6363_TOP_TMA_KEY_L 0x36e +#define MT6363_RG_BUCK0_EN_ADDR 0x210 +#define MT6363_RG_BUCK_VS2_EN_BIT 0 +#define MT6363_RG_BUCK_VBUCK1_EN_BIT 1 +#define MT6363_RG_BUCK_VBUCK2_EN_BIT 2 +#define MT6363_RG_BUCK_VBUCK3_EN_BIT 3 +#define MT6363_RG_BUCK_VBUCK4_EN_BIT 4 +#define MT6363_RG_BUCK_VBUCK5_EN_BIT 5 +#define MT6363_RG_BUCK_VBUCK6_EN_BIT 6 +#define MT6363_RG_BUCK_VBUCK7_EN_BIT 7 +#define MT6363_RG_BUCK1_EN_ADDR 0x213 +#define MT6363_RG_BUCK_VS1_EN_BIT 0 +#define MT6363_RG_BUCK_VS3_EN_BIT 1 +#define MT6363_RG_LDO_VSRAM_DIGRF_EN_BIT 4 +#define MT6363_RG_LDO_VSRAM_MDFE_EN_BIT 5 +#define MT6363_RG_LDO_VSRAM_MODEM_EN_BIT 6 +#define MT6363_RG_BUCK0_LP_ADDR 0x216 +#define MT6363_RG_BUCK_VS2_LP_BIT 0 +#define MT6363_RG_BUCK_VBUCK1_LP_BIT 1 +#define MT6363_RG_BUCK_VBUCK2_LP_BIT 2 +#define MT6363_RG_BUCK_VBUCK3_LP_BIT 3 +#define MT6363_RG_BUCK_VBUCK4_LP_BIT 4 +#define MT6363_RG_BUCK_VBUCK5_LP_BIT 5 +#define MT6363_RG_BUCK_VBUCK6_LP_BIT 6 +#define MT6363_RG_BUCK_VBUCK7_LP_BIT 7 +#define MT6363_RG_BUCK1_LP_ADDR 0x219 +#define MT6363_RG_BUCK_VS1_LP_BIT 0 +#define MT6363_RG_BUCK_VS3_LP_BIT 1 +#define 
MT6363_RG_LDO_VSRAM_DIGRF_LP_BIT 4 +#define MT6363_RG_LDO_VSRAM_MDFE_LP_BIT 5 +#define MT6363_RG_LDO_VSRAM_MODEM_LP_BIT 6 +#define MT6363_RG_BUCK_VS2_VOSEL_ADDR 0x21c +#define MT6363_RG_BUCK_VS2_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK1_VOSEL_ADDR 0x21d +#define MT6363_RG_BUCK_VBUCK1_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK2_VOSEL_ADDR 0x21e +#define MT6363_RG_BUCK_VBUCK2_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK3_VOSEL_ADDR 0x21f +#define MT6363_RG_BUCK_VBUCK3_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK4_VOSEL_ADDR 0x220 +#define MT6363_RG_BUCK_VBUCK4_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK5_VOSEL_ADDR 0x221 +#define MT6363_RG_BUCK_VBUCK5_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK6_VOSEL_ADDR 0x222 +#define MT6363_RG_BUCK_VBUCK6_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VBUCK7_VOSEL_ADDR 0x223 +#define MT6363_RG_BUCK_VBUCK7_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VS1_VOSEL_ADDR 0x224 +#define MT6363_RG_BUCK_VS1_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_BUCK_VS3_VOSEL_ADDR 0x225 +#define MT6363_RG_BUCK_VS3_VOSEL_MASK GENMASK(7, 0) +#define MT6363_RG_LDO_VSRAM_DIGRF_VOSEL_ADDR 0x228 +#define MT6363_RG_LDO_VSRAM_DIGRF_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_MDFE_VOSEL_ADDR 0x229 +#define MT6363_RG_LDO_VSRAM_MDFE_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_MODEM_VOSEL_ADDR 0x22a +#define MT6363_RG_LDO_VSRAM_MODEM_VOSEL_MASK GENMASK(6, 0) +#define MT6363_BUCK_TOP_KEY_PROT_LO 0x13fa +#define MT6363_BUCK_VS2_WDTDBG_VOSEL_ADDR 0x13fc +#define MT6363_BUCK_VBUCK1_WDTDBG_VOSEL_ADDR 0x13fd +#define MT6363_BUCK_VBUCK2_WDTDBG_VOSEL_ADDR 0x13fe +#define MT6363_BUCK_VBUCK3_WDTDBG_VOSEL_ADDR 0x13ff +#define MT6363_BUCK_VBUCK4_WDTDBG_VOSEL_ADDR 0x1400 +#define MT6363_BUCK_VBUCK5_WDTDBG_VOSEL_ADDR 0x1401 +#define MT6363_BUCK_VBUCK6_WDTDBG_VOSEL_ADDR 0x1402 +#define MT6363_BUCK_VBUCK7_WDTDBG_VOSEL_ADDR 0x1403 +#define MT6363_BUCK_VS1_WDTDBG_VOSEL_ADDR 0x1404 
+#define MT6363_BUCK_VS3_WDTDBG_VOSEL_ADDR 0x1405 +#define MT6363_RG_BUCK_EFUSE_RSV1 0x1417 +#define MT6363_RG_BUCK_EFUSE_RSV1_MASK GENMASK(7, 4) +#define MT6363_BUCK_VS2_OP_EN_0 0x145d +#define MT6363_BUCK_VS2_HW_LP_MODE 0x1468 +#define MT6363_BUCK_VBUCK1_OP_EN_0 0x14dd +#define MT6363_BUCK_VBUCK1_HW_LP_MODE 0x14e8 +#define MT6363_RG_BUCK_VBUCK1_SSHUB_EN_ADDR 0x14ea +#define MT6363_RG_BUCK_VBUCK1_SSHUB_VOSEL_ADDR 0x14eb +#define MT6363_RG_BUCK_VBUCK1_SSHUB_VOSEL_MASK GENMASK(7, 0) +#define MT6363_BUCK_VBUCK2_OP_EN_0 0x155d +#define MT6363_BUCK_VBUCK2_HW_LP_MODE 0x1568 +#define MT6363_RG_BUCK_VBUCK2_SSHUB_EN_ADDR 0x156a +#define MT6363_RG_BUCK_VBUCK2_SSHUB_VOSEL_ADDR 0x156b +#define MT6363_RG_BUCK_VBUCK2_SSHUB_VOSEL_MASK GENMASK(7, 0) +#define MT6363_BUCK_VBUCK3_OP_EN_0 0x15dd +#define MT6363_BUCK_VBUCK3_HW_LP_MODE 0x15e8 +#define MT6363_BUCK_VBUCK4_OP_EN_0 0x165d +#define MT6363_BUCK_VBUCK4_HW_LP_MODE 0x1668 +#define MT6363_RG_BUCK_VBUCK4_SSHUB_EN_ADDR 0x166a +#define MT6363_RG_BUCK_VBUCK4_SSHUB_VOSEL_ADDR 0x166b +#define MT6363_RG_BUCK_VBUCK4_SSHUB_VOSEL_MASK GENMASK(7, 0) +#define MT6363_BUCK_VBUCK5_OP_EN_0 0x16dd +#define MT6363_BUCK_VBUCK5_HW_LP_MODE 0x16e8 +#define MT6363_BUCK_VBUCK6_OP_EN_0 0x175d +#define MT6363_BUCK_VBUCK6_HW_LP_MODE 0x1768 +#define MT6363_BUCK_VBUCK7_OP_EN_0 0x17dd +#define MT6363_BUCK_VBUCK7_HW_LP_MODE 0x17e8 +#define MT6363_BUCK_VS1_OP_EN_0 0x185d +#define MT6363_BUCK_VS1_HW_LP_MODE 0x1868 +#define MT6363_BUCK_VS3_OP_EN_0 0x18dd +#define MT6363_BUCK_VS3_HW_LP_MODE 0x18e8 +#define MT6363_RG_VS1_FCCM_ADDR 0x1964 +#define MT6363_RG_VS1_FCCM_BIT 0 +#define MT6363_RG_VS3_FCCM_ADDR 0x1973 +#define MT6363_RG_VS3_FCCM_BIT 0 +#define MT6363_RG_BUCK0_FCCM_ADDR 0x1a02 +#define MT6363_RG_VBUCK1_FCCM_BIT 0 +#define MT6363_RG_VBUCK2_FCCM_BIT 1 +#define MT6363_RG_VBUCK3_FCCM_BIT 2 +#define MT6363_RG_VS2_FCCM_BIT 3 +#define MT6363_RG_BUCK0_1_FCCM_ADDR 0x1a82 +#define MT6363_RG_VBUCK4_FCCM_BIT 0 +#define MT6363_RG_VBUCK5_FCCM_BIT 1 +#define 
MT6363_RG_VBUCK6_FCCM_BIT 2 +#define MT6363_RG_VBUCK7_FCCM_BIT 3 +#define MT6363_RG_VCN13_VOSEL_ADDR 0x1b0f +#define MT6363_RG_VCN13_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VEMC_VOSEL_ADDR 0x1b10 +#define MT6363_RG_VEMC_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VEMC_VOSEL_1_MASK GENMASK(7, 4) +#define MT6363_RG_LDO_VSRAM_CPUB_VOSEL_ADDR 0x1b14 +#define MT6363_RG_LDO_VSRAM_CPUB_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_CPUM_VOSEL_ADDR 0x1b15 +#define MT6363_RG_LDO_VSRAM_CPUM_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_CPUL_VOSEL_ADDR 0x1b16 +#define MT6363_RG_LDO_VSRAM_CPUL_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_LDO_VSRAM_APU_VOSEL_ADDR 0x1b17 +#define MT6363_RG_LDO_VSRAM_APU_VOSEL_MASK GENMASK(6, 0) +#define MT6363_RG_VEMC_VOCAL_ADDR 0x1b1b +#define MT6363_RG_VEMC_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_LDO_VCN15_ADDR 0x1b57 +#define MT6363_RG_LDO_VCN15_EN_BIT 0 +#define MT6363_RG_LDO_VCN15_LP_BIT 1 +#define MT6363_LDO_VCN15_HW_LP_MODE 0x1b5b +#define MT6363_LDO_VCN15_OP_EN0 0x1b5c +#define MT6363_RG_LDO_VRF09_ADDR 0x1b65 +#define MT6363_RG_LDO_VRF09_EN_BIT 0 +#define MT6363_RG_LDO_VRF09_LP_BIT 1 +#define MT6363_LDO_VRF09_HW_LP_MODE 0x1b69 +#define MT6363_LDO_VRF09_OP_EN0 0x1b6a +#define MT6363_RG_LDO_VRF12_ADDR 0x1b73 +#define MT6363_RG_LDO_VRF12_EN_BIT 0 +#define MT6363_RG_LDO_VRF12_LP_BIT 1 +#define MT6363_LDO_VRF12_HW_LP_MODE 0x1b77 +#define MT6363_LDO_VRF12_OP_EN0 0x1b78 +#define MT6363_RG_LDO_VRF13_ADDR 0x1b81 +#define MT6363_RG_LDO_VRF13_EN_BIT 0 +#define MT6363_RG_LDO_VRF13_LP_BIT 1 +#define MT6363_LDO_VRF13_HW_LP_MODE 0x1b85 +#define MT6363_LDO_VRF13_OP_EN0 0x1b86 +#define MT6363_RG_LDO_VRF18_ADDR 0x1b8f +#define MT6363_RG_LDO_VRF18_EN_BIT 0 +#define MT6363_RG_LDO_VRF18_LP_BIT 1 +#define MT6363_LDO_VRF18_HW_LP_MODE 0x1b93 +#define MT6363_LDO_VRF18_OP_EN0 0x1b94 +#define MT6363_RG_LDO_VRFIO18_ADDR 0x1b9d +#define MT6363_RG_LDO_VRFIO18_EN_BIT 0 +#define MT6363_RG_LDO_VRFIO18_LP_BIT 1 +#define MT6363_LDO_VRFIO18_HW_LP_MODE 
0x1ba1 +#define MT6363_LDO_VRFIO18_OP_EN0 0x1ba2 +#define MT6363_RG_LDO_VTREF18_ADDR 0x1bd7 +#define MT6363_RG_LDO_VTREF18_EN_BIT 0 +#define MT6363_RG_LDO_VTREF18_LP_BIT 1 +#define MT6363_LDO_VTREF18_HW_LP_MODE 0x1bdb +#define MT6363_LDO_VTREF18_OP_EN0 0x1bdc +#define MT6363_RG_LDO_VAUX18_ADDR 0x1be5 +#define MT6363_RG_LDO_VAUX18_EN_BIT 0 +#define MT6363_RG_LDO_VAUX18_LP_BIT 1 +#define MT6363_LDO_VAUX18_HW_LP_MODE 0x1be9 +#define MT6363_LDO_VAUX18_OP_EN0 0x1bea +#define MT6363_RG_LDO_VEMC_ADDR 0x1bf3 +#define MT6363_RG_LDO_VEMC_EN_BIT 0 +#define MT6363_RG_LDO_VEMC_LP_BIT 1 +#define MT6363_LDO_VEMC_HW_LP_MODE 0x1bf7 +#define MT6363_LDO_VEMC_OP_EN0 0x1bf8 +#define MT6363_RG_LDO_VUFS12_ADDR 0x1c01 +#define MT6363_RG_LDO_VUFS12_EN_BIT 0 +#define MT6363_RG_LDO_VUFS12_LP_BIT 1 +#define MT6363_LDO_VUFS12_HW_LP_MODE 0x1c05 +#define MT6363_LDO_VUFS12_OP_EN0 0x1c06 +#define MT6363_RG_LDO_VUFS18_ADDR 0x1c0f +#define MT6363_RG_LDO_VUFS18_EN_BIT 0 +#define MT6363_RG_LDO_VUFS18_LP_BIT 1 +#define MT6363_LDO_VUFS18_HW_LP_MODE 0x1c13 +#define MT6363_LDO_VUFS18_OP_EN0 0x1c14 +#define MT6363_RG_LDO_VIO18_ADDR 0x1c1d +#define MT6363_RG_LDO_VIO18_EN_BIT 0 +#define MT6363_RG_LDO_VIO18_LP_BIT 1 +#define MT6363_LDO_VIO18_HW_LP_MODE 0x1c21 +#define MT6363_LDO_VIO18_OP_EN0 0x1c22 +#define MT6363_RG_LDO_VIO075_ADDR 0x1c57 +#define MT6363_RG_LDO_VIO075_EN_BIT 0 +#define MT6363_RG_LDO_VIO075_LP_BIT 1 +#define MT6363_LDO_VIO075_HW_LP_MODE 0x1c5b +#define MT6363_LDO_VIO075_OP_EN0 0x1c5c +#define MT6363_RG_LDO_VA12_1_ADDR 0x1c65 +#define MT6363_RG_LDO_VA12_1_EN_BIT 0 +#define MT6363_RG_LDO_VA12_1_LP_BIT 1 +#define MT6363_LDO_VA12_1_HW_LP_MODE 0x1c69 +#define MT6363_LDO_VA12_1_OP_EN0 0x1c6a +#define MT6363_RG_LDO_VA12_2_ADDR 0x1c73 +#define MT6363_RG_LDO_VA12_2_EN_BIT 0 +#define MT6363_RG_LDO_VA12_2_LP_BIT 1 +#define MT6363_LDO_VA12_2_HW_LP_MODE 0x1c77 +#define MT6363_LDO_VA12_2_OP_EN0 0x1c78 +#define MT6363_RG_LDO_VA15_ADDR 0x1c81 +#define MT6363_RG_LDO_VA15_EN_BIT 0 +#define 
MT6363_RG_LDO_VA15_LP_BIT 1 +#define MT6363_LDO_VA15_HW_LP_MODE 0x1c85 +#define MT6363_LDO_VA15_OP_EN0 0x1c86 +#define MT6363_RG_LDO_VM18_ADDR 0x1c8f +#define MT6363_RG_LDO_VM18_EN_BIT 0 +#define MT6363_RG_LDO_VM18_LP_BIT 1 +#define MT6363_LDO_VM18_HW_LP_MODE 0x1c93 +#define MT6363_LDO_VM18_OP_EN0 0x1c94 +#define MT6363_RG_LDO_VCN13_ADDR 0x1cd7 +#define MT6363_RG_LDO_VCN13_EN_BIT 0 +#define MT6363_RG_LDO_VCN13_LP_BIT 1 +#define MT6363_LDO_VCN13_HW_LP_MODE 0x1cdb +#define MT6363_LDO_VCN13_OP_EN0 0x1ce4 +#define MT6363_LDO_VSRAM_DIGRF_HW_LP_MODE 0x1cf1 +#define MT6363_LDO_VSRAM_DIGRF_OP_EN0 0x1cfa +#define MT6363_LDO_VSRAM_MDFE_HW_LP_MODE 0x1d5b +#define MT6363_LDO_VSRAM_MDFE_OP_EN0 0x1d64 +#define MT6363_LDO_VSRAM_MODEM_HW_LP_MODE 0x1d76 +#define MT6363_LDO_VSRAM_MODEM_OP_EN0 0x1d7f +#define MT6363_RG_LDO_VSRAM_CPUB_ADDR 0x1dd7 +#define MT6363_RG_LDO_VSRAM_CPUB_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_CPUB_LP_BIT 1 +#define MT6363_LDO_VSRAM_CPUB_HW_LP_MODE 0x1ddb +#define MT6363_LDO_VSRAM_CPUB_OP_EN0 0x1de4 +#define MT6363_RG_LDO_VSRAM_CPUM_ADDR 0x1ded +#define MT6363_RG_LDO_VSRAM_CPUM_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_CPUM_LP_BIT 1 +#define MT6363_LDO_VSRAM_CPUM_HW_LP_MODE 0x1df1 +#define MT6363_LDO_VSRAM_CPUM_OP_EN0 0x1dfa +#define MT6363_RG_LDO_VSRAM_CPUL_ADDR 0x1e57 +#define MT6363_RG_LDO_VSRAM_CPUL_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_CPUL_LP_BIT 1 +#define MT6363_LDO_VSRAM_CPUL_HW_LP_MODE 0x1e5b +#define MT6363_LDO_VSRAM_CPUL_OP_EN0 0x1e64 +#define MT6363_RG_LDO_VSRAM_APU_ADDR 0x1e6d +#define MT6363_RG_LDO_VSRAM_APU_EN_BIT 0 +#define MT6363_RG_LDO_VSRAM_APU_LP_BIT 1 +#define MT6363_LDO_VSRAM_APU_HW_LP_MODE 0x1e71 +#define MT6363_LDO_VSRAM_APU_OP_EN0 0x1e7a +#define MT6363_RG_VTREF18_VOCAL_ADDR 0x1ed8 +#define MT6363_RG_VTREF18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VTREF18_VOSEL_ADDR 0x1ed9 +#define MT6363_RG_VTREF18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VAUX18_VOCAL_ADDR 0x1edc +#define MT6363_RG_VAUX18_VOCAL_MASK GENMASK(3, 0) +#define 
MT6363_RG_VAUX18_VOSEL_ADDR 0x1edd +#define MT6363_RG_VAUX18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VCN15_VOCAL_ADDR 0x1ee3 +#define MT6363_RG_VCN15_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VCN15_VOSEL_ADDR 0x1ee4 +#define MT6363_RG_VCN15_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS18_VOCAL_ADDR 0x1ee7 +#define MT6363_RG_VUFS18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS18_VOSEL_ADDR 0x1ee8 +#define MT6363_RG_VUFS18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO18_VOCAL_ADDR 0x1eeb +#define MT6363_RG_VIO18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO18_VOSEL_ADDR 0x1eec +#define MT6363_RG_VIO18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VM18_VOCAL_ADDR 0x1eef +#define MT6363_RG_VM18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VM18_VOSEL_ADDR 0x1ef0 +#define MT6363_RG_VM18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VA15_VOCAL_ADDR 0x1ef3 +#define MT6363_RG_VA15_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VA15_VOSEL_ADDR 0x1ef4 +#define MT6363_RG_VA15_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF18_VOCAL_ADDR 0x1ef7 +#define MT6363_RG_VRF18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF18_VOSEL_ADDR 0x1ef8 +#define MT6363_RG_VRF18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRFIO18_VOCAL_ADDR 0x1efb +#define MT6363_RG_VRFIO18_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRFIO18_VOSEL_ADDR 0x1efc +#define MT6363_RG_VRFIO18_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO075_VOCFG_ADDR 0x1f01 +#define MT6363_RG_VIO075_VOCAL_ADDR MT6363_RG_VIO075_VOCFG_ADDR +#define MT6363_RG_VIO075_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VIO075_VOSEL_ADDR MT6363_RG_VIO075_VOCFG_ADDR +#define MT6363_RG_VIO075_VOSEL_MASK GENMASK(6, 4) +#define MT6363_RG_VCN13_VOCAL_ADDR 0x1f58 +#define MT6363_RG_VCN13_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS12_VOCAL_ADDR 0x1f61 +#define MT6363_RG_VUFS12_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VUFS12_VOSEL_ADDR 0x1f62 +#define MT6363_RG_VUFS12_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_1_VOCAL_ADDR 0x1f65 
+#define MT6363_RG_VA12_1_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_1_VOSEL_ADDR 0x1f66 +#define MT6363_RG_VA12_1_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_2_VOCAL_ADDR 0x1f69 +#define MT6363_RG_VA12_2_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VA12_2_VOSEL_ADDR 0x1f6a +#define MT6363_RG_VA12_2_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF12_VOCAL_ADDR 0x1f6d +#define MT6363_RG_VRF12_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF12_VOSEL_ADDR 0x1f6e +#define MT6363_RG_VRF12_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF13_VOCAL_ADDR 0x1f71 +#define MT6363_RG_VRF13_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF13_VOSEL_ADDR 0x1f72 +#define MT6363_RG_VRF13_VOSEL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF09_VOCAL_ADDR 0x1f78 +#define MT6363_RG_VRF09_VOCAL_MASK GENMASK(3, 0) +#define MT6363_RG_VRF09_VOSEL_ADDR 0x1f79 +#define MT6363_RG_VRF09_VOSEL_MASK GENMASK(3, 0) +#define MT6363_ISINK_EN_CTRL0 0x21db +#define MT6363_ISINK_CTRL0_MASK GENMASK(7, 0) +#define MT6363_ISINK_EN_CTRL1 0x21dc +#define MT6363_ISINK_CTRL1_MASK GENMASK(7, 4) + +#endif /* __LINUX_REGULATOR_MT6363_H */ -- cgit v1.2.3 From fdb9aed869f34d776298b3a8197909eb820e4d0d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:38 +0900 Subject: block: introduce disk_report_zone() Commit b76b840fd933 ("dm: Fix dm-zoned-reclaim zone write pointer alignment") introduced an indirect call for the callback function of a report zones executed with blkdev_report_zones(). This is necessary so that the function disk_zone_wplug_sync_wp_offset() can be called to refresh a zone write plug zone write pointer offset after a write error. However, this solution makes following the path of a zone information harder to understand. 
Clean this up by introducing the new blk_report_zones_args structure to define a zone report callback and its private data and introduce the helper function disk_report_zone() which calls both disk_zone_wplug_sync_wp_offset() and the zone report user callback function for all zones of a zone report. This helper function must be called by all block device drivers that implement the report zones block operation in order to correctly report a zone information. All block device drivers supporting the report_zones block operation are updated to use this new scheme. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++++++- include/linux/device-mapper.h | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99be263b31ab..2f75fb15f55f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -38,6 +38,7 @@ struct blk_flush_queue; struct kiocb; struct pr_ops; struct rq_qos; +struct blk_report_zones_args; struct blk_queue_stats; struct blk_stat_callback; struct blk_crypto_profile; @@ -432,6 +433,9 @@ struct queue_limits { typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); +int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, + unsigned int idx, struct blk_report_zones_args *args); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); @@ -1662,7 +1666,8 @@ struct block_device_operations { /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, - unsigned int nr_zones, 
report_zones_cb cb, void *data); + unsigned int nr_zones, + struct blk_report_zones_args *args); char *(*devnode)(struct gendisk *disk, umode_t *mode); /* returns the length of the identifier or a negative errno: */ int (*get_unique_id)(struct gendisk *disk, u8 id[16], diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 84fdc3a6a19a..38f625af6ab4 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -538,12 +538,18 @@ void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone); #ifdef CONFIG_BLK_DEV_ZONED struct dm_report_zones_args { struct dm_target *tgt; + struct gendisk *disk; sector_t next_sector; - void *orig_data; - report_zones_cb orig_cb; unsigned int zone_idx; + /* for block layer ->report_zones */ + struct blk_report_zones_args *rep_args; + + /* for internal users */ + report_zones_cb cb; + void *data; + /* must be filled by ->report_zones before calling dm_report_zones_cb */ sector_t start; }; -- cgit v1.2.3 From 6e945ffb6555705cf20b1fcdc21a139911562995 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:40 +0900 Subject: block: use zone condition to determine conventional zones The conv_zones_bitmap field of struct gendisk is used to define a bitmap to identify the conventional zones of a zoned block device. The bit for a zone is set in this bitmap if the zone is a conventional one, that is, if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such zone, this always corresponds to the zone condition BLK_ZONE_COND_NOT_WP. In other words, conv_zones_bitmap tracks a single condition of the zones of a zoned block device. In preparation for tracking more zone conditions, change conv_zones_bitmap into an array of zone conditions, using 1 byte per zone. This increases the memory usage from 1 bit per zone to 1 byte per zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256 MiB zones. 
This is a trade-off to allow fast cached report zones later on top of this change. Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add a blk_revalidate_zone_cond() function to initialize the zones_cond array of a disk during device scan and to update it on device revalidation. Move the allocation of the zones_cond array to disk_revalidate_zone_resources(), making sure that this array is always allocated, even for devices that do not need zone write plugs (zone resources), to ensure that bdev_zone_is_seq() can be re-implemented to use the zone condition array in place of the conv zones bitmap. Finally, the function bdev_zone_is_seq() is rewritten to use a test on the condition of the target zone. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f75fb15f55f..53bcfbc2f68f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,7 +196,7 @@ struct gendisk { unsigned int nr_zones; unsigned int zone_capacity; unsigned int last_zone_capacity; - unsigned long __rcu *conv_zones_bitmap; + u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; @@ -925,12 +925,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev, { return disk_zone_capacity(bdev->bd_disk, pos); } + +bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector); + #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } +static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) +{ + return false; +} + static inline bool 
bio_needs_zone_write_plugging(struct bio *bio) { return false; @@ -1533,33 +1541,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev, return bdev_is_zone_start(bdev, sector); } -/** - * bdev_zone_is_seq - check if a sector belongs to a sequential write zone - * @bdev: block device to check - * @sector: sector number - * - * Check if @sector on @bdev is contained in a sequential write required zone. - */ -static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) -{ - bool is_seq = false; - -#if IS_ENABLED(CONFIG_BLK_DEV_ZONED) - if (bdev_is_zoned(bdev)) { - struct gendisk *disk = bdev->bd_disk; - unsigned long *bitmap; - - rcu_read_lock(); - bitmap = rcu_dereference(disk->conv_zones_bitmap); - is_seq = !bitmap || - !test_bit(disk_zone_no(disk, sector), bitmap); - rcu_read_unlock(); - } -#endif - - return is_seq; -} - int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask); -- cgit v1.2.3 From 0bf0e2e4666822b62d7ad6473dc37fd6b377b5f1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:41 +0900 Subject: block: track zone conditions The function blk_revalidate_zone_cond() already caches the condition of all zones of a zoned block device in the zones_cond array of a gendisk. However, the zone conditions are updated only when the device is scanned or revalidated. Implement tracking of the runtime changes to zone conditions using the new cond field in struct blk_zone_wplug. The size of this structure remains 112 Bytes as the new field replaces the 4 Bytes padding at the end of the structure. Because zones that do not have a zone write plug can be in the empty, implicit open, explicit open or full condition, the zones_cond array of a disk is used to track the conditions of zones that do not have a zone write plug.
The condition of such zone is updated in the disk zones_cond array when a zone reset, reset all or finish operation is executed, and also when a zone write plug is removed from the disk hash table when the zone becomes full. Since a device may automatically close an implicitly open zone when writing to an empty or closed zone, if the total number of open zones has reached the device limit, the BLK_ZONE_COND_IMP_OPEN and BLK_ZONE_COND_CLOSED zone conditions cannot be precisely tracked. To overcome this, the zone condition BLK_ZONE_COND_ACTIVE is introduced to represent a zone that has the condition BLK_ZONE_COND_IMP_OPEN, BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED. This follows the definition of an active zone as defined in the NVMe Zoned Namespace specifications. As such, for a zoned device that has a limit on the maximum number of open zones, we will never have more zones in the BLK_ZONE_COND_ACTIVE condition than the device limit. This is compatible with the SCSI ZBC and ATA ZAC specifications for SMR HDDs as these devices do not have a limit on the number of active zones. The function disk_zone_wplug_set_wp_offset() is modified to use the new helper disk_zone_wplug_update_cond() to update a zone write plug condition whenever a zone write plug write offset is updated on submission or merging of write BIOs to a zone. The functions blk_zone_reset_bio_endio(), blk_zone_reset_all_bio_endio() and blk_zone_finish_bio_endio() are modified to update the condition of the zones targeted by reset, reset_all and finish operations, either using disk_zone_wplug_set_wp_offset() for zones that have a zone write plug, or using the disk_zone_set_cond() helper to update the zones_cond array of the disk for zones that do not have a zone write plug. When a zone write plug is removed from the disk hash table (when the zone becomes empty or full), the condition of struct blk_zone_wplug is used to update the disk zones_cond array.
Conversely, when a zone write plug is added to the disk hash table, the zones_cond array is used to initialize the zone write plug condition. Signed-off-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index f85743ef6e7d..5c7662971414 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -48,6 +48,8 @@ enum blk_zone_type { * FINISH ZONE command. * @BLK_ZONE_COND_READONLY: The zone is read-only. * @BLK_ZONE_COND_OFFLINE: The zone is offline (sectors cannot be read/written). + * @BLK_ZONE_COND_ACTIVE: The zone is either implicitly open, explicitly open, + * or closed. * * The Zone Condition state machine in the ZBC/ZAC standards maps the above * deinitions as: @@ -61,6 +63,13 @@ enum blk_zone_type { * * Conditions 0x5 to 0xC are reserved by the current ZBC/ZAC spec and should * be considered invalid. + * + * The condition BLK_ZONE_COND_ACTIVE is used only with cached zone reports. + * It is used to report any of the BLK_ZONE_COND_IMP_OPEN, + * BLK_ZONE_COND_EXP_OPEN and BLK_ZONE_COND_CLOSED conditions. Conversely, a + * regular zone report will never report a zone condition using + * BLK_ZONE_COND_ACTIVE and instead use the conditions BLK_ZONE_COND_IMP_OPEN, + * BLK_ZONE_COND_EXP_OPEN or BLK_ZONE_COND_CLOSED as reported by the device. 
*/ enum blk_zone_cond { BLK_ZONE_COND_NOT_WP = 0x0, @@ -71,6 +80,8 @@ enum blk_zone_cond { BLK_ZONE_COND_READONLY = 0xD, BLK_ZONE_COND_FULL = 0xE, BLK_ZONE_COND_OFFLINE = 0xF, + + BLK_ZONE_COND_ACTIVE = 0xFF, }; /** -- cgit v1.2.3 From f2284eec5053df271c78e687672247922bcee881 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:43 +0900 Subject: block: introduce blkdev_get_zone_info() Introduce the function blkdev_get_zone_info() to obtain information for a single zone from cached zone data, that is, either from the zone write plug for the target zone if it exists or from the disk zones_cond array otherwise. Since sequential zones that do not have a zone write plug are either full, empty or in a bad state (read-only or offline), the zone write pointer can be inferred from the zone condition cached in the disk zones_cond array. For sequential zones that have a zone write plug, the zone condition and zone write pointer are obtained from the condition and write pointer offset managed with the zone write plug. This allows obtaining the information for a zone much more quickly than having to execute a report zones command on the device. blkdev_get_zone_info() falls back to using a regular zone report if the target zone is flagged as needing an update with the BLK_ZONE_WPLUG_NEED_WP_UPDATE flag, or if the target device does not use zone write plugs (i.e. a device mapper device). In this case, the new function blkdev_report_zone_fallback() is used and the zone condition is reported consistently with the cached report, that is, the BLK_ZONE_COND_ACTIVE condition is used in place of the implicit open, explicit open and closed conditions. This is achieved by adding the .report_active field to struct blk_report_zones_args and by having disk_report_zone() set the correct zone condition if .report_active is true. In preparation for using blkdev_get_zone_info() in upcoming file systems changes, also export this function as a GPL symbol. 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 53bcfbc2f68f..03a594b4dfbc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -436,6 +436,9 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, unsigned int idx, struct blk_report_zones_args *args); +int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, + struct blk_zone *zone); + #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -- cgit v1.2.3 From 31f0656a4ab712edf2888eabcc0664197a4a938e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:44 +0900 Subject: block: introduce blkdev_report_zones_cached() Introduce the function blkdev_report_zones_cached() to provide a fast report zone built using the blkdev_get_zone_info() function, which gets zone information from a disk zones_cond array or zone write plugs. For a large capacity SMR drive, such fast report zone can be completed in a few milliseconds compared to several seconds completion times when the report zone is obtained from the device. The zone report is built in the same manner as with the regular blkdev_report_zones() function, that is, the first zone reported is the one containing the specified start sector and the report is limited to the specified number of zones (nr_zones argument). The information for each zone in the report is obtained using blkdev_get_zone_info(). 
For zoned devices that do not use zone write plug resources, using blkdev_get_zone_info() is inefficient as the zone report would be very slow, generated one zone at a time. To avoid this, blkdev_report_zones_cached() falls back to calling blkdev_do_report_zones() to execute a regular zone report. In this case, the .report_active field of struct blk_report_zones_args is set to true to report zone conditions using the BLK_ZONE_COND_ACTIVE condition in place of the implicit open, explicit open and closed conditions. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 03a594b4dfbc..f0ab02e0a673 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -442,6 +442,8 @@ int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk); -- cgit v1.2.3 From b30ffcdc0c15a88f8866529d3532454e02571221 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 5 Nov 2025 06:22:45 +0900 Subject: block: introduce BLKREPORTZONESV2 ioctl Introduce the new BLKREPORTZONESV2 ioctl command to allow user applications access to the fast zone report implemented by blkdev_report_zones_cached(). This new ioctl is defined as number 142 and is documented in include/uapi/linux/fs.h. 
Unlike the existing BLKREPORTZONES ioctl, this new ioctl uses the flags field of struct blk_zone_report also as an input. If the user sets the BLK_ZONE_REP_CACHED flag as an input, then blkdev_report_zones_cached() is used to generate the zone report using cached zone information. If this flag is not set, then BLKREPORTZONESV2 behaves in the same manner as BLKREPORTZONES and the zone report is generated by accessing the zoned device. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/uapi/linux/blkzoned.h | 35 ++++++++++++++++++++++++++++++----- include/uapi/linux/fs.h | 2 +- 2 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 5c7662971414..e33f02703350 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -87,10 +87,20 @@ enum blk_zone_cond { /** * enum blk_zone_report_flags - Feature flags of reported zone descriptors. * - * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + * @BLK_ZONE_REP_CAPACITY: Output only. Indicates that zone descriptors in a + * zone report have a valid capacity field. + * @BLK_ZONE_REP_CACHED: Input only. Indicates that the zone report should be + * generated using cached zone information. In this case, + * the implicit open, explicit open and closed zone + * conditions are all reported with the + * BLK_ZONE_COND_ACTIVE condition. */ enum blk_zone_report_flags { - BLK_ZONE_REP_CAPACITY = (1 << 0), + /* Output flags */ + BLK_ZONE_REP_CAPACITY = (1U << 0), + + /* Input flags */ + BLK_ZONE_REP_CACHED = (1U << 31), }; /** @@ -133,6 +143,10 @@ struct blk_zone { * @sector: starting sector of report * @nr_zones: IN maximum / OUT actual * @flags: one or more flags as defined by enum blk_zone_report_flags. + * @flags: one or more flags as defined by enum blk_zone_report_flags. 
+ * With BLKREPORTZONE, this field is ignored as an input and is valid + * only as an output. Using BLKREPORTZONEV2, this field is used as both + * input and output. * @zones: Space to hold @nr_zones @zones entries on reply. * * The array of at most @nr_zones must follow this structure in memory. @@ -159,9 +173,19 @@ struct blk_zone_range { /** * Zoned block device ioctl's: * - * @BLKREPORTZONE: Get zone information. Takes a zone report as argument. - * The zone report will start from the zone containing the - * sector specified in the report request structure. + * @BLKREPORTZONE: Get zone information from a zoned device. Takes a zone report + * as argument. The zone report will start from the zone + * containing the sector specified in struct blk_zone_report. + * The flags field of struct blk_zone_report is used as an + * output only and ignored as an input. + * DEPRECATED, use BLKREPORTZONEV2 instead. + * @BLKREPORTZONEV2: Same as @BLKREPORTZONE but uses the flags field of + * struct blk_zone_report as an input, allowing to get a zone + * report using cached zone information if the flag + * BLK_ZONE_REP_CACHED is set. In such case, the zone report + * may include zones with the condition @BLK_ZONE_COND_ACTIVE + * (c.f. the description of this condition above for more + * details). * @BLKRESETZONE: Reset the write pointer of the zones in the specified * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. 
@@ -180,5 +204,6 @@ struct blk_zone_range { #define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) #define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) #define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) +#define BLKREPORTZONEV2 _IOWR(0x12, 142, struct blk_zone_report) #endif /* _UAPI_BLKZONED_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 957ce3343a4f..66ca526cf786 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -298,7 +298,7 @@ struct file_attr { #define BLKROTATIONAL _IO(0x12,126) #define BLKZEROOUT _IO(0x12,127) #define BLKGETDISKSEQ _IOR(0x12,128,__u64) -/* 130-136 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ +/* 130-136 and 142 are used by zoned block device ioctls (uapi/linux/blkzoned.h) */ /* 137-141 are used by blk-crypto ioctls (uapi/linux/blk-crypto.h) */ #define BLKTRACESETUP2 _IOWR(0x12, 142, struct blk_user_trace_setup2) -- cgit v1.2.3 From 0a0da3f92118950862700497bc7917f0fbf6a6e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:24 -0700 Subject: KVM: Make support for kvm_arch_vcpu_async_ioctl() mandatory Implement kvm_arch_vcpu_async_ioctl() "natively" in x86 and arm64 instead of relying on an #ifdef'd stub, and drop HAVE_KVM_VCPU_ASYNC_IOCTL in anticipation of using the API on x86. Once x86 uses the API, providing a stub for one architecture and having all other architectures opt-in requires more code than simply implementing the API in the lone holdout. Eliminating the Kconfig will also reduce churn if the API is renamed in the future (spoiler alert). No functional change intended. 
Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-2-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..7186b2ae4b57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2437,18 +2437,8 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE -- cgit v1.2.3 From 50efc2340a598da4bafa40bc01e18f8cf73a4ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:25 -0700 Subject: KVM: Rename kvm_arch_vcpu_async_ioctl() to kvm_arch_vcpu_unlocked_ioctl() Rename the "async" ioctl API to "unlocked" so that upcoming usage in x86's TDX code doesn't result in a massive misnomer. To avoid having to retry SEAMCALLs, TDX needs to acquire kvm->lock *and* all vcpu->mutex locks, and acquiring all of those locks after/inside the current vCPU's mutex is a non-starter. However, TDX also needs to acquire the vCPU's mutex and load the vCPU, i.e. the handling is very much not async to the vCPU. No functional change intended. 
Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-3-seanjc@google.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7186b2ae4b57..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,8 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE -- cgit v1.2.3 From e0b62a4dee24e9176f2c4be52a1b47fe1d97c560 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 15:46:33 +0100 Subject: fs: add fs/super_types.h header Split out super block associated structures into a separate header. 
Link: https://patch.msgid.link/20251104-work-fs-header-v1-2-fb39a2efe39e@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 308 +------------------------------------ include/linux/fs/super_types.h | 335 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 336 insertions(+), 307 deletions(-) create mode 100644 include/linux/fs/super_types.h (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3c971ddace41..ae71c359077a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include #include #include #include @@ -11,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include @@ -52,11 +51,9 @@ #include #include -struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; -struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; @@ -70,12 +67,8 @@ struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; -struct workqueue_struct; struct iov_iter; -struct fscrypt_operations; -struct fsverity_operations; struct fsnotify_mark_connector; -struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct file_kattr; @@ -298,11 +291,6 @@ struct iattr { struct file *ia_file; }; -/* - * Includes for diskquotas. - */ -#include - /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow @@ -1347,49 +1335,6 @@ extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); -/* - * sb->s_flags. Note that these mirror the equivalent MS_* flags where - * represented in both. 
- */ -#define SB_RDONLY BIT(0) /* Mount read-only */ -#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ -#define SB_NODEV BIT(2) /* Disallow access to device special files */ -#define SB_NOEXEC BIT(3) /* Disallow program execution */ -#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ -#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ -#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ -#define SB_NOATIME BIT(10) /* Do not update access times. */ -#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ -#define SB_SILENT BIT(15) -#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ -#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ -#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ -#define SB_I_VERSION BIT(23) /* Update inode I_version field */ -#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ - -/* These sb flags are internal to the kernel */ -#define SB_DEAD BIT(21) -#define SB_DYING BIT(24) -#define SB_FORCE BIT(27) -#define SB_NOSEC BIT(28) -#define SB_BORN BIT(29) -#define SB_ACTIVE BIT(30) -#define SB_NOUSER BIT(31) - -/* These flags relate to encoding and casefolding */ -#define SB_ENC_STRICT_MODE_FL (1 << 0) -#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) - -#define sb_has_strict_encoding(sb) \ - (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) - -#if IS_ENABLED(CONFIG_UNICODE) -#define sb_no_casefold_compat_fallback(sb) \ - (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) -#else -#define sb_no_casefold_compat_fallback(sb) (1) -#endif - /* * Umount options */ @@ -1400,191 +1345,6 @@ extern int send_sigurg(struct file *file); #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ -/* sb->s_iflags */ -#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ -#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ -#define 
SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ -#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ - -/* sb->s_iflags to limit user namespace mounts */ -#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ -#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 -#define SB_I_UNTRUSTED_MOUNTER 0x00000040 -#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 - -#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ -#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ -#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ -#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ -#define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ -#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ -#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ - -/* Possible states of 'frozen' field */ -enum { - SB_UNFROZEN = 0, /* FS is unfrozen */ - SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ - SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ - SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop - * internal threads if needed) */ - SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ -}; - -#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) - -struct sb_writers { - unsigned short frozen; /* Is sb frozen? */ - int freeze_kcount; /* How many kernel freeze requests? */ - int freeze_ucount; /* How many userspace freeze requests? 
*/ - const void *freeze_owner; /* Owner of the freeze */ - struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; -}; - -struct mount; - -struct super_block { - struct list_head s_list; /* Keep this first */ - dev_t s_dev; /* search index; _not_ kdev_t */ - unsigned char s_blocksize_bits; - unsigned long s_blocksize; - loff_t s_maxbytes; /* Max file size */ - struct file_system_type *s_type; - const struct super_operations *s_op; - const struct dquot_operations *dq_op; - const struct quotactl_ops *s_qcop; - const struct export_operations *s_export_op; - unsigned long s_flags; - unsigned long s_iflags; /* internal SB_I_* flags */ - unsigned long s_magic; - struct dentry *s_root; - struct rw_semaphore s_umount; - int s_count; - atomic_t s_active; -#ifdef CONFIG_SECURITY - void *s_security; -#endif - const struct xattr_handler * const *s_xattr; -#ifdef CONFIG_FS_ENCRYPTION - const struct fscrypt_operations *s_cop; - struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ -#endif -#ifdef CONFIG_FS_VERITY - const struct fsverity_operations *s_vop; -#endif -#if IS_ENABLED(CONFIG_UNICODE) - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ - struct mount *s_mounts; /* list of mounts; _not_ for fs use */ - struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ - struct file *s_bdev_file; - struct backing_dev_info *s_bdi; - struct mtd_info *s_mtd; - struct hlist_node s_instances; - unsigned int s_quota_types; /* Bitmask of supported quota types */ - struct quota_info s_dquot; /* Diskquota specific options */ - - struct sb_writers s_writers; - - /* - * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and - * s_fsnotify_info together for cache efficiency. They are frequently - * accessed and rarely modified. 
- */ - void *s_fs_info; /* Filesystem private info */ - - /* Granularity of c/m/atime in ns (cannot be worse than a second) */ - u32 s_time_gran; - /* Time limits for c/m/atime in seconds */ - time64_t s_time_min; - time64_t s_time_max; -#ifdef CONFIG_FSNOTIFY - u32 s_fsnotify_mask; - struct fsnotify_sb_info *s_fsnotify_info; -#endif - - /* - * q: why are s_id and s_sysfs_name not the same? both are human - * readable strings that identify the filesystem - * a: s_id is allowed to change at runtime; it's used in log messages, - * and we want to when a device starts out as single device (s_id is dev - * name) but then a device is hot added and we have to switch to - * identifying it by UUID - * but s_sysfs_name is a handle for programmatic access, and can't - * change at runtime - */ - char s_id[32]; /* Informational name */ - uuid_t s_uuid; /* UUID */ - u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ - - /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ - char s_sysfs_name[UUID_STRING_LEN + 1]; - - unsigned int s_max_links; - unsigned int s_d_flags; /* default d_flags for dentries */ - - /* - * The next field is for VFS *only*. No filesystems have any business - * even looking at it. You had been warned. - */ - struct mutex s_vfs_rename_mutex; /* Kludge */ - - /* - * Filesystem subtype. 
If non-empty the filesystem type field - * in /proc/mounts will be "type.subtype" - */ - const char *s_subtype; - - const struct dentry_operations *__s_d_op; /* default d_op for dentries */ - - struct shrinker *s_shrink; /* per-sb shrinker handle */ - - /* Number of inodes with nlink == 0 but still referenced */ - atomic_long_t s_remove_count; - - /* Read-only state of the superblock is being changed */ - int s_readonly_remount; - - /* per-sb errseq_t for reporting writeback errors via syncfs */ - errseq_t s_wb_err; - - /* AIO completions deferred from interrupt context */ - struct workqueue_struct *s_dio_done_wq; - struct hlist_head s_pins; - - /* - * Owning user namespace and default context in which to - * interpret filesystem uids, gids, quotas, device nodes, - * xattrs and security labels. - */ - struct user_namespace *s_user_ns; - - /* - * The list_lru structure is essentially just a pointer to a table - * of per-node lru lists, each of which has its own spinlock. - * There is no need to put them into separate cachelines. 
- */ - struct list_lru s_dentry_lru; - struct list_lru s_inode_lru; - struct rcu_head rcu; - struct work_struct destroy_work; - - struct mutex s_sync_lock; /* sync serialisation lock */ - - /* - * Indicates how deep in a filesystem stack this SB is - */ - int s_stack_depth; - - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ - - spinlock_t s_inode_wblist_lock; - struct list_head s_inodes_wb; /* writeback inodes */ -} __randomize_layout; - static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; @@ -2431,72 +2191,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); -/** - * enum freeze_holder - holder of the freeze - * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem - * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem - * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed - * @FREEZE_EXCL: a freeze that can only be undone by the owner - * - * Indicate who the owner of the freeze or thaw request is and whether - * the freeze needs to be exclusive or can nest. - * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the - * same holder aren't allowed. It is however allowed to hold a single - * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at - * the same time. This is relied upon by some filesystems during online - * repair or similar. 
- */ -enum freeze_holder { - FREEZE_HOLDER_KERNEL = (1U << 0), - FREEZE_HOLDER_USERSPACE = (1U << 1), - FREEZE_MAY_NEST = (1U << 2), - FREEZE_EXCL = (1U << 3), -}; - -struct super_operations { - struct inode *(*alloc_inode)(struct super_block *sb); - void (*destroy_inode)(struct inode *); - void (*free_inode)(struct inode *); - - void (*dirty_inode) (struct inode *, int flags); - int (*write_inode) (struct inode *, struct writeback_control *wbc); - int (*drop_inode) (struct inode *); - void (*evict_inode) (struct inode *); - void (*put_super) (struct super_block *); - int (*sync_fs)(struct super_block *sb, int wait); - int (*freeze_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*freeze_fs) (struct super_block *); - int (*thaw_super) (struct super_block *, enum freeze_holder who, const void *owner); - int (*unfreeze_fs) (struct super_block *); - int (*statfs) (struct dentry *, struct kstatfs *); - int (*remount_fs) (struct super_block *, int *, char *); - void (*umount_begin) (struct super_block *); - - int (*show_options)(struct seq_file *, struct dentry *); - int (*show_devname)(struct seq_file *, struct dentry *); - int (*show_path)(struct seq_file *, struct dentry *); - int (*show_stats)(struct seq_file *, struct dentry *); -#ifdef CONFIG_QUOTA - ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); - ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); - struct dquot __rcu **(*get_dquots)(struct inode *); -#endif - long (*nr_cached_objects)(struct super_block *, - struct shrink_control *); - long (*free_cached_objects)(struct super_block *, - struct shrink_control *); - /* - * If a filesystem can support graceful removal of a device and - * continue read-write operations, implement this callback. - * - * Return 0 if the filesystem can continue read-write. - * Non-zero return value or no such callback means the fs will be shutdown - * as usual. 
- */ - int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); - void (*shutdown)(struct super_block *sb); -}; - /* * Inode flags - they have no relation to superblock flags now */ diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h new file mode 100644 index 000000000000..45cfd45b9fe0 --- /dev/null +++ b/include/linux/fs/super_types.h @@ -0,0 +1,335 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_TYPES_H +#define _LINUX_FS_SUPER_TYPES_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct backing_dev_info; +struct block_device; +struct dentry; +struct dentry_operations; +struct dquot_operations; +struct export_operations; +struct file; +struct file_system_type; +struct fscrypt_operations; +struct fsnotify_sb_info; +struct fsverity_operations; +struct kstatfs; +struct mount; +struct mtd_info; +struct quotactl_ops; +struct shrinker; +struct unicode_map; +struct user_namespace; +struct workqueue_struct; +struct writeback_control; +struct xattr_handler; + +extern struct super_block *blockdev_superblock; + +/* Possible states of 'frozen' field */ +enum { + SB_UNFROZEN = 0, /* FS is unfrozen */ + SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ + SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ + SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop internal threads if needed) */ + SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ +}; + +#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) + +struct sb_writers { + unsigned short frozen; /* Is sb frozen? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? 
*/ + const void *freeze_owner; /* Owner of the freeze */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; +}; + +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * @FREEZE_EXCL: a freeze that can only be undone by the owner + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ +enum freeze_holder { + FREEZE_HOLDER_KERNEL = (1U << 0), + FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), + FREEZE_EXCL = (1U << 3), +}; + +struct super_operations { + struct inode *(*alloc_inode)(struct super_block *sb); + void (*destroy_inode)(struct inode *inode); + void (*free_inode)(struct inode *inode); + void (*dirty_inode)(struct inode *inode, int flags); + int (*write_inode)(struct inode *inode, struct writeback_control *wbc); + int (*drop_inode)(struct inode *inode); + void (*evict_inode)(struct inode *inode); + void (*put_super)(struct super_block *sb); + int (*sync_fs)(struct super_block *sb, int wait); + int (*freeze_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*freeze_fs)(struct super_block *sb); + int (*thaw_super)(struct super_block *sb, enum freeze_holder who, + const void *owner); + int (*unfreeze_fs)(struct super_block *sb); + int (*statfs)(struct dentry *dentry, struct kstatfs *kstatfs); + int (*remount_fs) (struct super_block *, int *, char *); + void (*umount_begin)(struct super_block *sb); + + int 
(*show_options)(struct seq_file *seq, struct dentry *dentry); + int (*show_devname)(struct seq_file *seq, struct dentry *dentry); + int (*show_path)(struct seq_file *seq, struct dentry *dentry); + int (*show_stats)(struct seq_file *seq, struct dentry *dentry); +#ifdef CONFIG_QUOTA + ssize_t (*quota_read)(struct super_block *sb, int type, char *data, + size_t len, loff_t off); + ssize_t (*quota_write)(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); + struct dquot __rcu **(*get_dquots)(struct inode *inode); +#endif + long (*nr_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + long (*free_cached_objects)(struct super_block *sb, + struct shrink_control *sc); + /* + * If a filesystem can support graceful removal of a device and + * continue read-write operations, implement this callback. + * + * Return 0 if the filesystem can continue read-write. + * Non-zero return value or no such callback means the fs will be shutdown + * as usual. + */ + int (*remove_bdev)(struct super_block *sb, struct block_device *bdev); + void (*shutdown)(struct super_block *sb); +}; + +struct super_block { + struct list_head s_list; /* Keep this first */ + dev_t s_dev; /* search index; _not_ kdev_t */ + unsigned char s_blocksize_bits; + unsigned long s_blocksize; + loff_t s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + const struct super_operations *s_op; + const struct dquot_operations *dq_op; + const struct quotactl_ops *s_qcop; + const struct export_operations *s_export_op; + unsigned long s_flags; + unsigned long s_iflags; /* internal SB_I_* flags */ + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + int s_count; + atomic_t s_active; +#ifdef CONFIG_SECURITY + void *s_security; +#endif + const struct xattr_handler *const *s_xattr; +#ifdef CONFIG_FS_ENCRYPTION + const struct fscrypt_operations *s_cop; + struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ +#endif 
+#ifdef CONFIG_FS_VERITY + const struct fsverity_operations *s_vop; +#endif +#if IS_ENABLED(CONFIG_UNICODE) + struct unicode_map *s_encoding; + __u16 s_encoding_flags; +#endif + struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ + struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ + struct file *s_bdev_file; + struct backing_dev_info *s_bdi; + struct mtd_info *s_mtd; + struct hlist_node s_instances; + unsigned int s_quota_types; /* Bitmask of supported quota types */ + struct quota_info s_dquot; /* Diskquota specific options */ + + struct sb_writers s_writers; + + /* + * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and + * s_fsnotify_info together for cache efficiency. They are frequently + * accessed and rarely modified. + */ + void *s_fs_info; /* Filesystem private info */ + + /* Granularity of c/m/atime in ns (cannot be worse than a second) */ + u32 s_time_gran; + /* Time limits for c/m/atime in seconds */ + time64_t s_time_min; + time64_t s_time_max; +#ifdef CONFIG_FSNOTIFY + u32 s_fsnotify_mask; + struct fsnotify_sb_info *s_fsnotify_info; +#endif + + /* + * q: why are s_id and s_sysfs_name not the same? 
both are human + * readable strings that identify the filesystem + * a: s_id is allowed to change at runtime; it's used in log messages, + * and we want to when a device starts out as single device (s_id is dev + * name) but then a device is hot added and we have to switch to + * identifying it by UUID + * but s_sysfs_name is a handle for programmatic access, and can't + * change at runtime + */ + char s_id[32]; /* Informational name */ + uuid_t s_uuid; /* UUID */ + u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ + + /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ + char s_sysfs_name[UUID_STRING_LEN + 1]; + + unsigned int s_max_links; + unsigned int s_d_flags; /* default d_flags for dentries */ + + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct mutex s_vfs_rename_mutex; /* Kludge */ + + /* + * Filesystem subtype. If non-empty the filesystem type field + * in /proc/mounts will be "type.subtype" + */ + const char *s_subtype; + + const struct dentry_operations *__s_d_op; /* default d_op for dentries */ + + struct shrinker *s_shrink; /* per-sb shrinker handle */ + + /* Number of inodes with nlink == 0 but still referenced */ + atomic_long_t s_remove_count; + + /* Read-only state of the superblock is being changed */ + int s_readonly_remount; + + /* per-sb errseq_t for reporting writeback errors via syncfs */ + errseq_t s_wb_err; + + /* AIO completions deferred from interrupt context */ + struct workqueue_struct *s_dio_done_wq; + struct hlist_head s_pins; + + /* + * Owning user namespace and default context in which to + * interpret filesystem uids, gids, quotas, device nodes, + * xattrs and security labels. + */ + struct user_namespace *s_user_ns; + + /* + * The list_lru structure is essentially just a pointer to a table + * of per-node lru lists, each of which has its own spinlock. + * There is no need to put them into separate cachelines. 
+ */ + struct list_lru s_dentry_lru; + struct list_lru s_inode_lru; + struct rcu_head rcu; + struct work_struct destroy_work; + + struct mutex s_sync_lock; /* sync serialisation lock */ + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ + + spinlock_t s_inode_wblist_lock; + struct list_head s_inodes_wb; /* writeback inodes */ +} __randomize_layout; + +/* + * sb->s_flags. Note that these mirror the equivalent MS_* flags where + * represented in both. + */ +#define SB_RDONLY BIT(0) /* Mount read-only */ +#define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ +#define SB_NODEV BIT(2) /* Disallow access to device special files */ +#define SB_NOEXEC BIT(3) /* Disallow program execution */ +#define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ +#define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ +#define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ +#define SB_NOATIME BIT(10) /* Do not update access times. 
*/ +#define SB_NODIRATIME BIT(11) /* Do not update directory access times */ +#define SB_SILENT BIT(15) +#define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ +#define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ +#define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ +#define SB_I_VERSION BIT(23) /* Update inode I_version field */ +#define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ + +/* These sb flags are internal to the kernel */ +#define SB_DEAD BIT(21) +#define SB_DYING BIT(24) +#define SB_FORCE BIT(27) +#define SB_NOSEC BIT(28) +#define SB_BORN BIT(29) +#define SB_ACTIVE BIT(30) +#define SB_NOUSER BIT(31) + +/* These flags relate to encoding and casefolding */ +#define SB_ENC_STRICT_MODE_FL (1 << 0) +#define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) + +#define sb_has_strict_encoding(sb) \ + (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) + +#if IS_ENABLED(CONFIG_UNICODE) +#define sb_no_casefold_compat_fallback(sb) \ + (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) +#else +#define sb_no_casefold_compat_fallback(sb) (1) +#endif + +/* sb->s_iflags */ +#define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ +#define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ +#define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ +#define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ + +/* sb->s_iflags to limit user namespace mounts */ +#define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ +#define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 +#define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 + +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ +#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ +#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ +#define SB_I_NOUMASK 0x00001000 /* VFS does not 
apply umask */ +#define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ +#define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ + +#endif /* _LINUX_FS_SUPER_TYPES_H */ -- cgit v1.2.3 From f7b3d14165222a3ad9c4d0d31dfa81e396751801 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 15:46:34 +0100 Subject: fs: add fs/super.h header Split out super block associated functions into a separate header. Link: https://patch.msgid.link/20251104-work-fs-header-v1-3-fb39a2efe39e@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 220 +------------------------------------------- include/linux/fs/super.h | 233 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+), 219 deletions(-) create mode 100644 include/linux/fs/super.h (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index ae71c359077a..64af28318fbf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,7 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H -#include +#include #include #include #include @@ -1662,66 +1662,6 @@ struct timespec64 simple_inode_init_ts(struct inode *inode); * Snapshotting support. */ -/* - * These are internal functions, please use sb_start_{write,pagefault,intwrite} - * instead. 
- */ -static inline void __sb_end_write(struct super_block *sb, int level) -{ - percpu_up_read(sb->s_writers.rw_sem + level-1); -} - -static inline void __sb_start_write(struct super_block *sb, int level) -{ - percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); -} - -static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -{ - return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); -} - -#define __sb_writers_acquired(sb, lev) \ - percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) -#define __sb_writers_release(sb, lev) \ - percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) - -/** - * __sb_write_started - check if sb freeze level is held - * @sb: the super we write to - * @level: the freeze level - * - * * > 0 - sb freeze level is held - * * 0 - sb freeze level is not held - * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN - */ -static inline int __sb_write_started(const struct super_block *sb, int level) -{ - return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); -} - -/** - * sb_write_started - check if SB_FREEZE_WRITE is held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. - */ -static inline bool sb_write_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE); -} - -/** - * sb_write_not_started - check if SB_FREEZE_WRITE is not held - * @sb: the super we write to - * - * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. 
- */ -static inline bool sb_write_not_started(const struct super_block *sb) -{ - return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; -} - /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to @@ -1752,118 +1692,6 @@ static inline bool file_write_not_started(const struct file *file) return sb_write_not_started(file_inode(file)->i_sb); } -/** - * sb_end_write - drop write access to a superblock - * @sb: the super we wrote to - * - * Decrement number of writers to the filesystem. Wake up possible waiters - * wanting to freeze the filesystem. - */ -static inline void sb_end_write(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_WRITE); -} - -/** - * sb_end_pagefault - drop write access to a superblock from a page fault - * @sb: the super we wrote to - * - * Decrement number of processes handling write page fault to the filesystem. - * Wake up possible waiters wanting to freeze the filesystem. - */ -static inline void sb_end_pagefault(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_end_intwrite - drop write access to a superblock for internal fs purposes - * @sb: the super we wrote to - * - * Decrement fs-internal number of writers to the filesystem. Wake up possible - * waiters wanting to freeze the filesystem. - */ -static inline void sb_end_intwrite(struct super_block *sb) -{ - __sb_end_write(sb, SB_FREEZE_FS); -} - -/** - * sb_start_write - get write access to a superblock - * @sb: the super we write to - * - * When a process wants to write data or metadata to a file system (i.e. dirty - * a page or an inode), it should embed the operation in a sb_start_write() - - * sb_end_write() pair to get exclusion against file system freezing. This - * function increments number of writers preventing freezing. If the file - * system is already frozen, the function waits until the file system is - * thawed. 
- * - * Since freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. Generally, - * freeze protection should be the outermost lock. In particular, we have: - * - * sb_start_write - * -> i_rwsem (write path, truncate, directory ops, ...) - * -> s_umount (freeze_super, thaw_super) - */ -static inline void sb_start_write(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_WRITE); -} - -static inline bool sb_start_write_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); -} - -/** - * sb_start_pagefault - get write access to a superblock from a page fault - * @sb: the super we write to - * - * When a process starts handling write page fault, it should embed the - * operation into sb_start_pagefault() - sb_end_pagefault() pair to get - * exclusion against file system freezing. This is needed since the page fault - * is going to dirty a page. This function increments number of running page - * faults preventing freezing. If the file system is already frozen, the - * function waits until the file system is thawed. - * - * Since page fault freeze protection behaves as a lock, users have to preserve - * ordering of freeze protection and other filesystem locks. It is advised to - * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault - * handling code implies lock dependency: - * - * mmap_lock - * -> sb_start_pagefault - */ -static inline void sb_start_pagefault(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_PAGEFAULT); -} - -/** - * sb_start_intwrite - get write access to a superblock for internal fs purposes - * @sb: the super we write to - * - * This is the third level of protection against filesystem freezing. It is - * free for use by a filesystem. The only requirement is that it must rank - * below sb_start_pagefault. 
- * - * For example filesystem can call sb_start_intwrite() when starting a - * transaction which somewhat eases handling of freezing for internal sources - * of filesystem changes (internal fs threads, discarding preallocation on file - * close, etc.). - */ -static inline void sb_start_intwrite(struct super_block *sb) -{ - __sb_start_write(sb, SB_FREEZE_FS); -} - -static inline bool sb_start_intwrite_trylock(struct super_block *sb) -{ - return __sb_start_write_trylock(sb, SB_FREEZE_FS); -} - bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); @@ -2233,7 +2061,6 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) -static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) @@ -2467,10 +2294,6 @@ extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); -int freeze_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); -int thaw_super(struct super_block *super, enum freeze_holder who, - const void *freeze_owner); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); @@ -2657,12 +2480,6 @@ extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) -extern struct super_block *blockdev_superblock; -static inline bool sb_is_blkdev_sb(struct super_block *sb) -{ - return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; -} - void emergency_thaw_all(void); extern int 
sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; @@ -3117,9 +2934,6 @@ static inline void remove_inode_hash(struct inode *inode) extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); -extern int sb_set_blocksize(struct super_block *, int); -extern int sb_min_blocksize(struct super_block *, int); - int generic_file_mmap(struct file *, struct vm_area_struct *); int generic_file_mmap_prepare(struct vm_area_desc *desc); int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); @@ -3439,38 +3253,6 @@ static inline bool generic_ci_validate_strict_name(struct inode *dir, } #endif -static inline struct unicode_map *sb_encoding(const struct super_block *sb) -{ -#if IS_ENABLED(CONFIG_UNICODE) - return sb->s_encoding; -#else - return NULL; -#endif -} - -static inline bool sb_has_encoding(const struct super_block *sb) -{ - return !!sb_encoding(sb); -} - -/* - * Compare if two super blocks have the same encoding and flags - */ -static inline bool sb_same_encoding(const struct super_block *sb1, - const struct super_block *sb2) -{ -#if IS_ENABLED(CONFIG_UNICODE) - if (sb1->s_encoding == sb2->s_encoding) - return true; - - return (sb1->s_encoding && sb2->s_encoding && - (sb1->s_encoding->version == sb2->s_encoding->version) && - (sb1->s_encoding_flags == sb2->s_encoding_flags)); -#else - return true; -#endif -} - int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h new file mode 100644 index 000000000000..c0d22b12c1c9 --- /dev/null +++ b/include/linux/fs/super.h @@ -0,0 +1,233 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FS_SUPER_H +#define _LINUX_FS_SUPER_H + +#include +#include + +/* + * These are internal functions, please use sb_start_{write,pagefault,intwrite} + * instead. 
+ */ +static inline void __sb_end_write(struct super_block *sb, int level) +{ + percpu_up_read(sb->s_writers.rw_sem + level - 1); +} + +static inline void __sb_start_write(struct super_block *sb, int level) +{ + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); +} + +static inline bool __sb_start_write_trylock(struct super_block *sb, int level) +{ + return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); +} + +#define __sb_writers_acquired(sb, lev) \ + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev) - 1], 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev) - 1], _THIS_IP_) + +/** + * __sb_write_started - check if sb freeze level is held + * @sb: the super we write to + * @level: the freeze level + * + * * > 0 - sb freeze level is held + * * 0 - sb freeze level is not held + * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN + */ +static inline int __sb_write_started(const struct super_block *sb, int level) +{ + return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); +} + +/** + * sb_write_started - check if SB_FREEZE_WRITE is held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE); +} + +/** + * sb_write_not_started - check if SB_FREEZE_WRITE is not held + * @sb: the super we write to + * + * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. + */ +static inline bool sb_write_not_started(const struct super_block *sb) +{ + return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; +} + +/** + * sb_end_write - drop write access to a superblock + * @sb: the super we wrote to + * + * Decrement number of writers to the filesystem. Wake up possible waiters + * wanting to freeze the filesystem. 
+ */ +static inline void sb_end_write(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_WRITE); +} + +/** + * sb_end_pagefault - drop write access to a superblock from a page fault + * @sb: the super we wrote to + * + * Decrement number of processes handling write page fault to the filesystem. + * Wake up possible waiters wanting to freeze the filesystem. + */ +static inline void sb_end_pagefault(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_end_intwrite - drop write access to a superblock for internal fs purposes + * @sb: the super we wrote to + * + * Decrement fs-internal number of writers to the filesystem. Wake up possible + * waiters wanting to freeze the filesystem. + */ +static inline void sb_end_intwrite(struct super_block *sb) +{ + __sb_end_write(sb, SB_FREEZE_FS); +} + +/** + * sb_start_write - get write access to a superblock + * @sb: the super we write to + * + * When a process wants to write data or metadata to a file system (i.e. dirty + * a page or an inode), it should embed the operation in a sb_start_write() - + * sb_end_write() pair to get exclusion against file system freezing. This + * function increments number of writers preventing freezing. If the file + * system is already frozen, the function waits until the file system is + * thawed. + * + * Since freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. Generally, + * freeze protection should be the outermost lock. In particular, we have: + * + * sb_start_write + * -> i_rwsem (write path, truncate, directory ops, ...) 
+ * -> s_umount (freeze_super, thaw_super) + */ +static inline void sb_start_write(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_WRITE); +} + +static inline bool sb_start_write_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); +} + +/** + * sb_start_pagefault - get write access to a superblock from a page fault + * @sb: the super we write to + * + * When a process starts handling write page fault, it should embed the + * operation into sb_start_pagefault() - sb_end_pagefault() pair to get + * exclusion against file system freezing. This is needed since the page fault + * is going to dirty a page. This function increments number of running page + * faults preventing freezing. If the file system is already frozen, the + * function waits until the file system is thawed. + * + * Since page fault freeze protection behaves as a lock, users have to preserve + * ordering of freeze protection and other filesystem locks. It is advised to + * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault + * handling code implies lock dependency: + * + * mmap_lock + * -> sb_start_pagefault + */ +static inline void sb_start_pagefault(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_PAGEFAULT); +} + +/** + * sb_start_intwrite - get write access to a superblock for internal fs purposes + * @sb: the super we write to + * + * This is the third level of protection against filesystem freezing. It is + * free for use by a filesystem. The only requirement is that it must rank + * below sb_start_pagefault. + * + * For example filesystem can call sb_start_intwrite() when starting a + * transaction which somewhat eases handling of freezing for internal sources + * of filesystem changes (internal fs threads, discarding preallocation on file + * close, etc.). 
+ */ +static inline void sb_start_intwrite(struct super_block *sb) +{ + __sb_start_write(sb, SB_FREEZE_FS); +} + +static inline bool sb_start_intwrite_trylock(struct super_block *sb) +{ + return __sb_start_write_trylock(sb, SB_FREEZE_FS); +} + +static inline bool sb_rdonly(const struct super_block *sb) +{ + return sb->s_flags & SB_RDONLY; +} + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; +} + +#if IS_ENABLED(CONFIG_UNICODE) +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return sb->s_encoding; +} + +/* Compare if two super blocks have the same encoding and flags */ +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + if (sb1->s_encoding == sb2->s_encoding) + return true; + + return (sb1->s_encoding && sb2->s_encoding && + (sb1->s_encoding->version == sb2->s_encoding->version) && + (sb1->s_encoding_flags == sb2->s_encoding_flags)); +} +#else +static inline struct unicode_map *sb_encoding(const struct super_block *sb) +{ + return NULL; +} + +static inline bool sb_same_encoding(const struct super_block *sb1, + const struct super_block *sb2) +{ + return true; +} +#endif + +static inline bool sb_has_encoding(const struct super_block *sb) +{ + return !!sb_encoding(sb); +} + +int sb_set_blocksize(struct super_block *sb, int size); +int sb_min_blocksize(struct super_block *sb, int size); + +int freeze_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); +int thaw_super(struct super_block *super, enum freeze_holder who, + const void *freeze_owner); + +#endif /* _LINUX_FS_SUPER_H */ -- cgit v1.2.3 From 5b8ed52866e3d19e02860c7cf1d6bbbd70b619e9 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Nov 2025 18:04:48 +0100 Subject: fs: inline current_umask() and move it to fs_struct.h There is no good reason to have this as a func call, other than avoiding the churn of adding 
fs_struct.h as needed. Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251104170448.630414-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 -- include/linux/fs_struct.h | 6 ++++++ include/linux/namei.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 64af28318fbf..c0c0095b2b60 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2336,8 +2336,6 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch va_end(args); } -extern int current_umask(void); - extern void ihold(struct inode * inode); extern void iput(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index baf200ab5c77..0070764b790a 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H +#include #include #include #include @@ -41,4 +42,9 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) extern bool current_chrooted(void); +static inline int current_umask(void) +{ + return current->fs->umask; +} + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/include/linux/namei.h b/include/linux/namei.h index fed86221c69c..b0679c7420a8 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -7,6 +7,7 @@ #include #include #include +#include enum { MAX_NESTED_LINKS = 8 }; -- cgit v1.2.3 From 8e4d576ed3ff917eda65b989ba56b02d9a3894f9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Nov 2025 13:12:30 +0100 Subject: fs: add super_write_guard Link: https://patch.msgid.link/20251104-work-guards-v1-1-5108ac78a171@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs/super.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h index 
c0d22b12c1c9..b874105743b3 100644 --- a/include/linux/fs/super.h +++ b/include/linux/fs/super.h @@ -125,6 +125,11 @@ static inline void sb_start_write(struct super_block *sb) __sb_start_write(sb, SB_FREEZE_WRITE); } +DEFINE_GUARD(super_write, + struct super_block *, + sb_start_write(_T), + sb_end_write(_T)) + static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); -- cgit v1.2.3 From 4868d2d52df6f724b01531843805a3b1322e2dd9 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Thu, 30 Oct 2025 09:57:43 +0800 Subject: crypto: hisilicon - qm updates BAR configuration On new platforms greater than QM_HW_V3, the configuration region for the live migration function of the accelerator device is no longer placed in the VF, but is instead placed in the PF. Therefore, the configuration region of the live migration function needs to be opened when the QM driver is loaded. When the QM driver is uninstalled, the driver needs to clear this configuration. Signed-off-by: Longfang Liu Reviewed-by: Shameer Kolothum Acked-by: Herbert Xu Link: https://lore.kernel.org/r/20251030015744.131771-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- include/linux/hisi_acc_qm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h index c4690e365ade..ca1ec437a3ca 100644 --- a/include/linux/hisi_acc_qm.h +++ b/include/linux/hisi_acc_qm.h @@ -99,6 +99,9 @@ #define QM_DEV_ALG_MAX_LEN 256 +#define QM_MIG_REGION_SEL 0x100198 +#define QM_MIG_REGION_EN BIT(0) + /* uacce mode of the driver */ #define UACCE_MODE_NOUACCE 0 /* don't use uacce */ #define UACCE_MODE_SVA 1 /* use uacce sva mode */ -- cgit v1.2.3 From 313a335057f0894e6e59290d4e7fb8b35ec250e6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 3 Nov 2025 15:57:33 +0100 Subject: coredump: mark struct mm_struct as const We don't actually modify it. 
Link: https://patch.msgid.link/20251103-work-creds-guards-prepare_creds-v1-7-b447b82f2c9b@kernel.org Signed-off-by: Christian Brauner --- include/linux/sched/coredump.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index b7fafe999073..624fda17a785 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,7 +8,7 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +static inline unsigned long __mm_flags_get_dumpable(const struct mm_struct *mm) { /* * By convention, dumpable bits are contained in first 32 bits of the -- cgit v1.2.3 From 34dc27f02cb3799d56a99002261e4d091da0cea4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:02 -0800 Subject: srcu: Create an srcu_expedite_current() function This commit creates an srcu_expedite_current() function that expedites the current (and possibly the next) SRCU grace period for the specified srcu_struct structure. This functionality will be inherited by RCU Tasks Trace courtesy of its mapping to SRCU fast. If the current SRCU grace period is already waiting, that wait will complete before the expediting takes effect. If there is no SRCU grace period in flight, this function might well create one. [ paulmck: Apply Zqiang feedback for PREEMPT_RT use. ] Signed-off-by: Paul E. 
McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcutiny.h | 1 + include/linux/srcutree.h | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 51ce25f07930..3bfbd44cb1b3 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -103,6 +103,7 @@ static inline void srcu_barrier(struct srcu_struct *ssp) synchronize_srcu(ssp); } +static inline void srcu_expedite_current(struct srcu_struct *ssp) { } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) #define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 42098e0fa0b7..93ad18acd6d0 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -42,6 +42,8 @@ struct srcu_data { struct timer_list delay_work; /* Delay for CB invoking */ struct work_struct work; /* Context for CB invoking. */ struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct rcu_head srcu_ec_head; /* For srcu_expedite_current() use. */ + int srcu_ec_state; /* State for srcu_expedite_current(). */ struct srcu_node *mynode; /* Leaf srcu_node. */ unsigned long grpmask; /* Mask for leaf srcu_node */ /* ->srcu_data_have_cbs[]. */ @@ -135,6 +137,11 @@ struct srcu_struct { #define SRCU_STATE_SCAN1 1 #define SRCU_STATE_SCAN2 2 +/* Values for srcu_expedite_current() state (->srcu_ec_state). */ +#define SRCU_EC_IDLE 0 +#define SRCU_EC_PENDING 1 +#define SRCU_EC_REPOST 2 + /* * Values for initializing gp sequence fields. Higher values allow wrap arounds to * occur earlier. 
@@ -210,6 +217,7 @@ struct srcu_struct { int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); void srcu_barrier(struct srcu_struct *ssp); +void srcu_expedite_current(struct srcu_struct *ssp); void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf); // Converts a per-CPU pointer to an ->srcu_ctrs[] array element to that -- cgit v1.2.3 From ee90848499b169070dbf85a4276a45ccbb7ff7d3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:04 -0800 Subject: srcu: Create a DEFINE_SRCU_FAST() This commit creates DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST() macros that are similar to DEFINE_SRCU() and DEFINE_STATIC_SRCU(), but which create srcu_struct structures that are usable only by readers initiated by srcu_read_lock_fast() and friends. This commit does make DEFINE_SRCU_FAST() available to modules, in which case the per-CPU srcu_data structures are not created at compile time, but rather at module-load time. This means that the ->srcu_reader_flavor field of the srcu_data structure is not available. Therefore, this commit instead creates an ->srcu_reader_flavor field in the srcu_struct structure, adds arguments to the DEFINE_SRCU()-related macros to initialize this new field, and extends the checks in the __srcu_check_read_flavor() function to include this new field. This commit also allows dynamically allocated srcu_struct structures to be marked for SRCU-fast readers. It does so by defining a new init_srcu_struct_fast() function that marks the specified srcu_struct structure for use by srcu_read_lock_fast() and friends. Signed-off-by: Paul E. 
McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/notifier.h | 2 +- include/linux/srcu.h | 16 ++++++++++++++-- include/linux/srcutiny.h | 13 ++++++++++--- include/linux/srcutree.h | 30 +++++++++++++++++++----------- 4 files changed, 44 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index b42e64734968..01b6c9d9956f 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -109,7 +109,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ .srcuu = __SRCU_USAGE_INIT(name.srcuu), \ - .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \ + .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu, 0), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ diff --git a/include/linux/srcu.h b/include/linux/srcu.h index ada65b58bc4c..26de47820c58 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -25,8 +25,10 @@ struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC -int __init_srcu_struct(struct srcu_struct *ssp, const char *name, - struct lock_class_key *key); +int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +#ifndef CONFIG_TINY_SRCU +int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +#endif // #ifndef CONFIG_TINY_SRCU #define init_srcu_struct(ssp) \ ({ \ @@ -35,10 +37,20 @@ int __init_srcu_struct(struct srcu_struct *ssp, const char *name, __init_srcu_struct((ssp), #ssp, &__srcu_key); \ }) +#define init_srcu_struct_fast(ssp) \ +({ \ + static struct lock_class_key __srcu_key; \ + \ + __init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \ +}) + #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); +#ifndef CONFIG_TINY_SRCU +int 
init_srcu_struct_fast(struct srcu_struct *ssp); +#endif // #ifndef CONFIG_TINY_SRCU #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 3bfbd44cb1b3..92e6ab53398f 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -31,7 +31,7 @@ struct srcu_struct { void srcu_drive_gp(struct work_struct *wp); -#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored) \ +#define __SRCU_STRUCT_INIT(name, __ignored, ___ignored, ____ignored) \ { \ .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ .srcu_cb_tail = &name.srcu_cb_head, \ @@ -44,13 +44,20 @@ void srcu_drive_gp(struct work_struct *wp); * Tree SRCU, which needs some per-CPU data. */ #define DEFINE_SRCU(name) \ - struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) #define DEFINE_STATIC_SRCU(name) \ - static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name) + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) +#define DEFINE_SRCU_FAST(name) DEFINE_SRCU(name) +#define DEFINE_STATIC_SRCU_FAST(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) // Dummy structure for srcu_notifier_head. struct srcu_usage { }; #define __SRCU_USAGE_INIT(name) { } +#define __init_srcu_struct_fast __init_srcu_struct +#ifndef CONFIG_DEBUG_LOCK_ALLOC +#define init_srcu_struct_fast init_srcu_struct +#endif // #ifndef CONFIG_DEBUG_LOCK_ALLOC void synchronize_srcu(struct srcu_struct *ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 93ad18acd6d0..7ff4a11bc5a3 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -104,6 +104,7 @@ struct srcu_usage { struct srcu_struct { struct srcu_ctr __percpu *srcu_ctrp; struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. 
*/ + u8 srcu_reader_flavor; struct lockdep_map dep_map; struct srcu_usage *srcu_sup; /* Update-side data. */ }; @@ -162,20 +163,21 @@ struct srcu_struct { .work = __DELAYED_WORK_INITIALIZER(name.work, NULL, 0), \ } -#define __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ +#define __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ .srcu_sup = &usage_name, \ + .srcu_reader_flavor = fast, \ __SRCU_DEP_MAP_INIT(name) -#define __SRCU_STRUCT_INIT_MODULE(name, usage_name) \ +#define __SRCU_STRUCT_INIT_MODULE(name, usage_name, fast) \ { \ - __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ } -#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name) \ +#define __SRCU_STRUCT_INIT(name, usage_name, pcpu_name, fast) \ { \ .sda = &pcpu_name, \ .srcu_ctrp = &pcpu_name.srcu_ctrs[0], \ - __SRCU_STRUCT_INIT_COMMON(name, usage_name) \ + __SRCU_STRUCT_INIT_COMMON(name, usage_name, fast) \ } /* @@ -196,23 +198,29 @@ struct srcu_struct { * init_srcu_struct(&my_srcu); * * See include/linux/percpu-defs.h for the rules on per-CPU variables. + * + * DEFINE_SRCU_FAST() creates an srcu_struct and associated structures + * whose readers must be of the SRCU-fast variety. 
*/ #ifdef MODULE -# define __DEFINE_SRCU(name, is_static) \ +# define __DEFINE_SRCU(name, fast, is_static) \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage); \ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT_MODULE(name, name##_srcu_usage, \ + fast); \ extern struct srcu_struct * const __srcu_struct_##name; \ struct srcu_struct * const __srcu_struct_##name \ __section("___srcu_struct_ptrs") = &name #else -# define __DEFINE_SRCU(name, is_static) \ +# define __DEFINE_SRCU(name, fast, is_static) \ static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data); \ static struct srcu_usage name##_srcu_usage = __SRCU_USAGE_INIT(name##_srcu_usage); \ is_static struct srcu_struct name = \ - __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data) + __SRCU_STRUCT_INIT(name, name##_srcu_usage, name##_srcu_data, fast) #endif -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, 0, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, 0, static) +#define DEFINE_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, /* not static */) +#define DEFINE_STATIC_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, static) int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); -- cgit v1.2.3 From 8235bcfd39e865763e764b4c968012bdfb808af1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 5 Nov 2025 12:32:07 -0800 Subject: srcu: Require special srcu_struct define/init for SRCU-fast readers This commit adds CONFIG_PROVE_RCU=y checking to enforce the new rule that srcu_struct structures passed to srcu_read_lock_fast() and other SRCU-fast read-side markers be either initialized with init_srcu_struct_fast() on the one hand or defined using either DEFINE_SRCU_FAST() or DEFINE_STATIC_SRCU_FAST(). This will enable removal of the non-debug read-side checks from srcu_read_lock_fast() and friends, which on my laptop provides a 25% speedup (which admittedly amounts to about half a nanosecond, but when tracing fastpaths...) Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 26de47820c58..2982b5a6930f 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -271,17 +271,26 @@ static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp) * @ssp: srcu_struct in which to register the new reader. * * Enter an SRCU read-side critical section, but for a light-weight - * smp_mb()-free reader. See srcu_read_lock() for more information. - * - * If srcu_read_lock_fast() is ever used on an srcu_struct structure, - * then none of the other flavors may be used, whether before, during, - * or after. Note that grace-period auto-expediting is disabled for _fast - * srcu_struct structures because auto-expedited grace periods invoke - * synchronize_rcu_expedited(), IPIs and all. - * - * Note that srcu_read_lock_fast() can be invoked only from those contexts - * where RCU is watching, that is, from contexts where it would be legal - * to invoke rcu_read_lock(). Otherwise, lockdep will complain. + * smp_mb()-free reader. 
See srcu_read_lock() for more information. This + * function is NMI-safe, in a manner similar to srcu_read_lock_nmisafe(). + * + * For srcu_read_lock_fast() to be used on an srcu_struct structure, + * that structure must have been defined using either DEFINE_SRCU_FAST() + * or DEFINE_STATIC_SRCU_FAST() on the one hand or initialized with + * init_srcu_struct_fast() on the other. Such an srcu_struct structure + * cannot be passed to any non-fast variant of srcu_read_{,un}lock() or + * srcu_{down,up}_read(). In kernels built with CONFIG_PROVE_RCU=y, + * __srcu_check_read_flavor() will complain bitterly if you ignore this + * restriction. + * + * Grace-period auto-expediting is disabled for SRCU-fast srcu_struct + * structures because SRCU-fast expedited grace periods invoke + * synchronize_rcu_expedited(), IPIs and all. If you need expedited + * SRCU-fast grace periods, use synchronize_srcu_expedited(). + * + * The srcu_read_lock_fast() function can be invoked only from those + * contexts where RCU is watching, that is, from contexts where it would + * be legal to invoke rcu_read_lock(). Otherwise, lockdep will complain. */ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp) { @@ -317,7 +326,8 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ * srcu_down_read() for more information. * * The same srcu_struct may be used concurrently by srcu_down_read_fast() - * and srcu_read_lock_fast(). + * and srcu_read_lock_fast(). However, the same definition/initialization + * requirements called out for srcu_read_lock_fast() apply. */ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp) { -- cgit v1.2.3 From ac51c40c2c148a75f3191ff401c9889a7fc12cb1 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 5 Nov 2025 12:32:08 -0800 Subject: srcu: Make SRCU-fast readers enforce use of SRCU-fast definition/init This commit makes CONFIG_PROVE_RCU=y kernels enforce the new rule that srcu_struct structures that are passed to srcu_read_lock_fast() and other SRCU-fast read-side markers be either initialized with init_srcu_struct_fast() on the one hand or defined with DEFINE_SRCU_FAST() or DEFINE_STATIC_SRCU_FAST() on the other. This eliminates the read-side test that was formerly included in srcu_read_lock_fast() and friends, speeding these primitives up by about 25% (admittedly only about half of a nanosecond, but when tracing on fastpaths...) Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 6 +++--- include/linux/srcutiny.h | 1 - include/linux/srcutree.h | 16 +--------------- 3 files changed, 4 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 2982b5a6930f..41e27c1d917d 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -297,7 +297,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * struct srcu_ctr __percpu *retval; RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast()."); - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); rcu_try_lock_acquire(&ssp->dep_map); return retval; @@ -312,7 +312,7 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast_notrace(struct srcu_ { struct srcu_ctr __percpu *retval; - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); retval = __srcu_read_lock_fast(ssp); return retval; } @@ -333,7 +333,7 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct * { 
WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); - srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); return __srcu_read_lock_fast(ssp); } diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 92e6ab53398f..1ecc3393fb26 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -112,7 +112,6 @@ static inline void srcu_barrier(struct srcu_struct *ssp) static inline void srcu_expedite_current(struct srcu_struct *ssp) { } #define srcu_check_read_flavor(ssp, read_flavor) do { } while (0) -#define srcu_check_read_flavor_force(ssp, read_flavor) do { } while (0) /* Defined here to avoid size increase for non-torture kernels. */ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 7ff4a11bc5a3..6080a9094618 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -307,21 +307,7 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); -// Record reader usage even for CONFIG_PROVE_RCU=n kernels. This is -// needed only for flavors that require grace-period smp_mb() calls to be -// promoted to synchronize_rcu(). -static inline void srcu_check_read_flavor_force(struct srcu_struct *ssp, int read_flavor) -{ - struct srcu_data *sdp = raw_cpu_ptr(ssp->sda); - - if (likely(READ_ONCE(sdp->srcu_reader_flavor) & read_flavor)) - return; - - // Note that the cmpxchg() in __srcu_check_read_flavor() is fully ordered. - __srcu_check_read_flavor(ssp, read_flavor); -} - -// Record non-_lite() usage only for CONFIG_PROVE_RCU=y kernels. +// Record SRCU-reader usage type only for CONFIG_PROVE_RCU=y kernels. 
static inline void srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor) { if (IS_ENABLED(CONFIG_PROVE_RCU)) -- cgit v1.2.3 From 88b6a93af4345e901206d0576bdb4e88ea3eaeb8 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 4 Nov 2025 00:49:24 +0100 Subject: dt-bindings: clock: rk3568: Add SCMI clock ids The Trusted Firmware on RK3568 exposes 3 clocks via the SCMI clock interface. Add descriptive IDs for them. The clock ids are used in both the older vendor-binary TF-A, as well as the recently merged upstream SCMI clock implementation. Link: https://review.trustedfirmware.org/c/TF-A/trusted-firmware-a/+/31265 Reviewed-by: Diederik de Haas Acked-by: Conor Dooley Link: https://patch.msgid.link/20251103234926.416137-2-heiko@sntech.de Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rk3568-cru.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/rk3568-cru.h b/include/dt-bindings/clock/rk3568-cru.h index 5263085c5b23..18bb8d41d741 100644 --- a/include/dt-bindings/clock/rk3568-cru.h +++ b/include/dt-bindings/clock/rk3568-cru.h @@ -485,6 +485,12 @@ #define CLK_NR_CLKS (PCLK_CORE_PVTM + 1) +/* scmi-clocks indices */ + +#define SCMI_CLK_CPU 0 +#define SCMI_CLK_GPU 1 +#define SCMI_CLK_NPU 2 + /* pmu soft-reset indices */ /* pmucru_softrst_con0 */ #define SRST_P_PDPMU_NIU 0 -- cgit v1.2.3 From 34e82569d59391bf7d808a558ff631c4428b026d Mon Sep 17 00:00:00 2001 From: Xuanqiang Luo Date: Wed, 5 Nov 2025 12:19:57 -0800 Subject: rcu: use WRITE_ONCE() for ->next and ->pprev of hlist_nulls In rculist_nulls.h we can still see ordinary assignments to ->pprev and ->next of hlist_nulls. As noted in the two patches below: commit efd04f8a8b45 ("rcu: Use WRITE_ONCE() for assignments to ->next for rculist_nulls") commit 860c8802ace1 ("rcu: Use WRITE_ONCE() for assignments to ->pprev for hlist_nulls") We should use WRITE_ONCE(). Signed-off-by: Xuanqiang Luo Signed-off-by: Paul E. 
McKenney Signed-off-by: Frederic Weisbecker --- include/linux/rculist_nulls.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h index 89186c499dd4..d5a656cc4c6a 100644 --- a/include/linux/rculist_nulls.h +++ b/include/linux/rculist_nulls.h @@ -138,7 +138,7 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, if (last) { WRITE_ONCE(n->next, last->next); - n->pprev = &last->next; + WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_nulls_next_rcu(last), n); } else { hlist_nulls_add_head_rcu(n, h); @@ -148,8 +148,8 @@ static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, /* after that hlist_nulls_del will work */ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n) { - n->pprev = &n->next; - n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL); + WRITE_ONCE(n->pprev, &n->next); + WRITE_ONCE(n->next, (struct hlist_nulls_node *)NULLS_MARKER(NULL)); } /** -- cgit v1.2.3 From ca38f0f65eefd79889b409c89c6932d7e2fe0993 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 4 Nov 2025 00:40:32 +0100 Subject: dt-bindings: clock: rk3568: Drop CLK_NR_CLKS define CLK_NR_CLKS has only ever been used on the driver side to calculate array sizes and should never have been part of the clock-binding. Let's drop it, since the kernel code no longer uses it either and nothing else has ever used it. 
Acked-by: Conor Dooley Signed-off-by: Heiko Stuebner Link: https://patch.msgid.link/20251103234032.413563-3-heiko@sntech.de --- include/dt-bindings/clock/rk3568-cru.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/rk3568-cru.h b/include/dt-bindings/clock/rk3568-cru.h index 18bb8d41d741..1e0aef8a645d 100644 --- a/include/dt-bindings/clock/rk3568-cru.h +++ b/include/dt-bindings/clock/rk3568-cru.h @@ -483,8 +483,6 @@ #define PCLK_CORE_PVTM 450 -#define CLK_NR_CLKS (PCLK_CORE_PVTM + 1) - /* scmi-clocks indices */ #define SCMI_CLK_CPU 0 -- cgit v1.2.3 From b4ce5923e780d6896d4aaf19de5a27652b8bf1ea Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 5 Nov 2025 09:03:59 +0000 Subject: bpf, x86: add new map type: instructions array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On bpf(BPF_PROG_LOAD) syscall user-supplied BPF programs are translated by the verifier into "xlated" BPF programs. During this process the original instruction offsets might be adjusted and/or individual instructions might be replaced by new sets of instructions, or deleted. Add a new BPF map type which is aimed to keep track of how, for a given program, the original instructions were relocated during the verification. Also, besides keeping track of the original -> xlated mapping, make the x86 JIT build the xlated -> jitted mapping for every instruction listed in an instruction array. This is required for every future application of instruction arrays: static keys, indirect jumps and indirect calls. A map of the BPF_MAP_TYPE_INSN_ARRAY type must be created with u32 keys and values of size 8. The values have different semantics for userspace and for BPF space. For userspace a value consists of two u32 values – xlated and jitted offsets. For BPF side the value is a real pointer to a jitted instruction. 
On map creation/initialization, before loading the program, each element of the map should be initialized to point to an instruction offset within the program. Before the program load such maps should be made frozen. After the program verification xlated and jitted offsets can be read via the bpf(2) syscall. If a tracked instruction is removed by the verifier, then the xlated offset is set to (u32)-1 which is considered to be too big for a valid BPF program offset. One such map can, obviously, be used to track one and only one BPF program. If the verification process was unsuccessful, then the same map can be re-used to verify the program with a different log level. However, if the program was loaded fine, then such a map, being frozen in any case, can't be reused by other programs even after the program release. Example. Consider the following original and xlated programs: Original prog: Xlated prog: 0: r1 = 0x0 0: r1 = 0 1: *(u32 *)(r10 - 0x4) = r1 1: *(u32 *)(r10 -4) = r1 2: r2 = r10 2: r2 = r10 3: r2 += -0x4 3: r2 += -4 4: r1 = 0x0 ll 4: r1 = map[id:88] 6: call 0x1 6: r1 += 272 7: r0 = *(u32 *)(r2 +0) 8: if r0 >= 0x1 goto pc+3 9: r0 <<= 3 10: r0 += r1 11: goto pc+1 12: r0 = 0 7: r6 = r0 13: r6 = r0 8: if r6 == 0x0 goto +0x2 14: if r6 == 0x0 goto pc+4 9: call 0x76 15: r0 = 0xffffffff8d2079c0 17: r0 = *(u64 *)(r0 +0) 10: *(u64 *)(r6 + 0x0) = r0 18: *(u64 *)(r6 +0) = r0 11: r0 = 0x0 19: r0 = 0x0 12: exit 20: exit An instruction array map, containing, e.g., instructions [0,4,7,12] will be translated by the verifier to [0,4,13,20]. A map with index 5 (the middle of a 16-byte instruction) or indexes greater than 12 (outside the program boundaries) would be rejected. The functionality provided by this patch will be extended in subsequent patches to implement BPF Static Keys, indirect jumps, and indirect calls. 
Signed-off-by: Anton Protopopov Reviewed-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251105090410.1250500-2-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 15 +++++++++++++++ include/linux/bpf_types.h | 1 + include/linux/bpf_verifier.h | 2 ++ include/uapi/linux/bpf.h | 21 +++++++++++++++++++++ 4 files changed, 39 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a47d67db3be5..9d41a6affcef 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3797,4 +3797,19 @@ int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char * const char **linep, int *nump); struct bpf_prog *bpf_prog_find_from_stack(void); +int bpf_insn_array_init(struct bpf_map *map, const struct bpf_prog *prog); +int bpf_insn_array_ready(struct bpf_map *map); +void bpf_insn_array_release(struct bpf_map *map); +void bpf_insn_array_adjust(struct bpf_map *map, u32 off, u32 len); +void bpf_insn_array_adjust_after_remove(struct bpf_map *map, u32 off, u32 len); + +#ifdef CONFIG_BPF_SYSCALL +void bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image); +#else +static inline void +bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) +{ +} +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index fa78f49d4a9a..b13de31e163f 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -133,6 +133,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_BLOOM_FILTER, bloom_filter_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_USER_RINGBUF, user_ringbuf_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_ARENA, arena_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_INSN_ARRAY, insn_array_map_ops) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c6eb68b6389c..6b820d8d77af 
100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -754,8 +754,10 @@ struct bpf_verifier_env { struct list_head free_list; /* list of struct bpf_verifier_state_list */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ + struct bpf_map *insn_array_maps[MAX_USED_MAPS]; /* array of INSN_ARRAY map's to be relocated */ u32 used_map_cnt; /* number of used maps */ u32 used_btf_cnt; /* number of used BTF objects */ + u32 insn_array_map_cnt; /* number of used maps of type BPF_MAP_TYPE_INSN_ARRAY */ u32 id_gen; /* used to generate unique reg IDs */ u32 hidden_subprog_cnt; /* number of hidden subprogs */ int exception_callback_subprog; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d73f165394d..f5713f59ac10 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1026,6 +1026,7 @@ enum bpf_map_type { BPF_MAP_TYPE_USER_RINGBUF, BPF_MAP_TYPE_CGRP_STORAGE, BPF_MAP_TYPE_ARENA, + BPF_MAP_TYPE_INSN_ARRAY, __MAX_BPF_MAP_TYPE }; @@ -7649,4 +7650,24 @@ enum bpf_kfunc_flags { BPF_F_PAD_ZEROS = (1ULL << 0), }; +/* + * Values of a BPF_MAP_TYPE_INSN_ARRAY entry must be of this type. + * + * Before the map is used the orig_off field should point to an + * instruction inside the program being loaded. The other fields + * must be set to 0. + * + * After the program is loaded, the xlated_off will be adjusted + * by the verifier to point to the index of the original instruction + * in the xlated program. If the instruction is deleted, it will + * be set to (u32)-1. The jitted_off will be set to the corresponding + * offset in the jitted image of the program. 
+ */ +struct bpf_insn_array_value { + __u32 orig_off; + __u32 xlated_off; + __u32 jitted_off; + __u32 :32; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ -- cgit v1.2.3 From 493d9e0d608339a32f568504d5fd411a261bb0af Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Wed, 5 Nov 2025 09:04:06 +0000 Subject: bpf, x86: add support for indirect jumps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for a new instruction BPF_JMP|BPF_X|BPF_JA, SRC=0, DST=Rx, off=0, imm=0 which does an indirect jump to a location stored in Rx. The register Rx should have type PTR_TO_INSN. This new type assures that the Rx register contains a value (or a range of values) loaded from a correct jump table – map of type instruction array. For example, for a C switch LLVM will generate the following code: 0: r3 = r1 # "switch (r3)" 1: if r3 > 0x13 goto +0x666 # check r3 boundaries 2: r3 <<= 0x3 # adjust to an index in array of addresses 3: r1 = 0xbeef ll # r1 is PTR_TO_MAP_VALUE, r1->map_ptr=M 5: r1 += r3 # r1 inherits boundaries from r3 6: r1 = *(u64 *)(r1 + 0x0) # r1 now has type PTR_TO_INSN 7: gotox r1 # jit will generate proper code Here the gotox instruction corresponds to one particular map. It is possible, however, to have a gotox instruction which can be loaded from different maps, e.g. 0: r1 &= 0x1 1: r2 <<= 0x3 2: r3 = 0x0 ll # load from map M_1 4: r3 += r2 5: if r1 == 0x0 goto +0x4 6: r1 <<= 0x3 7: r3 = 0x0 ll # load from map M_2 9: r3 += r1 A: r1 = *(u64 *)(r3 + 0x0) B: gotox r1 # jump to target loaded from M_1 or M_2 During check_cfg stage the verifier will collect all the maps which point to inside the subprog being verified. When building the config, the high 16 bits of the insn_state are used, so this patch (theoretically) supports jump tables of up to 2^16 slots. During the later stage, in check_indirect_jump, it is checked that the register Rx was loaded from a particular instruction array. 
Signed-off-by: Anton Protopopov Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251105090410.1250500-9-a.s.protopopov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9d41a6affcef..09d5dc541d1c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1001,6 +1001,7 @@ enum bpf_reg_type { PTR_TO_ARENA, PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ + PTR_TO_INSN, /* reg points to a bpf program instruction */ CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6b820d8d77af..5441341f1ab9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -527,6 +527,7 @@ struct bpf_insn_aux_data { struct { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ + struct bpf_iarray *jt; /* jump table for gotox instruction */ }; struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ @@ -840,6 +841,7 @@ struct bpf_verifier_env { struct bpf_scc_info **scc_info; u32 scc_cnt; struct bpf_iarray *succ; + struct bpf_iarray *gotox_tmp_buf; }; static inline struct bpf_func_info_aux *subprog_aux(struct bpf_verifier_env *env, int subprog) @@ -1050,6 +1052,13 @@ static inline bool bpf_stack_narrow_access_ok(int off, int fill_size, int spill_ return !(off % BPF_REG_SIZE); } +static inline bool insn_is_gotox(struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_JMP && + BPF_OP(insn->code) == BPF_JA && + BPF_SRC(insn->code) == BPF_X; +} + const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type); const char *dynptr_type_str(enum bpf_dynptr_type type); const char *iter_type_str(const struct btf *btf, u32 btf_id); -- cgit 
v1.2.3 From 0593447248044ab609b43b947d0e198c887ac281 Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 25 Oct 2025 22:50:20 -0700 Subject: lib/crypto: sha3: Add SHA-3 support Add SHA-3 support to lib/crypto/. All six algorithms in the SHA-3 family are supported: four digests (SHA3-224, SHA3-256, SHA3-384, and SHA3-512) and two extendable-output functions (SHAKE128 and SHAKE256). The SHAKE algorithms will be required for ML-DSA. [EB: simplified the API to use fewer types and functions, fixed bug that sometimes caused incorrect SHAKE output, cleaned up the documentation, dropped an ad-hoc test that was inconsistent with the rest of lib/crypto/, and many other cleanups] Signed-off-by: David Howells Co-developed-by: Eric Biggers Tested-by: Harald Freudenberger Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251026055032.1413733-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha3.h | 322 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 319 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h index 41e1b83a6d91..c0c468ee099e 100644 --- a/include/crypto/sha3.h +++ b/include/crypto/sha3.h @@ -1,11 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Common values for SHA-3 algorithms + * + * See also Documentation/crypto/sha3.rst */ #ifndef __CRYPTO_SHA3_H__ #define __CRYPTO_SHA3_H__ #include +#include #define SHA3_224_DIGEST_SIZE (224 / 8) #define SHA3_224_BLOCK_SIZE (200 - 2 * SHA3_224_DIGEST_SIZE) @@ -23,14 +26,327 @@ #define SHA3_512_BLOCK_SIZE (200 - 2 * SHA3_512_DIGEST_SIZE) #define SHA3_512_EXPORT_SIZE SHA3_STATE_SIZE + SHA3_512_BLOCK_SIZE + 1 +/* + * SHAKE128 and SHAKE256 actually have variable output size, but this is used to + * calculate the block size (rate) analogously to the above. 
+ */ +#define SHAKE128_DEFAULT_SIZE (128 / 8) +#define SHAKE128_BLOCK_SIZE (200 - 2 * SHAKE128_DEFAULT_SIZE) +#define SHAKE256_DEFAULT_SIZE (256 / 8) +#define SHAKE256_BLOCK_SIZE (200 - 2 * SHAKE256_DEFAULT_SIZE) + #define SHA3_STATE_SIZE 200 struct shash_desc; +int crypto_sha3_init(struct shash_desc *desc); + +/* + * State for the Keccak-f[1600] permutation: 25 64-bit words. + * + * We usually keep the state words as little-endian, to make absorbing and + * squeezing easier. (It means that absorbing and squeezing can just treat the + * state as a byte array.) The state words are converted to native-endian only + * temporarily by implementations of the permutation that need native-endian + * words. Of course, that conversion is a no-op on little-endian machines. + */ struct sha3_state { - u64 st[SHA3_STATE_SIZE / 8]; + union { + u64 st[SHA3_STATE_SIZE / 8]; /* temporarily retained for compatibility purposes */ + + __le64 words[SHA3_STATE_SIZE / 8]; + u8 bytes[SHA3_STATE_SIZE]; + + u64 native_words[SHA3_STATE_SIZE / 8]; /* see comment above */ + }; }; -int crypto_sha3_init(struct shash_desc *desc); +/* Internal context, shared by the digests (SHA3-*) and the XOFs (SHAKE*) */ +struct __sha3_ctx { + struct sha3_state state; + u8 digest_size; /* Digests only: the digest size in bytes */ + u8 block_size; /* Block size in bytes */ + u8 absorb_offset; /* Index of next state byte to absorb into */ + u8 squeeze_offset; /* XOFs only: index of next state byte to extract */ +}; + +void __sha3_update(struct __sha3_ctx *ctx, const u8 *in, size_t in_len); + +/** + * struct sha3_ctx - Context for SHA3-224, SHA3-256, SHA3-384, or SHA3-512 + * @ctx: private + */ +struct sha3_ctx { + struct __sha3_ctx ctx; +}; + +/** + * sha3_zeroize_ctx() - Zeroize a SHA-3 context + * @ctx: The context to zeroize + * + * This is already called by sha3_final(). Call this explicitly when abandoning + * a context without calling sha3_final(). 
+ */ +static inline void sha3_zeroize_ctx(struct sha3_ctx *ctx) +{ + memzero_explicit(ctx, sizeof(*ctx)); +} + +/** + * struct shake_ctx - Context for SHAKE128 or SHAKE256 + * @ctx: private + */ +struct shake_ctx { + struct __sha3_ctx ctx; +}; + +/** + * shake_zeroize_ctx() - Zeroize a SHAKE context + * @ctx: The context to zeroize + * + * Call this after the last squeeze. + */ +static inline void shake_zeroize_ctx(struct shake_ctx *ctx) +{ + memzero_explicit(ctx, sizeof(*ctx)); +} + +/** + * sha3_224_init() - Initialize a context for SHA3-224 + * @ctx: The context to initialize + * + * This begins a new SHA3-224 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_224_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_224_DIGEST_SIZE, + .ctx.block_size = SHA3_224_BLOCK_SIZE, + }; +} + +/** + * sha3_256_init() - Initialize a context for SHA3-256 + * @ctx: The context to initialize + * + * This begins a new SHA3-256 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_256_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_256_DIGEST_SIZE, + .ctx.block_size = SHA3_256_BLOCK_SIZE, + }; +} + +/** + * sha3_384_init() - Initialize a context for SHA3-384 + * @ctx: The context to initialize + * + * This begins a new SHA3-384 message digest computation. + * + * Context: Any context. + */ +static inline void sha3_384_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_384_DIGEST_SIZE, + .ctx.block_size = SHA3_384_BLOCK_SIZE, + }; +} + +/** + * sha3_512_init() - Initialize a context for SHA3-512 + * @ctx: The context to initialize + * + * This begins a new SHA3-512 message digest computation. + * + * Context: Any context. 
+ */ +static inline void sha3_512_init(struct sha3_ctx *ctx) +{ + *ctx = (struct sha3_ctx){ + .ctx.digest_size = SHA3_512_DIGEST_SIZE, + .ctx.block_size = SHA3_512_BLOCK_SIZE, + }; +} + +/** + * sha3_update() - Update a SHA-3 digest context with input data + * @ctx: The context to update; must have been initialized + * @in: The input data + * @in_len: Length of the input data in bytes + * + * This can be called any number of times to add data to a SHA3-224, SHA3-256, + * SHA3-384, or SHA3-512 digest (depending on which init function was called). + * + * Context: Any context. + */ +static inline void sha3_update(struct sha3_ctx *ctx, + const u8 *in, size_t in_len) +{ + __sha3_update(&ctx->ctx, in, in_len); +} + +/** + * sha3_final() - Finish computing a SHA-3 message digest + * @ctx: The context to finalize; must have been initialized + * @out: (output) The resulting SHA3-224, SHA3-256, SHA3-384, or SHA3-512 + * message digest, matching the init function that was called. Note that + * the size differs for each one; see SHA3_*_DIGEST_SIZE. + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void sha3_final(struct sha3_ctx *ctx, u8 *out); + +/** + * shake128_init() - Initialize a context for SHAKE128 + * @ctx: The context to initialize + * + * This begins a new SHAKE128 extendable-output function (XOF) computation. + * + * Context: Any context. + */ +static inline void shake128_init(struct shake_ctx *ctx) +{ + *ctx = (struct shake_ctx){ + .ctx.block_size = SHAKE128_BLOCK_SIZE, + }; +} + +/** + * shake256_init() - Initialize a context for SHAKE256 + * @ctx: The context to initialize + * + * This begins a new SHAKE256 extendable-output function (XOF) computation. + * + * Context: Any context. 
+ */ +static inline void shake256_init(struct shake_ctx *ctx) +{ + *ctx = (struct shake_ctx){ + .ctx.block_size = SHAKE256_BLOCK_SIZE, + }; +} + +/** + * shake_update() - Update a SHAKE context with input data + * @ctx: The context to update; must have been initialized + * @in: The input data + * @in_len: Length of the input data in bytes + * + * This can be called any number of times to add more input data to SHAKE128 or + * SHAKE256. This cannot be called after squeezing has begun. + * + * Context: Any context. + */ +static inline void shake_update(struct shake_ctx *ctx, + const u8 *in, size_t in_len) +{ + __sha3_update(&ctx->ctx, in, in_len); +} + +/** + * shake_squeeze() - Generate output from SHAKE128 or SHAKE256 + * @ctx: The context to squeeze; must have been initialized + * @out: Where to write the resulting output data + * @out_len: The amount of data to extract to @out in bytes + * + * This may be called multiple times. A number of consecutive squeezes laid + * end-to-end will yield the same output as one big squeeze generating the same + * total amount of output. More input cannot be provided after squeezing has + * begun. After the last squeeze, call shake_zeroize_ctx(). + * + * Context: Any context. + */ +void shake_squeeze(struct shake_ctx *ctx, u8 *out, size_t out_len); + +/** + * sha3_224() - Compute SHA3-224 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-224 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. 
+ */ +void sha3_224(const u8 *in, size_t in_len, u8 out[SHA3_224_DIGEST_SIZE]); + +/** + * sha3_256() - Compute SHA3-256 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-256 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_256(const u8 *in, size_t in_len, u8 out[SHA3_256_DIGEST_SIZE]); + +/** + * sha3_384() - Compute SHA3-384 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-384 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_384(const u8 *in, size_t in_len, u8 out[SHA3_384_DIGEST_SIZE]); + +/** + * sha3_512() - Compute SHA3-512 digest in one shot + * @in: The input data to be digested + * @in_len: Length of the input data in bytes + * @out: The buffer into which the digest will be stored + * + * Convenience function that computes a SHA3-512 digest. Use this instead of + * the incremental API if you're able to provide all the input at once. + * + * Context: Any context. + */ +void sha3_512(const u8 *in, size_t in_len, u8 out[SHA3_512_DIGEST_SIZE]); + +/** + * shake128() - Compute SHAKE128 in one shot + * @in: The input data to be used + * @in_len: Length of the input data in bytes + * @out: The buffer into which the output will be stored + * @out_len: Length of the output to produce in bytes + * + * Convenience function that computes SHAKE128 in one shot. Use this instead of + * the incremental API if you're able to provide all the input at once as well + * as receive all the output at once. All output lengths are supported. 
+ * + * Context: Any context. + */ +void shake128(const u8 *in, size_t in_len, u8 *out, size_t out_len); + +/** + * shake256() - Compute SHAKE256 in one shot + * @in: The input data to be used + * @in_len: Length of the input data in bytes + * @out: The buffer into which the output will be stored + * @out_len: Length of the output to produce in bytes + * + * Convenience function that computes SHAKE256 in one shot. Use this instead of + * the incremental API if you're able to provide all the input at once as well + * as receive all the output at once. All output lengths are supported. + * + * Context: Any context. + */ +void shake256(const u8 *in, size_t in_len, u8 *out, size_t out_len); -#endif +#endif /* __CRYPTO_SHA3_H__ */ -- cgit v1.2.3 From f1799d17285ca99243328cd92133a9f84ee3a593 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 25 Oct 2025 22:50:31 -0700 Subject: crypto: sha3 - Reimplement using library API Replace sha3_generic.c with a new file sha3.c which implements the SHA-3 crypto_shash algorithms on top of the SHA-3 library API. Change the driver name suffix from "-generic" to "-lib" to reflect that these algorithms now just use the (possibly arch-optimized) library. This closely mirrors crypto/{md5,sha1,sha256,sha512,blake2b}.c. Implement export_core and import_core, since crypto/hmac.c expects these to be present. (Note that there is no security purpose in wrapping SHA-3 with HMAC. HMAC was designed for older algorithms that don't resist length extension attacks. But since someone could be using "hmac(sha3-*)" via crypto_shash anyway, keep supporting it for now.) 
Reviewed-by: Ard Biesheuvel Tested-by: Harald Freudenberger Link: https://lore.kernel.org/r/20251026055032.1413733-15-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha3.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/crypto/sha3.h b/include/crypto/sha3.h index c0c468ee099e..c9e4182ff74f 100644 --- a/include/crypto/sha3.h +++ b/include/crypto/sha3.h @@ -37,10 +37,6 @@ #define SHA3_STATE_SIZE 200 -struct shash_desc; - -int crypto_sha3_init(struct shash_desc *desc); - /* * State for the Keccak-f[1600] permutation: 25 64-bit words. * @@ -52,8 +48,6 @@ int crypto_sha3_init(struct shash_desc *desc); */ struct sha3_state { union { - u64 st[SHA3_STATE_SIZE / 8]; /* temporarily retained for compatibility purposes */ - __le64 words[SHA3_STATE_SIZE / 8]; u8 bytes[SHA3_STATE_SIZE]; -- cgit v1.2.3 From 512c83265796d613f21255c766839eaed1c1cc79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Nov 2025 20:51:27 -0800 Subject: IB/rdmavt: rdmavt_qp.h: clean up kernel-doc comments Correct the kernel-doc comments format to avoid around 35 kernel-doc warnings: - use struct keyword to introduce struct kernel-doc comments - use correct variable name for some struct members - use correct function name in comments for some functions - fix spelling in a few comments - use a ':' instead of '-' to separate struct members from their descriptions - add a function name heading for rvt_div_mtu() This leaves one struct member that is not described: rdmavt_qp.h:206: warning: Function parameter or struct member 'wq' not described in 'rvt_krwq' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251105045127.106822-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/rdmavt_qp.h | 70 +++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index d67892944193..71140ea0aeb2 
100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -144,7 +144,7 @@ #define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) /** - * rvt_ud_wr - IB UD work plus AH cache + * struct rvt_ud_wr - IB UD work plus AH cache * @wr: valid IB work request * @attr: pointer to an allocated AH attribute * @@ -184,10 +184,10 @@ struct rvt_swqe { * struct rvt_krwq - kernel struct receive work request * @p_lock: lock to protect producer of the kernel buffer * @head: index of next entry to fill - * @c_lock:lock to protect consumer of the kernel buffer + * @c_lock: lock to protect consumer of the kernel buffer * @tail: index of next entry to pull - * @count: count is aproximate of total receive enteries posted - * @rvt_rwqe: struct of receive work request queue entry + * @count: count is approximate of total receive entries posted + * @curr_wq: struct of receive work request queue entry * * This structure is used to contain the head pointer, * tail pointer and receive work queue entries for kernel @@ -309,10 +309,10 @@ struct rvt_ack_entry { #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) /** - * rvt_operation_params - op table entry - * @length - the length to copy into the swqe entry - * @qpt_support - a bit mask indicating QP type support - * @flags - RVT_OPERATION flags (see above) + * struct rvt_operation_params - op table entry + * @length: the length to copy into the swqe entry + * @qpt_support: a bit mask indicating QP type support + * @flags: RVT_OPERATION flags (see above) * * This supports table driven post send so that * the driver can have differing an potentially @@ -552,7 +552,7 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) /** * rvt_is_user_qp - return if this is user mode QP - * @qp - the target QP + * @qp: the target QP */ static inline bool rvt_is_user_qp(struct rvt_qp *qp) { @@ -561,7 +561,7 @@ static inline bool rvt_is_user_qp(struct rvt_qp *qp) /** * rvt_get_qp - get a QP reference - * @qp - the 
QP to hold + * @qp: the QP to hold */ static inline void rvt_get_qp(struct rvt_qp *qp) { @@ -570,7 +570,7 @@ static inline void rvt_get_qp(struct rvt_qp *qp) /** * rvt_put_qp - release a QP reference - * @qp - the QP to release + * @qp: the QP to release */ static inline void rvt_put_qp(struct rvt_qp *qp) { @@ -580,7 +580,7 @@ static inline void rvt_put_qp(struct rvt_qp *qp) /** * rvt_put_swqe - drop mr refs held by swqe - * @wqe - the send wqe + * @wqe: the send wqe * * This drops any mr references held by the swqe */ @@ -597,8 +597,8 @@ static inline void rvt_put_swqe(struct rvt_swqe *wqe) /** * rvt_qp_wqe_reserve - reserve operation - * @qp - the rvt qp - * @wqe - the send wqe + * @qp: the rvt qp + * @wqe: the send wqe * * This routine used in post send to record * a wqe relative reserved operation use. @@ -612,8 +612,8 @@ static inline void rvt_qp_wqe_reserve( /** * rvt_qp_wqe_unreserve - clean reserved operation - * @qp - the rvt qp - * @flags - send wqe flags + * @qp: the rvt qp + * @flags: send wqe flags * * This decrements the reserve use count. * @@ -653,8 +653,8 @@ u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len); /** * rvt_div_round_up_mtu - round up divide - * @qp - the qp pair - * @len - the length + * @qp: the qp pair + * @len: the length * * Perform a shift based mtu round up divide */ @@ -664,8 +664,9 @@ static inline u32 rvt_div_round_up_mtu(struct rvt_qp *qp, u32 len) } /** - * @qp - the qp pair - * @len - the length + * rvt_div_mtu - shift-based divide + * @qp: the qp pair + * @len: the length * * Perform a shift based mtu divide */ @@ -676,7 +677,7 @@ static inline u32 rvt_div_mtu(struct rvt_qp *qp, u32 len) /** * rvt_timeout_to_jiffies - Convert a ULP timeout input into jiffies - * @timeout - timeout input(0 - 31). + * @timeout: timeout input(0 - 31). * * Return a timeout value in jiffies. 
*/ @@ -690,7 +691,8 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) /** * rvt_lookup_qpn - return the QP with the given QPN - * @ibp: the ibport + * @rdi: rvt device info structure + * @rvp: the ibport * @qpn: the QP number to look up * * The caller must hold the rcu_read_lock(), and keep the lock until @@ -716,9 +718,9 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi, } /** - * rvt_mod_retry_timer - mod a retry timer - * @qp - the QP - * @shift - timeout shift to wait for multiple packets + * rvt_mod_retry_timer_ext - mod a retry timer + * @qp: the QP + * @shift: timeout shift to wait for multiple packets * Modify a potentially already running retry timer */ static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift) @@ -753,7 +755,7 @@ static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe) } /** - * rvt_qp_sqwe_incr - increment ring index + * rvt_qp_swqe_incr - increment ring index * @qp: the qp * @val: the starting value * @@ -811,10 +813,10 @@ static inline void rvt_send_cq(struct rvt_qp *qp, struct ib_wc *wc, /** * rvt_qp_complete_swqe - insert send completion - * @qp - the qp - * @wqe - the send wqe - * @opcode - wc operation (driver dependent) - * @status - completion status + * @qp: the qp + * @wqe: the send wqe + * @opcode: wc operation (driver dependent) + * @status: completion status * * Update the s_last information, and then insert a send * completion into the completion @@ -891,7 +893,7 @@ void rvt_ruc_loopback(struct rvt_qp *qp); /** * struct rvt_qp_iter - the iterator for QPs - * @qp - the current QP + * @qp: the current QP * * This structure defines the current iterator * state for sequenced access to all QPs relative @@ -913,7 +915,7 @@ struct rvt_qp_iter { /** * ib_cq_tail - Return tail index of cq buffer - * @send_cq - The cq for send + * @send_cq: The cq for send * * This is called in qp_iter_print to get tail * of cq buffer. 
@@ -929,7 +931,7 @@ static inline u32 ib_cq_tail(struct ib_cq *send_cq) /** * ib_cq_head - Return head index of cq buffer - * @send_cq - The cq for send + * @send_cq: The cq for send * * This is called in qp_iter_print to get head * of cq buffer. @@ -945,7 +947,7 @@ static inline u32 ib_cq_head(struct ib_cq *send_cq) /** * rvt_free_rq - free memory allocated for rvt_rq struct - * @rvt_rq: request queue data structure + * @rq: request queue data structure * * This function should only be called if the rvt_mmap_info() * has not succeeded. -- cgit v1.2.3 From 5f20bc206beb902e32b77216cb7935b46ca00b0a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 23 Oct 2025 12:46:14 -0700 Subject: platform/x86: ISST: isst_if.h: fix all kernel-doc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix all kernel-doc warnings in include/uapi/linux/isst_if.h: - don't use "[]" in the variable name in kernel-doc - add a few missing entries - change "power_domain" to "power_domain_id" in kernel-doc to match the struct member name - add a leading '@' on a few existing kernel-doc lines - use '_' instead of '-' in struct member names Examples (but not all 27 warnings): Warning: include/uapi/linux/isst_if.h:63 struct member 'cpu_map' not described in 'isst_if_cpu_maps' Warning: ../include/uapi/linux/isst_if.h:95 struct member 'req_count' not described in 'isst_if_io_regs' Warning: include/uapi/linux/isst_if.h:132 struct member 'mbox_cmd' not described in 'isst_if_mbox_cmds' Warning: ../include/uapi/linux/isst_if.h:183 struct member 'supported' not described in 'isst_core_power' Warning: ../include/uapi/linux/isst_if.h:206 struct member 'power_domain_id' not described in 'isst_clos_param' Warning: ../include/uapi/linux/isst_if.h:239 struct member 'assoc_info' not described in 'isst_if_clos_assoc_cmds' Warning: ../include/uapi/linux/isst_if.h:286 struct member 'sst_tf_support' not described in 'isst_perf_level_info' Warning: ../include/uapi/linux/isst_if.h:375 struct
member 'trl_freq_mhz' not described in 'isst_perf_level_data_info' Warning: ../include/uapi/linux/isst_if.h:475 struct member 'max_buckets' not described in 'isst_turbo_freq_info' Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251023194615.180824-1-rdunlap@infradead.org Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/uapi/linux/isst_if.h | 50 ++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/isst_if.h b/include/uapi/linux/isst_if.h index 8197a4800604..40aa545101a3 100644 --- a/include/uapi/linux/isst_if.h +++ b/include/uapi/linux/isst_if.h @@ -52,7 +52,7 @@ struct isst_if_cpu_map { /** * struct isst_if_cpu_maps - structure for CPU map IOCTL * @cmd_count: Number of CPU mapping command in cpu_map[] - * @cpu_map[]: Holds one or more CPU map data structure + * @cpu_map: Holds one or more CPU map data structure * * This structure used with ioctl ISST_IF_GET_PHY_ID to send * one or more CPU mapping commands. Here IOCTL return value indicates @@ -82,8 +82,8 @@ struct isst_if_io_reg { /** * struct isst_if_io_regs - structure for IO register commands - * @cmd_count: Number of io reg commands in io_reg[] - * @io_reg[]: Holds one or more io_reg command structure + * @req_count: Number of io reg commands in io_reg[] + * @io_reg: Holds one or more io_reg command structure * * This structure used with ioctl ISST_IF_IO_CMD to send * one or more read/write commands to PUNIT. Here IOCTL return value @@ -120,7 +120,7 @@ struct isst_if_mbox_cmd { /** * struct isst_if_mbox_cmds - structure for mailbox commands * @cmd_count: Number of mailbox commands in mbox_cmd[] - * @mbox_cmd[]: Holds one or more mbox commands + * @mbox_cmd: Holds one or more mbox commands * * This structure used with ioctl ISST_IF_MBOX_COMMAND to send * one or more mailbox commands to PUNIT. 
Here IOCTL return value @@ -152,7 +152,7 @@ struct isst_if_msr_cmd { /** * struct isst_if_msr_cmds - structure for msr commands * @cmd_count: Number of mailbox commands in msr_cmd[] - * @msr_cmd[]: Holds one or more msr commands + * @msr_cmd: Holds one or more msr commands * * This structure used with ioctl ISST_IF_MSR_COMMAND to send * one or more MSR commands. IOCTL return value indicates number of @@ -167,8 +167,9 @@ struct isst_if_msr_cmds { * struct isst_core_power - Structure to get/set core_power feature * @get_set: 0: Get, 1: Set * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @enable: Feature enable status + * @supported: Power domain supports SST_CP interface * @priority_type: Priority type for the feature (ordered/proportional) * * Structure to get/set core_power feature state using IOCTL @@ -187,11 +188,11 @@ struct isst_core_power { * struct isst_clos_param - Structure to get/set clos praram * @get_set: 0: Get, 1: Set * @socket_id: Socket/package id - * @power_domain: Power Domain id - * clos: Clos ID for the parameters - * min_freq_mhz: Minimum frequency in MHz - * max_freq_mhz: Maximum frequency in MHz - * prop_prio: Proportional priority from 0-15 + * @power_domain_id: Power Domain id + * @clos: Clos ID for the parameters + * @min_freq_mhz: Minimum frequency in MHz + * @max_freq_mhz: Maximum frequency in MHz + * @prop_prio: Proportional priority from 0-15 * * Structure to get/set per clos property using IOCTL * ISST_IF_CLOS_PARAM. 
@@ -209,7 +210,7 @@ struct isst_clos_param { /** * struct isst_if_clos_assoc - Structure to assign clos to a CPU * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @logical_cpu: CPU number * @clos: Clos ID to assign to the logical CPU * @@ -228,6 +229,7 @@ struct isst_if_clos_assoc { * @get_set: Request is for get or set * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not * Linux CPU number + * @assoc_info: CLOS data for this CPU * * Structure used to get/set associate CPUs to clos using IOCTL * ISST_IF_CLOS_ASSOC. @@ -257,7 +259,7 @@ struct isst_tpmi_instance_count { /** * struct isst_perf_level_info - Structure to get information on SST-PP levels * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @logical_cpu: CPU number * @clos: Clos ID to assign to the logical CPU * @max_level: Maximum performance level supported by the platform @@ -267,8 +269,8 @@ struct isst_tpmi_instance_count { * @feature_state: SST-BF and SST-TF (enabled/disabled) status at current level * @locked: SST-PP performance level change is locked/unlocked * @enabled: SST-PP feature is enabled or not - * @sst-tf_support: SST-TF support status at this level - * @sst-bf_support: SST-BF support status at this level + * @sst_tf_support: SST-TF support status at this level + * @sst_bf_support: SST-BF support status at this level * * Structure to get SST-PP details using IOCTL ISST_IF_PERF_LEVELS. */ @@ -289,7 +291,7 @@ struct isst_perf_level_info { /** * struct isst_perf_level_control - Structure to set SST-PP level * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: level to set * * Structure used change SST-PP level using IOCTL ISST_IF_PERF_SET_LEVEL. 
@@ -303,7 +305,7 @@ struct isst_perf_level_control { /** * struct isst_perf_feature_control - Structure to activate SST-BF/SST-TF * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @feature: bit 0 = SST-BF state, bit 1 = SST-TF state * * Structure used to enable SST-BF/SST-TF using IOCTL ISST_IF_PERF_SET_FEATURE. @@ -320,7 +322,7 @@ struct isst_perf_feature_control { /** * struct isst_perf_level_data_info - Structure to get SST-PP level details * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @tdp_ratio: TDP Ratio * @base_freq_mhz: Base frequency in MHz @@ -341,8 +343,8 @@ struct isst_perf_feature_control { * @pm_fabric_freq_mhz: Fabric (Uncore) minimum frequency * @max_buckets: Maximum trl buckets * @max_trl_levels: Maximum trl levels - * @bucket_core_counts[TRL_MAX_BUCKETS]: Number of cores per bucket - * @trl_freq_mhz[TRL_MAX_LEVELS][TRL_MAX_BUCKETS]: maximum frequency + * @bucket_core_counts: Number of cores per bucket + * @trl_freq_mhz: maximum frequency * for a bucket and trl level * * Structure used to get information on frequencies and TDP for a SST-PP @@ -402,7 +404,7 @@ struct isst_perf_level_fabric_info { /** * struct isst_perf_level_cpu_mask - Structure to get SST-PP level CPU mask * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @punit_cpu_map: Set to 1 if the CPU number is punit numbering not * Linux CPU number. 
If 0 CPU buffer is copied to user space @@ -430,7 +432,7 @@ struct isst_perf_level_cpu_mask { /** * struct isst_base_freq_info - Structure to get SST-BF frequencies * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @high_base_freq_mhz: High priority CPU base frequency * @low_base_freq_mhz: Low priority CPU base frequency @@ -453,9 +455,11 @@ struct isst_base_freq_info { /** * struct isst_turbo_freq_info - Structure to get SST-TF frequencies * @socket_id: Socket/package id - * @power_domain: Power Domain id + * @power_domain_id: Power Domain id * @level: SST-PP level for which caller wants to get information * @max_clip_freqs: Maximum number of low priority core clipping frequencies + * @max_buckets: Maximum trl buckets + * @max_trl_levels: Maximum trl levels * @lp_clip_freq_mhz: Clip frequencies per trl level * @bucket_core_counts: Maximum number of cores for a bucket * @trl_freq_mhz: Frequencies per trl level for each bucket -- cgit v1.2.3 From 6b47af35a6dded074ff583361f6d6668dd7a401d Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Fri, 31 Oct 2025 16:48:11 +0530 Subject: net: selftests: export packet creation helpers for driver use Export the network selftest packet creation infrastructure to allow network drivers to reuse the existing selftest framework instead of duplicating packet creation code. 
Signed-off-by: Raju Rangoju Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20251031111811.775434-1-Raju.Rangoju@amd.com Signed-off-by: Paolo Abeni --- include/net/selftests.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'include') diff --git a/include/net/selftests.h b/include/net/selftests.h index e65e8d230d33..c36e07406ad4 100644 --- a/include/net/selftests.h +++ b/include/net/selftests.h @@ -3,9 +3,48 @@ #define _NET_SELFTESTS #include +#include + +struct net_packet_attrs { + const unsigned char *src; + const unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; + bool bad_csum; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) #if IS_ENABLED(CONFIG_NET_SELFTESTS) +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr); void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf); int net_selftest_get_count(void); @@ -13,6 +52,12 @@ void net_selftest_get_strings(u8 *data); #else +static inline struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) +{ + return NULL; +} + static inline void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { -- cgit v1.2.3 From 1b0f3f9ee41ee2bdd206667f85ea2aa36dfe6e69 Mon Sep 17 00:00:00 2001 From: Shuming Fan Date: Thu, 6 Nov 2025 17:33:35 +0800 Subject: ASoC: SDCA: support Q7.8 volume format The SDCA specification uses Q7.8 volume format. 
This patch adds a field to indicate whether it is SDCA volume control and supports the volume settings. Signed-off-by: Shuming Fan Reviewed-by: Charles Keepax Link: https://patch.msgid.link/20251106093335.1363237-1-shumingf@realtek.com Signed-off-by: Mark Brown --- include/sound/soc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 1aebf14fcf80..53b4129ee97a 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1225,6 +1225,7 @@ struct soc_mixer_control { unsigned int sign_bit; unsigned int invert:1; unsigned int autodisable:1; + unsigned int sdca_q78:1; #ifdef CONFIG_SND_SOC_TOPOLOGY struct snd_soc_dobj dobj; #endif -- cgit v1.2.3 From b340412a3b22b60b5e19cce8726940c7b5b14439 Mon Sep 17 00:00:00 2001 From: James Calligeros Date: Sat, 25 Oct 2025 10:24:36 +1000 Subject: mfd: macsmc: Add new __SMC_KEY macro When using the _SMC_KEY macro in switch/case statements, GCC 15.2.1 errors out with 'case label does not reduce to an integer constant'. Introduce a new __SMC_KEY macro that can be used instead. 
Signed-off-by: James Calligeros Link: https://patch.msgid.link/20251025-macsmc-subdevs-v4-5-374d5c9eba0e@gmail.com Signed-off-by: Lee Jones --- include/linux/mfd/macsmc.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h index 6b13f01a8592..f6f80c33b5cf 100644 --- a/include/linux/mfd/macsmc.h +++ b/include/linux/mfd/macsmc.h @@ -41,6 +41,7 @@ typedef u32 smc_key; */ #define SMC_KEY(s) (smc_key)(_SMC_KEY(#s)) #define _SMC_KEY(s) (((s)[0] << 24) | ((s)[1] << 16) | ((s)[2] << 8) | (s)[3]) +#define __SMC_KEY(a, b, c, d) (((u32)(a) << 24) | ((u32)(b) << 16) | ((u32)(c) << 8) | ((u32)(d))) #define APPLE_SMC_READABLE BIT(7) #define APPLE_SMC_WRITABLE BIT(6) -- cgit v1.2.3 From d306cbbc34cc9aa6ed2235472110fe797f887db7 Mon Sep 17 00:00:00 2001 From: Atharva Tiwari Date: Tue, 7 Oct 2025 18:35:10 +0530 Subject: mfd: macsmc: Make SMC write buffers const Mark the write buffer arguments in apple_smc_write(), apple_smc_rw(), and apple_smc_write_atomic() as const. These functions do not modify the data provided by the caller, so the parameters should be const qualified. 
Signed-off-by: Atharva Tiwari Reviewed-by: Sven Peter Signed-off-by: Lee Jones --- include/linux/mfd/macsmc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mfd/macsmc.h b/include/linux/mfd/macsmc.h index f6f80c33b5cf..cc09ecce0df7 100644 --- a/include/linux/mfd/macsmc.h +++ b/include/linux/mfd/macsmc.h @@ -150,7 +150,7 @@ int apple_smc_read(struct apple_smc *smc, smc_key key, void *buf, size_t size); * * Return: Zero on success, negative errno on error */ -int apple_smc_write(struct apple_smc *smc, smc_key key, void *buf, size_t size); +int apple_smc_write(struct apple_smc *smc, smc_key key, const void *buf, size_t size); /** * apple_smc_enter_atomic - Enter atomic mode to be able to use apple_smc_write_atomic @@ -177,7 +177,7 @@ int apple_smc_enter_atomic(struct apple_smc *smc); * * Return: Zero on success, negative errno on error */ -int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t size); +int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, const void *buf, size_t size); /** * apple_smc_rw - Write and then read using the given SMC key @@ -190,7 +190,7 @@ int apple_smc_write_atomic(struct apple_smc *smc, smc_key key, void *buf, size_t * * Return: Zero on success, negative errno on error */ -int apple_smc_rw(struct apple_smc *smc, smc_key key, void *wbuf, size_t wsize, +int apple_smc_rw(struct apple_smc *smc, smc_key key, const void *wbuf, size_t wsize, void *rbuf, size_t rsize); /** -- cgit v1.2.3 From 617347e716178d3a317a129ece05116967f06d53 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Wed, 25 Jun 2025 14:32:58 +0100 Subject: mfd: wl1273-core: Remove the header The wl1273 FM radio is on Arnd's unused driver list: https://lore.kernel.org/lkml/a15bb180-401d-49ad-a212-0c81d613fbc8@app.fastmail.com/ Other patches have removed the core, the ASoC code and the Radio code. With all those in, remove the header. Also, tidy the ref in the docs. 
Signed-off-by: Dr. David Alan Gilbert Acked-by: Arnd Bergmann Signed-off-by: Lee Jones --- include/linux/mfd/wl1273-core.h | 277 ---------------------------------------- 1 file changed, 277 deletions(-) delete mode 100644 include/linux/mfd/wl1273-core.h (limited to 'include') diff --git a/include/linux/mfd/wl1273-core.h b/include/linux/mfd/wl1273-core.h deleted file mode 100644 index c28cf76d5c31..000000000000 --- a/include/linux/mfd/wl1273-core.h +++ /dev/null @@ -1,277 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * include/linux/mfd/wl1273-core.h - * - * Some definitions for the wl1273 radio receiver/transmitter chip. - * - * Copyright (C) 2010 Nokia Corporation - * Author: Matti J. Aaltonen - */ - -#ifndef WL1273_CORE_H -#define WL1273_CORE_H - -#include -#include - -#define WL1273_FM_DRIVER_NAME "wl1273-fm" -#define RX71_FM_I2C_ADDR 0x22 - -#define WL1273_STEREO_GET 0 -#define WL1273_RSSI_LVL_GET 1 -#define WL1273_IF_COUNT_GET 2 -#define WL1273_FLAG_GET 3 -#define WL1273_RDS_SYNC_GET 4 -#define WL1273_RDS_DATA_GET 5 -#define WL1273_FREQ_SET 10 -#define WL1273_AF_FREQ_SET 11 -#define WL1273_MOST_MODE_SET 12 -#define WL1273_MOST_BLEND_SET 13 -#define WL1273_DEMPH_MODE_SET 14 -#define WL1273_SEARCH_LVL_SET 15 -#define WL1273_BAND_SET 16 -#define WL1273_MUTE_STATUS_SET 17 -#define WL1273_RDS_PAUSE_LVL_SET 18 -#define WL1273_RDS_PAUSE_DUR_SET 19 -#define WL1273_RDS_MEM_SET 20 -#define WL1273_RDS_BLK_B_SET 21 -#define WL1273_RDS_MSK_B_SET 22 -#define WL1273_RDS_PI_MASK_SET 23 -#define WL1273_RDS_PI_SET 24 -#define WL1273_RDS_SYSTEM_SET 25 -#define WL1273_INT_MASK_SET 26 -#define WL1273_SEARCH_DIR_SET 27 -#define WL1273_VOLUME_SET 28 -#define WL1273_AUDIO_ENABLE 29 -#define WL1273_PCM_MODE_SET 30 -#define WL1273_I2S_MODE_CONFIG_SET 31 -#define WL1273_POWER_SET 32 -#define WL1273_INTX_CONFIG_SET 33 -#define WL1273_PULL_EN_SET 34 -#define WL1273_HILO_SET 35 -#define WL1273_SWITCH2FREF 36 -#define WL1273_FREQ_DRIFT_REPORT 37 - -#define WL1273_PCE_GET 40 
-#define WL1273_FIRM_VER_GET 41 -#define WL1273_ASIC_VER_GET 42 -#define WL1273_ASIC_ID_GET 43 -#define WL1273_MAN_ID_GET 44 -#define WL1273_TUNER_MODE_SET 45 -#define WL1273_STOP_SEARCH 46 -#define WL1273_RDS_CNTRL_SET 47 - -#define WL1273_WRITE_HARDWARE_REG 100 -#define WL1273_CODE_DOWNLOAD 101 -#define WL1273_RESET 102 - -#define WL1273_FM_POWER_MODE 254 -#define WL1273_FM_INTERRUPT 255 - -/* Transmitter API */ - -#define WL1273_CHANL_SET 55 -#define WL1273_SCAN_SPACING_SET 56 -#define WL1273_REF_SET 57 -#define WL1273_POWER_ENB_SET 90 -#define WL1273_POWER_ATT_SET 58 -#define WL1273_POWER_LEV_SET 59 -#define WL1273_AUDIO_DEV_SET 60 -#define WL1273_PILOT_DEV_SET 61 -#define WL1273_RDS_DEV_SET 62 -#define WL1273_PUPD_SET 91 -#define WL1273_AUDIO_IO_SET 63 -#define WL1273_PREMPH_SET 64 -#define WL1273_MONO_SET 66 -#define WL1273_MUTE 92 -#define WL1273_MPX_LMT_ENABLE 67 -#define WL1273_PI_SET 93 -#define WL1273_ECC_SET 69 -#define WL1273_PTY 70 -#define WL1273_AF 71 -#define WL1273_DISPLAY_MODE 74 -#define WL1273_RDS_REP_SET 77 -#define WL1273_RDS_CONFIG_DATA_SET 98 -#define WL1273_RDS_DATA_SET 99 -#define WL1273_RDS_DATA_ENB 94 -#define WL1273_TA_SET 78 -#define WL1273_TP_SET 79 -#define WL1273_DI_SET 80 -#define WL1273_MS_SET 81 -#define WL1273_PS_SCROLL_SPEED 82 -#define WL1273_TX_AUDIO_LEVEL_TEST 96 -#define WL1273_TX_AUDIO_LEVEL_TEST_THRESHOLD 73 -#define WL1273_TX_AUDIO_INPUT_LEVEL_RANGE_SET 54 -#define WL1273_RX_ANTENNA_SELECT 87 -#define WL1273_I2C_DEV_ADDR_SET 86 -#define WL1273_REF_ERR_CALIB_PARAM_SET 88 -#define WL1273_REF_ERR_CALIB_PERIODICITY_SET 89 -#define WL1273_SOC_INT_TRIGGER 52 -#define WL1273_SOC_AUDIO_PATH_SET 83 -#define WL1273_SOC_PCMI_OVERRIDE 84 -#define WL1273_SOC_I2S_OVERRIDE 85 -#define WL1273_RSSI_BLOCK_SCAN_FREQ_SET 95 -#define WL1273_RSSI_BLOCK_SCAN_START 97 -#define WL1273_RSSI_BLOCK_SCAN_DATA_GET 5 -#define WL1273_READ_FMANT_TUNE_VALUE 104 - -#define WL1273_RDS_OFF 0 -#define WL1273_RDS_ON 1 -#define WL1273_RDS_RESET 2 - -#define 
WL1273_AUDIO_DIGITAL 0 -#define WL1273_AUDIO_ANALOG 1 - -#define WL1273_MODE_RX BIT(0) -#define WL1273_MODE_TX BIT(1) -#define WL1273_MODE_OFF BIT(2) -#define WL1273_MODE_SUSPENDED BIT(3) - -#define WL1273_RADIO_CHILD BIT(0) -#define WL1273_CODEC_CHILD BIT(1) - -#define WL1273_RX_MONO 1 -#define WL1273_RX_STEREO 0 -#define WL1273_TX_MONO 0 -#define WL1273_TX_STEREO 1 - -#define WL1273_MAX_VOLUME 0xffff -#define WL1273_DEFAULT_VOLUME 0x78b8 - -/* I2S protocol, left channel first, data width 16 bits */ -#define WL1273_PCM_DEF_MODE 0x00 - -/* Rx */ -#define WL1273_AUDIO_ENABLE_I2S BIT(0) -#define WL1273_AUDIO_ENABLE_ANALOG BIT(1) - -/* Tx */ -#define WL1273_AUDIO_IO_SET_ANALOG 0 -#define WL1273_AUDIO_IO_SET_I2S 1 - -#define WL1273_PUPD_SET_OFF 0x00 -#define WL1273_PUPD_SET_ON 0x01 -#define WL1273_PUPD_SET_RETENTION 0x10 - -/* I2S mode */ -#define WL1273_IS2_WIDTH_32 0x0 -#define WL1273_IS2_WIDTH_40 0x1 -#define WL1273_IS2_WIDTH_22_23 0x2 -#define WL1273_IS2_WIDTH_23_22 0x3 -#define WL1273_IS2_WIDTH_48 0x4 -#define WL1273_IS2_WIDTH_50 0x5 -#define WL1273_IS2_WIDTH_60 0x6 -#define WL1273_IS2_WIDTH_64 0x7 -#define WL1273_IS2_WIDTH_80 0x8 -#define WL1273_IS2_WIDTH_96 0x9 -#define WL1273_IS2_WIDTH_128 0xa -#define WL1273_IS2_WIDTH 0xf - -#define WL1273_IS2_FORMAT_STD (0x0 << 4) -#define WL1273_IS2_FORMAT_LEFT (0x1 << 4) -#define WL1273_IS2_FORMAT_RIGHT (0x2 << 4) -#define WL1273_IS2_FORMAT_USER (0x3 << 4) - -#define WL1273_IS2_MASTER (0x0 << 6) -#define WL1273_IS2_SLAVEW (0x1 << 6) - -#define WL1273_IS2_TRI_AFTER_SENDING (0x0 << 7) -#define WL1273_IS2_TRI_ALWAYS_ACTIVE (0x1 << 7) - -#define WL1273_IS2_SDOWS_RR (0x0 << 8) -#define WL1273_IS2_SDOWS_RF (0x1 << 8) -#define WL1273_IS2_SDOWS_FR (0x2 << 8) -#define WL1273_IS2_SDOWS_FF (0x3 << 8) - -#define WL1273_IS2_TRI_OPT (0x0 << 10) -#define WL1273_IS2_TRI_ALWAYS (0x1 << 10) - -#define WL1273_IS2_RATE_48K (0x0 << 12) -#define WL1273_IS2_RATE_44_1K (0x1 << 12) -#define WL1273_IS2_RATE_32K (0x2 << 12) -#define 
WL1273_IS2_RATE_22_05K (0x4 << 12) -#define WL1273_IS2_RATE_16K (0x5 << 12) -#define WL1273_IS2_RATE_12K (0x8 << 12) -#define WL1273_IS2_RATE_11_025 (0x9 << 12) -#define WL1273_IS2_RATE_8K (0xa << 12) -#define WL1273_IS2_RATE (0xf << 12) - -#define WL1273_I2S_DEF_MODE (WL1273_IS2_WIDTH_32 | \ - WL1273_IS2_FORMAT_STD | \ - WL1273_IS2_MASTER | \ - WL1273_IS2_TRI_AFTER_SENDING | \ - WL1273_IS2_SDOWS_RR | \ - WL1273_IS2_TRI_OPT | \ - WL1273_IS2_RATE_48K) - -#define SCHAR_MIN (-128) -#define SCHAR_MAX 127 - -#define WL1273_FR_EVENT BIT(0) -#define WL1273_BL_EVENT BIT(1) -#define WL1273_RDS_EVENT BIT(2) -#define WL1273_BBLK_EVENT BIT(3) -#define WL1273_LSYNC_EVENT BIT(4) -#define WL1273_LEV_EVENT BIT(5) -#define WL1273_IFFR_EVENT BIT(6) -#define WL1273_PI_EVENT BIT(7) -#define WL1273_PD_EVENT BIT(8) -#define WL1273_STIC_EVENT BIT(9) -#define WL1273_MAL_EVENT BIT(10) -#define WL1273_POW_ENB_EVENT BIT(11) -#define WL1273_SCAN_OVER_EVENT BIT(12) -#define WL1273_ERROR_EVENT BIT(13) - -#define TUNER_MODE_STOP_SEARCH 0 -#define TUNER_MODE_PRESET 1 -#define TUNER_MODE_AUTO_SEEK 2 -#define TUNER_MODE_AF 3 -#define TUNER_MODE_AUTO_SEEK_PI 4 -#define TUNER_MODE_AUTO_SEEK_BULK 5 - -#define RDS_BLOCK_SIZE 3 - -struct wl1273_fm_platform_data { - int (*request_resources) (struct i2c_client *client); - void (*free_resources) (void); - void (*enable) (void); - void (*disable) (void); - - u8 forbidden_modes; - unsigned int children; -}; - -#define WL1273_FM_CORE_CELLS 2 - -#define WL1273_BAND_OTHER 0 -#define WL1273_BAND_JAPAN 1 - -#define WL1273_BAND_JAPAN_LOW 76000 -#define WL1273_BAND_JAPAN_HIGH 90000 -#define WL1273_BAND_OTHER_LOW 87500 -#define WL1273_BAND_OTHER_HIGH 108000 - -#define WL1273_BAND_TX_LOW 76000 -#define WL1273_BAND_TX_HIGH 108000 - -struct wl1273_core { - struct mfd_cell cells[WL1273_FM_CORE_CELLS]; - struct wl1273_fm_platform_data *pdata; - - unsigned int mode; - unsigned int i2s_mode; - unsigned int volume; - unsigned int audio_mode; - unsigned int channel_number; - 
struct mutex lock; /* for serializing fm radio operations */ - - struct i2c_client *client; - - int (*read)(struct wl1273_core *core, u8, u16 *); - int (*write)(struct wl1273_core *core, u8, u16); - int (*write_data)(struct wl1273_core *core, u8 *, u16); - int (*set_audio)(struct wl1273_core *core, unsigned int); - int (*set_volume)(struct wl1273_core *core, unsigned int); -}; - -#endif /* ifndef WL1273_CORE_H */ -- cgit v1.2.3 From dd064d5101ea473d39c39ffaa8beeb8f47bbeb09 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:18 +0800 Subject: ext4: introduce seq counter for the extent status entry In the iomap_write_iter(), the iomap buffered write frame does not hold any locks between querying the inode extent mapping info and performing page cache writes. As a result, the extent mapping can be changed due to concurrent I/O in flight. Similarly, in the iomap_writepage_map(), the write-back process faces a similar problem: concurrent changes can invalidate the extent mapping before the I/O is submitted. Therefore, both of these processes must recheck the mapping info after acquiring the folio lock. To address this, similar to XFS, we propose introducing an extent sequence number to serve as a validity cookie for the extent. After commit 24b7a2331fcd ("ext4: clairfy the rules for modifying extents"), we can ensure the extent information should always be processed through the extent status tree, and the extent status tree is always uptodate under i_rwsem or invalidate_lock or folio lock, so it's safe to introduce this sequence number. The sequence number will be increased whenever the extent status tree changes, preparing for the buffered write iomap conversion. Besides, this mechanism is also applicable for the moving extents case. In move_extent_per_page(), it also needs to reacquire data_sem and check the mapping info again under the folio lock. 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-3-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index a374e7ea7e57..6a0754d38acf 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2210,7 +2210,8 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) - __field( char, status ) + __field( char, status ) + __field( u64, seq ) ), TP_fast_assign( @@ -2220,13 +2221,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, - __entry->pblk, show_extent_status(__entry->status)) + __entry->pblk, show_extent_status(__entry->status), + __entry->seq) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, @@ -2251,6 +2254,7 @@ TRACE_EVENT(ext4_es_remove_extent, __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) + __field( u64, seq ) ), TP_fast_assign( @@ -2258,12 +2262,13 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->lblk, __entry->len) + __entry->lblk, __entry->len, __entry->seq) ); TRACE_EVENT(ext4_es_find_extent_range_enter, @@ -2523,6 +2528,7 @@ 
TRACE_EVENT(ext4_es_insert_delayed_extent, __field( char, status ) __field( bool, lclu_allocated ) __field( bool, end_allocated ) + __field( u64, seq ) ), TP_fast_assign( @@ -2534,15 +2540,16 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, __entry->status = ext4_es_status(es); __entry->lclu_allocated = lclu_allocated; __entry->end_allocated = end_allocated; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " - "allocated %d %d", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), - __entry->lclu_allocated, __entry->end_allocated) + __entry->lclu_allocated, __entry->end_allocated, + __entry->seq) ); /* fsmap traces */ -- cgit v1.2.3 From 9dbf945320b11c5865d2f550f8e972566d04d181 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Mon, 13 Oct 2025 09:51:28 +0800 Subject: ext4: add two trace points for moving extents To facilitate tracking the length, type, and outcome of the move extent, add a trace point at both the entry and exit of mext_move_extent(). 
Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Message-ID: <20251013015128.499308-13-yi.zhang@huaweicloud.com> Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6a0754d38acf..a05bdd48e16e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -3016,6 +3016,80 @@ TRACE_EVENT(ext4_update_sb, __entry->fsblk, __entry->flags) ); +TRACE_EVENT(ext4_move_extent_enter, + TP_PROTO(struct inode *orig_inode, struct ext4_map_blocks *orig_map, + struct inode *donor_inode, ext4_lblk_t donor_lblk), + + TP_ARGS(orig_inode, orig_map, donor_inode, donor_lblk), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, orig_ino) + __field(ext4_lblk_t, orig_lblk) + __field(unsigned int, orig_flags) + __field(ino_t, donor_ino) + __field(ext4_lblk_t, donor_lblk) + __field(unsigned int, len) + ), + + TP_fast_assign( + __entry->dev = orig_inode->i_sb->s_dev; + __entry->orig_ino = orig_inode->i_ino; + __entry->orig_lblk = orig_map->m_lblk; + __entry->orig_flags = orig_map->m_flags; + __entry->donor_ino = donor_inode->i_ino; + __entry->donor_lblk = donor_lblk; + __entry->len = orig_map->m_len; + ), + + TP_printk("dev %d,%d origin ino %lu lblk %u flags %s donor ino %lu lblk %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->orig_ino, __entry->orig_lblk, + show_mflags(__entry->orig_flags), + (unsigned long) __entry->donor_ino, __entry->donor_lblk, + __entry->len) +); + +TRACE_EVENT(ext4_move_extent_exit, + TP_PROTO(struct inode *orig_inode, ext4_lblk_t orig_lblk, + struct inode *donor_inode, ext4_lblk_t donor_lblk, + unsigned int m_len, u64 move_len, int move_type, int ret), + + TP_ARGS(orig_inode, orig_lblk, donor_inode, donor_lblk, m_len, + move_len, move_type, ret), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, orig_ino) + 
__field(ext4_lblk_t, orig_lblk) + __field(ino_t, donor_ino) + __field(ext4_lblk_t, donor_lblk) + __field(unsigned int, m_len) + __field(u64, move_len) + __field(int, move_type) + __field(int, ret) + ), + + TP_fast_assign( + __entry->dev = orig_inode->i_sb->s_dev; + __entry->orig_ino = orig_inode->i_ino; + __entry->orig_lblk = orig_lblk; + __entry->donor_ino = donor_inode->i_ino; + __entry->donor_lblk = donor_lblk; + __entry->m_len = m_len; + __entry->move_len = move_len; + __entry->move_type = move_type; + __entry->ret = ret; + ), + + TP_printk("dev %d,%d origin ino %lu lblk %u donor ino %lu lblk %u m_len %u, move_len %llu type %d ret %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->orig_ino, __entry->orig_lblk, + (unsigned long) __entry->donor_ino, __entry->donor_lblk, + __entry->m_len, __entry->move_len, __entry->move_type, + __entry->ret) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 6a571d762cda6c25517c5533b8bd06d56028cdcb Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 4 Nov 2025 18:39:05 +0530 Subject: soc: qcom: socinfo: Add support for new fields in revision 20 Add support for socinfo version 20. Version 20 adds a new package id field whose zeroth bit contains information that can be used to tune temperature thresholds on devices which might be able to withstand higher temperatures. A zeroth bit value of 1 means that heat dissipation is better and a more relaxed thermal scheme can be put in place, while 0 means a more aggressive scheme may be needed.
Reviewed-by: Konrad Dybcio Reviewed-by: Dmitry Baryshkov Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20251104130906.167666-1-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/socinfo.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index 608950443eee..c4dae173cc30 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -82,6 +82,8 @@ struct socinfo { __le32 num_func_clusters; __le32 boot_cluster; __le32 boot_core; + /* Version 20 */ + __le32 raw_package_type; }; /* Internal feature codes */ -- cgit v1.2.3 From 6918667af5a7315eff3c56d871be4c5439f7f9d2 Mon Sep 17 00:00:00 2001 From: Mukesh Ojha Date: Tue, 4 Nov 2025 18:39:06 +0530 Subject: soc: qcom: socinfo: Add reserve field to support future extension Some of the new fields added to the socinfo structure in versions 21, 22 and 23 are only used by boot firmware and are of no use to Linux. Add a reserve field in socinfo so that the structure remains up to date and is prepared for any new field added in future that could be used by Linux. While at it, also update the switch case for backward compatibility if the SoC runs with boot firmware that has these new versions added.
Signed-off-by: Mukesh Ojha Link: https://lore.kernel.org/r/20251104130906.167666-2-mukesh.ojha@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/linux/soc/qcom/socinfo.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soc/qcom/socinfo.h b/include/linux/soc/qcom/socinfo.h index c4dae173cc30..ba823a0013c5 100644 --- a/include/linux/soc/qcom/socinfo.h +++ b/include/linux/soc/qcom/socinfo.h @@ -84,6 +84,8 @@ struct socinfo { __le32 boot_core; /* Version 20 */ __le32 raw_package_type; + /* Version 21, 22, 23 */ + __le32 reserve1[4]; }; /* Internal feature codes */ -- cgit v1.2.3 From 9352d40c8bcd2ef29366d2c38b163c0b115039ed Mon Sep 17 00:00:00 2001 From: Mohammad Heib Date: Sat, 25 Oct 2025 16:08:58 +0300 Subject: devlink: Add new "max_mac_per_vf" generic device param Add a new device generic parameter to control the maximum number of MAC filters allowed per VF. For example, to limit a VF to 3 MAC addresses: $ devlink dev param set pci/0000:3b:00.0 name max_mac_per_vf \ value 3 \ cmode runtime Signed-off-by: Mohammad Heib Reviewed-by: Simon Horman Signed-off-by: Jacob Keller Signed-off-by: Tony Nguyen --- include/net/devlink.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 9e824f61e40f..d01046ef0577 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -532,6 +532,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -602,6 +603,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME "num_doorbells" #define DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME "max_mac_per_vf" +#define DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE
DEVLINK_PARAM_TYPE_U32 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ -- cgit v1.2.3 From c6230446b1a6f3c91effafd99f604de455da52e5 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Mon, 3 Nov 2025 12:20:20 +0000 Subject: net: dsa: add tagging driver for MaxLinear GSW1xx switch family Add support for a new DSA tagging protocol driver for the MaxLinear GSW1xx switch family. The GSW1xx switches use a proprietary 8-byte special tag inserted between the source MAC address and the EtherType field to indicate the source and destination ports for frames traversing the CPU port. Implement the tag handling logic to insert the special tag on transmit and parse it on receive. Signed-off-by: Daniel Golle Reviewed-by: Alexander Sverdlin Tested-by: Alexander Sverdlin Link: https://patch.msgid.link/0e973ebfd9433c30c96f50670da9e9449a0d98f2.1762170107.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 ++ include/uapi/linux/if_ether.h | 1 + 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 67762fdaf3c7..2df2e2ead9a8 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -56,6 +56,7 @@ struct tc_action; #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 #define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 #define DSA_TAG_PROTO_YT921X_VALUE 30 +#define DSA_TAG_PROTO_MXL_GSW1XX_VALUE 31 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, @@ -89,6 +90,7 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_LAN937X = DSA_TAG_PROTO_LAN937X_VALUE, DSA_TAG_PROTO_VSC73XX_8021Q = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE, DSA_TAG_PROTO_YT921X = DSA_TAG_PROTO_YT921X_VALUE, + DSA_TAG_PROTO_MXL_GSW1XX = DSA_TAG_PROTO_MXL_GSW1XX_VALUE, }; struct dsa_switch; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index cfd200c87e5e..2c93b7b731c8 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -92,6 
+92,7 @@ #define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. */ +#define ETH_P_MXLGSW 0x88C3 /* MaxLinear GSW DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ -- cgit v1.2.3 From 38724a474c0fc37b6604e8b20c75d87446fc2fd1 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Thu, 30 Oct 2025 14:59:46 +0100 Subject: ice: add virtchnl definitions and static data for GTP RSS Add virtchnl protocol header and field definitions for advanced RSS configuration including GTPC, GTPU, L2TPv2, ECPRI, PPP, GRE, and IP fragment headers. - Define new virtchnl protocol header types - Add RSS field selectors for tunnel protocols - Extend static mapping arrays for protocol field matching - Add L2TPv2 session ID and length+session ID field support This provides the foundational definitions needed for VF RSS configuration of tunnel protocols. 
Co-developed-by: Dan Nowlin Signed-off-by: Dan Nowlin Co-developed-by: Jie Wang Signed-off-by: Jie Wang Co-developed-by: Junfeng Guo Signed-off-by: Junfeng Guo Co-developed-by: Qi Zhang Signed-off-by: Qi Zhang Co-developed-by: Ting Xu Signed-off-by: Ting Xu Signed-off-by: Przemek Kitszel Signed-off-by: Aleksandr Loktionov Signed-off-by: Tony Nguyen --- include/linux/avf/virtchnl.h | 50 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'include') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 5be1881abbb6..11bdab5522fd 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -1253,6 +1253,17 @@ enum virtchnl_proto_hdr_type { VIRTCHNL_PROTO_HDR_ESP, VIRTCHNL_PROTO_HDR_AH, VIRTCHNL_PROTO_HDR_PFCP, + VIRTCHNL_PROTO_HDR_GTPC, + VIRTCHNL_PROTO_HDR_ECPRI, + VIRTCHNL_PROTO_HDR_L2TPV2, + VIRTCHNL_PROTO_HDR_PPP, + /* IPv4 and IPv6 Fragment header types are only associated to + * VIRTCHNL_PROTO_HDR_IPV4 and VIRTCHNL_PROTO_HDR_IPV6 respectively, + * cannot be used independently. + */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG, + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG, + VIRTCHNL_PROTO_HDR_GRE, }; /* Protocol header field within a protocol header. 
*/ @@ -1275,6 +1286,7 @@ enum virtchnl_proto_hdr_field { VIRTCHNL_PROTO_HDR_IPV4_DSCP, VIRTCHNL_PROTO_HDR_IPV4_TTL, VIRTCHNL_PROTO_HDR_IPV4_PROT, + VIRTCHNL_PROTO_HDR_IPV4_CHKSUM, /* IPV6 */ VIRTCHNL_PROTO_HDR_IPV6_SRC = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6), @@ -1282,18 +1294,34 @@ enum virtchnl_proto_hdr_field { VIRTCHNL_PROTO_HDR_IPV6_TC, VIRTCHNL_PROTO_HDR_IPV6_HOP_LIMIT, VIRTCHNL_PROTO_HDR_IPV6_PROT, + /* IPV6 Prefix */ + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX32_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX40_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX48_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX56_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX64_DST, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_SRC, + VIRTCHNL_PROTO_HDR_IPV6_PREFIX96_DST, /* TCP */ VIRTCHNL_PROTO_HDR_TCP_SRC_PORT = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_TCP), VIRTCHNL_PROTO_HDR_TCP_DST_PORT, + VIRTCHNL_PROTO_HDR_TCP_CHKSUM, /* UDP */ VIRTCHNL_PROTO_HDR_UDP_SRC_PORT = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_UDP), VIRTCHNL_PROTO_HDR_UDP_DST_PORT, + VIRTCHNL_PROTO_HDR_UDP_CHKSUM, /* SCTP */ VIRTCHNL_PROTO_HDR_SCTP_SRC_PORT = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_SCTP), VIRTCHNL_PROTO_HDR_SCTP_DST_PORT, + VIRTCHNL_PROTO_HDR_SCTP_CHKSUM, /* GTPU_IP */ VIRTCHNL_PROTO_HDR_GTPU_IP_TEID = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_IP), @@ -1317,6 +1345,28 @@ enum virtchnl_proto_hdr_field { VIRTCHNL_PROTO_HDR_PFCP_S_FIELD = PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_PFCP), VIRTCHNL_PROTO_HDR_PFCP_SEID, + /* GTPC */ + VIRTCHNL_PROTO_HDR_GTPC_TEID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPC), + /* ECPRI */ + VIRTCHNL_PROTO_HDR_ECPRI_MSG_TYPE = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_ECPRI), + VIRTCHNL_PROTO_HDR_ECPRI_PC_RTC_ID, + /* IPv4 Dummy Fragment */ + VIRTCHNL_PROTO_HDR_IPV4_FRAG_PKID = + 
PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV4_FRAG), + /* IPv6 Extension Fragment */ + VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG_PKID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_IPV6_EH_FRAG), + /* GTPU_DWN/UP */ + VIRTCHNL_PROTO_HDR_GTPU_DWN_QFI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_DWN), + VIRTCHNL_PROTO_HDR_GTPU_UP_QFI = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_GTPU_EH_PDU_UP), + /* L2TPv2 */ + VIRTCHNL_PROTO_HDR_L2TPV2_SESS_ID = + PROTO_HDR_FIELD_START(VIRTCHNL_PROTO_HDR_L2TPV2), + VIRTCHNL_PROTO_HDR_L2TPV2_LEN_SESS_ID, }; struct virtchnl_proto_hdr { -- cgit v1.2.3 From 9311e6c29b348b005e79228ef6facd38ebcc73f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Nov 2025 08:12:36 -1000 Subject: cgroup: Fix sleeping from invalid context warning on PREEMPT_RT cgroup_task_dead() is called from finish_task_switch() which runs with preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The function needs to acquire css_set_lock which is a regular spinlock that can sleep on RT kernels, leading to "sleeping function called from invalid context" warnings. css_set_lock is too large in scope to convert to a raw_spinlock. However, the unlinking operations don't need to run synchronously - they just need to complete after the task is done running. On PREEMPT_RT, defer the work through irq_work. While the work doesn't need to happen immediately, it can't be delayed indefinitely either as the dead task pins the cgroup and task_struct can be pinned indefinitely. Use the lazy version of irq_work to allow batching and lower impact while ensuring timely completion. v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior). 
Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out") Reported-by: Calvin Owens Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org Signed-off-by: Tejun Heo --- include/linux/sched.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index cbb7340c5866..5e80d48488ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1324,7 +1324,10 @@ struct task_struct { struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; -#endif +#ifdef CONFIG_PREEMPT_RT + struct llist_node cg_dead_lnode; +#endif /* CONFIG_PREEMPT_RT */ +#endif /* CONFIG_CGROUPS */ #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; -- cgit v1.2.3 From 15638d52cbcf6e969f4a5e2757b118355db583f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2025 14:52:15 -0500 Subject: block: fix cached zone reporting after zone append was used No zone plugs are allocated when a zone is opened by calling Zone Append on it. This makes the cached zone reporting incorrectly report zones as empty if the file system is unmounted and report zones is called after that, e.g. by xfstests test cases using the scratch device. Fix this by recording if zone append was used on a device, and disable cached reporting for the device until a ZONE_RESET_ALL happens that guarantees all zones are empty. We could probably do even better using a per-zone flag, but the practical use cases for zone reporting after the initial mount are rather limited, so let's keep things simple for now.
Fixes: 31f0656a4ab7 ("block: introduce blkdev_report_zones_cached()") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f0ab02e0a673..6a498aa7f7e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -173,6 +173,7 @@ struct gendisk { #define GD_ADDED 4 #define GD_SUPPRESS_PART_SCAN 5 #define GD_OWNS_QUEUE 6 +#define GD_ZONE_APPEND_USED 7 struct mutex open_mutex; /* open/close mutex */ unsigned open_partitions; /* number of open partitions */ -- cgit v1.2.3 From 24ab8efb9aea77764dd99d2bad41fd8991223013 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 31 Oct 2025 22:20:55 +0100 Subject: xsk: Move NETDEV_XDP_ACT_ZC into generic header Move NETDEV_XDP_ACT_ZC into xdp_sock_drv.h header such that external code can reuse it, and rename it into more generic NETDEV_XDP_ACT_XSK. 
Signed-off-by: Daniel Borkmann Co-developed-by: David Wei Signed-off-by: David Wei Reviewed-by: Maciej Fijalkowski Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20251031212103.310683-7-daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- include/net/xdp_sock_drv.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 4f2d3268a676..242e34f771cc 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -12,6 +12,10 @@ #define XDP_UMEM_MIN_CHUNK_SHIFT 11 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT) +#define NETDEV_XDP_ACT_XSK (NETDEV_XDP_ACT_BASIC | \ + NETDEV_XDP_ACT_REDIRECT | \ + NETDEV_XDP_ACT_XSK_ZEROCOPY) + struct xsk_cb_desc { void *src; u8 off; -- cgit v1.2.3 From 2f6b2565d43cdb5087cac23d530cca84aa3d897e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 14 Oct 2025 08:04:55 -0700 Subject: block: accumulate memory segment gaps per bio The blk-mq dma iterator has an optimization for requests that align to the device's iommu merge boundary. This boundary may be larger than the device's virtual boundary, but the code had been depending on that queue limit to know ahead of time if the request is guaranteed to align to that optimization. Rather than rely on that queue limit, which many devices may not report, save the lowest set bit of any boundary gap between each segment in the bio while checking the segments. The request stores the value for merging and quickly checking per io if the request can use iova optimizations. Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 ++ include/linux/blk-mq.h | 16 ++++++++++++++++ include/linux/blk_types.h | 12 ++++++++++++ 3 files changed, 30 insertions(+) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 16c1c85613b7..ad2d57908c1c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -324,6 +324,8 @@ extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); int bio_split_io_at(struct bio *bio, const struct queue_limits *lim, unsigned *segs, unsigned max_bytes, unsigned len_align); +u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next, + u8 gaps_bit); /** * bio_next_split - get next @sectors from a bio, splitting if necessary diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b25d12545f46..b54506b3b76d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,6 +152,14 @@ struct request { unsigned short nr_phys_segments; unsigned short nr_integrity_segments; + /* + * The lowest set bit for address gaps between physical segments. This + * provides information necessary for dma optimization opportunities, + * like for testing if the segments can be coalesced against the + * device's iommu granule. + */ + unsigned char phys_gap_bit; + #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; @@ -208,6 +216,14 @@ struct request { void *end_io_data; }; +/* + * Returns a mask with all bits starting at req->phys_gap_bit set to 1. 
+ */ +static inline unsigned long req_phys_gap_mask(const struct request *req) +{ + return ~(((1 << req->phys_gap_bit) >> 1) - 1); +} + static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8e8d1cc8b06c..53501ebb0623 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -218,6 +218,18 @@ struct bio { enum rw_hint bi_write_hint; u8 bi_write_stream; blk_status_t bi_status; + + /* + * The bvec gap bit indicates the lowest set bit in any address offset + * between all bi_io_vecs. This field is initialized only after the bio + * is split to the hardware limits (see bio_split_io_at()). The value + * may be used to consider DMA optimization when performing that + * mapping. The value is compared to a power of two mask where the + * result depends on any bit set within the mask, so saving the lowest + * bit is sufficient to know if any segment gap collides with the mask. + */ + u8 bi_bvec_gap_bit; + atomic_t __bi_remaining; struct bvec_iter bi_iter; -- cgit v1.2.3 From 4c0a17e28340e458627d672564200406e220d6a3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 5 Nov 2025 10:05:33 +0100 Subject: slab: prevent recursive kmalloc() in alloc_empty_sheaf() We want to expand usage of sheaves to all non-boot caches, including kmalloc caches. Since sheaves themselves are also allocated by kmalloc(), we need to prevent excessive or infinite recursion - depending on sheaf size, the sheaf can be allocated from smaller, same or larger kmalloc size bucket, there's no particular constraint. This is similar to allocating the objext arrays so let's just reuse the existing mechanisms for those. 
__GFP_NO_OBJ_EXT in alloc_empty_sheaf() will prevent a nested kmalloc() from allocating a sheaf itself - it will either have sheaves already, or fallback to a non-sheaf-cached allocation (so bootstrap of sheaves in a kmalloc cache that allocates sheaves from its own size bucket is possible). Additionally, reuse OBJCGS_CLEAR_MASK to clear unwanted gfp flags from the nested allocation. Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-5-b8218e1ac7ef@suse.cz Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/gfp_types.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 65db9349f905..3de43b12209e 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -55,9 +55,7 @@ enum { #ifdef CONFIG_LOCKDEP ___GFP_NOLOCKDEP_BIT, #endif -#ifdef CONFIG_SLAB_OBJ_EXT ___GFP_NO_OBJ_EXT_BIT, -#endif ___GFP_LAST_BIT }; @@ -98,11 +96,7 @@ enum { #else #define ___GFP_NOLOCKDEP 0 #endif -#ifdef CONFIG_SLAB_OBJ_EXT #define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) -#else -#define ___GFP_NO_OBJ_EXT 0 -#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) -- cgit v1.2.3 From ce284f882022ebcb953984c7eccf4fc4eb531978 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Thu, 16 Oct 2025 15:38:01 +0200 Subject: pwm: Export `pwmchip_release` for external use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upcoming Rust abstraction layer for the PWM subsystem uses a custom `dev->release` handler to safely manage the lifetime of its driver data. To prevent leaking the memory of the `struct pwm_chip` (allocated by `pwmchip_alloc`), this custom handler must also call the original `pwmchip_release` function to complete the cleanup. Make `pwmchip_release` a global, exported function so that it can be called from the Rust FFI bridge. 
This involves removing the `static` keyword, adding a prototype to the public header, and exporting the symbol. Reviewed-by: Elle Rhumsaa Signed-off-by: Michal Wilczynski Link: https://patch.msgid.link/20251016-rust-next-pwm-working-fan-for-sending-v16-1-a5df2405d2bd@samsung.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 549ac4aaad59..148f056f336b 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -488,6 +488,12 @@ int __pwmchip_add(struct pwm_chip *chip, struct module *owner); #define pwmchip_add(chip) __pwmchip_add(chip, THIS_MODULE) void pwmchip_remove(struct pwm_chip *chip); +/* + * For FFI wrapper use only: + * The Rust PWM abstraction needs this to properly free the pwm_chip. + */ +void pwmchip_release(struct device *dev); + int __devm_pwmchip_add(struct device *dev, struct pwm_chip *chip, struct module *owner); #define devm_pwmchip_add(dev, chip) __devm_pwmchip_add(dev, chip, THIS_MODULE) -- cgit v1.2.3 From 37827223f86aa71b267769d5f51ca16b44b45ae5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:12 -0800 Subject: srcu: Add SRCU_READ_FLAVOR_FAST_UPDOWN CPP macro This commit adds the SRCU_READ_FLAVOR_FAST_UPDOWN=0x8 macro and adjusts rcutorture to make use of it. In this commit, both SRCU_READ_FLAVOR_FAST=0x4 and the new SRCU_READ_FLAVOR_FAST_UPDOWN test SRCU-fast. When the SRCU-fast-updown is added, the new SRCU_READ_FLAVOR_FAST_UPDOWN macro will test it when passed to the rcutorture.reader_flavor module parameter. The old SRCU_READ_FLAVOR_FAST macro's value changed from 0x8 to 0x4. Signed-off-by: Paul E. 
McKenney Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 41e27c1d917d..1dd6812aabe7 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -56,13 +56,15 @@ int init_srcu_struct_fast(struct srcu_struct *ssp); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */ -#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). -#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). -// 0x4 // SRCU-lite is no longer with us. -#define SRCU_READ_FLAVOR_FAST 0x8 // srcu_read_lock_fast(). -#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ - SRCU_READ_FLAVOR_FAST) // All of the above. -#define SRCU_READ_FLAVOR_SLOWGP SRCU_READ_FLAVOR_FAST +#define SRCU_READ_FLAVOR_NORMAL 0x1 // srcu_read_lock(). +#define SRCU_READ_FLAVOR_NMI 0x2 // srcu_read_lock_nmisafe(). +// 0x4 // SRCU-lite is no longer with us. +#define SRCU_READ_FLAVOR_FAST 0x4 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_FAST_UPDOWN 0x8 // srcu_read_lock_fast(). +#define SRCU_READ_FLAVOR_ALL (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \ + SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) + // All of the above. +#define SRCU_READ_FLAVOR_SLOWGP (SRCU_READ_FLAVOR_FAST | SRCU_READ_FLAVOR_FAST_UPDOWN) // Flavors requiring synchronize_rcu() // instead of smp_mb(). 
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); -- cgit v1.2.3 From 187de7c212e5fa87779e1026bf949337bca0cdaa Mon Sep 17 00:00:00 2001 From: John Ogness Date: Mon, 27 Oct 2025 17:18:03 +0106 Subject: printk: nbcon: Allow unsafe write_atomic() for panic There may be console drivers that have not yet figured out a way to implement safe atomic printing (->write_atomic() callback). These drivers could choose to only implement threaded printing (->write_thread() callback), but then it is guaranteed that _no_ output will be printed during panic. Not even attempted. As a result, developers may be tempted to implement unsafe ->write_atomic() callbacks and/or implement some sort of custom deferred printing trickery to try to make it work. This goes against the principal intention of the nbcon API as well as endangers other nbcon drivers that are doing things correctly (safely). As a compromise, allow nbcon drivers to implement unsafe ->write_atomic() callbacks by providing a new console flag CON_NBCON_ATOMIC_UNSAFE. When specified, the ->write_atomic() callback for that console will _only_ be called during the final "hope and pray" flush attempt at the end of a panic: nbcon_atomic_flush_unsafe(). Signed-off-by: John Ogness Link: https://lore.kernel.org/lkml/b2qps3uywhmjaym4mht2wpxul4yqtuuayeoq4iv4k3zf5wdgh3@tocu6c7mj4lt Reviewed-by: Petr Mladek Link: https://lore.kernel.org/all/swdpckuwwlv3uiessmtnf2jwlx3jusw6u7fpk5iggqo4t2vdws@7rpjso4gr7qp/ [1] Link: https://lore.kernel.org/all/20251103-fix_netpoll_aa-v4-1-4cfecdf6da7c@debian.org/ [2] Link: https://patch.msgid.link/20251027161212.334219-2-john.ogness@linutronix.de [pmladek@suse.com: Fix build with rework/nbcon-in-kdb branch.] 
Signed-off-by: Petr Mladek --- include/linux/console.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/console.h b/include/linux/console.h index d17f1f525bec..5f17321ed962 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -186,6 +186,8 @@ static inline void con_debug_leave(void) { } * printing callbacks must not be called. * @CON_NBCON: Console can operate outside of the legacy style console_lock * constraints. + * @CON_NBCON_ATOMIC_UNSAFE: The write_atomic() callback is not safe and is + * therefore only used by nbcon_atomic_flush_unsafe(). */ enum cons_flags { CON_PRINTBUFFER = BIT(0), @@ -197,6 +199,7 @@ enum cons_flags { CON_EXTENDED = BIT(6), CON_SUSPENDED = BIT(7), CON_NBCON = BIT(8), + CON_NBCON_ATOMIC_UNSAFE = BIT(9), }; /** @@ -608,6 +611,7 @@ extern void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt, extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt); extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt); extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt); +extern bool nbcon_allow_unsafe_takeover(void); extern bool nbcon_kdb_try_acquire(struct console *con, struct nbcon_write_context *wctxt); extern void nbcon_kdb_release(struct nbcon_write_context *wctxt); @@ -627,9 +631,18 @@ static inline bool console_is_usable(struct console *con, short flags, bool use_ return false; if (flags & CON_NBCON) { - /* The write_atomic() callback is optional. */ - if (use_atomic && !con->write_atomic) - return false; + if (use_atomic) { + /* The write_atomic() callback is optional. */ + if (!con->write_atomic) + return false; + + /* + * An unsafe write_atomic() callback is only usable + * when unsafe takeovers are allowed. 
+ */ + if ((flags & CON_NBCON_ATOMIC_UNSAFE) && !nbcon_allow_unsafe_takeover()) + return false; + } /* * For the !use_atomic case, @printk_kthreads_running is not -- cgit v1.2.3 From 7ab06ea41af53aa1713186ceaa154179e4b0d4c9 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Wed, 5 Nov 2025 18:38:49 +0800 Subject: arch_topology: Provide a stub topology_core_has_smt() for !CONFIG_GENERIC_ARCH_TOPOLOGY The arm_pmu driver is using topology_core_has_smt() for retrieving the SMT implementation which depends on CONFIG_GENERIC_ARCH_TOPOLOGY. The config is optional on arm platforms so provide a !CONFIG_GENERIC_ARCH_TOPOLOGY stub for topology_core_has_smt(). Fixes: c3d78c34ad00 ("perf: arm_pmuv3: Don't use PMCCNTR_EL0 on SMT cores") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511041757.vuCGOmFc-lkp@intel.com/ Suggested-by: Will Deacon Signed-off-by: Yicong Yang Reviewed-by: Mark Brown Signed-off-by: Will Deacon --- include/linux/arch_topology.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index daa1af2e8204..0c2a8b846c20 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -100,6 +100,10 @@ static inline bool topology_core_has_smt(int cpu) return cpu_topology[cpu].thread_id != -1; } -#endif +#else + +static inline bool topology_core_has_smt(int cpu) { return false; } + +#endif /* CONFIG_GENERIC_ARCH_TOPOLOGY */ #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ -- cgit v1.2.3 From 25976c314f6596254c9b1e2291d94393b7d5ae81 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Nov 2025 15:38:44 +0900 Subject: block: introduce bdev_zone_start() Introduce the function bdev_zone_start() as a more explicit (and clear) replacement for ALIGN_DOWN() to get the start sector of a zone containing a particular sector of a zoned block device. Use this new helper in blkdev_get_zone_info() and blkdev_report_zones_cached(). 
Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a498aa7f7e7..2fff8a80dbd2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1522,6 +1522,12 @@ static inline sector_t bdev_zone_sectors(struct block_device *bdev) return q->limits.chunk_sectors; } +static inline sector_t bdev_zone_start(struct block_device *bdev, + sector_t sector) +{ + return sector & ~(bdev_zone_sectors(bdev) - 1); +} + static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, sector_t sector) { -- cgit v1.2.3 From be88c549e9d78828a2e06126ed7e17fc2e030f1f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 6 Nov 2025 00:32:40 +0000 Subject: tcp: Call tcp_syn_ack_timeout() directly. Since DCCP has been removed, we do not need to use request_sock_ops.syn_ack_timeout(). Let's call tcp_syn_ack_timeout() directly. Now other function pointers of request_sock_ops are protocol-dependent. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251106003357.273403-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index cd4d4cf71d0d..9b9e04f6bb89 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -36,7 +36,6 @@ struct request_sock_ops { struct sk_buff *skb, enum sk_rst_reason reason); void (*destructor)(struct request_sock *req); - void (*syn_ack_timeout)(const struct request_sock *req); }; struct saved_syn { -- cgit v1.2.3 From 3ce5dd8161ecdf12ffe0af99ff8980f1432f64a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 6 Nov 2025 00:32:41 +0000 Subject: tcp: Remove timeout arg from reqsk_queue_hash_req(). inet_csk_reqsk_queue_hash_add() is no longer shared by DCCP. We do not need to pass req->timeout down to reqsk_queue_hash_req(). Let's move tcp_timeout_init() from tcp_conn_request() to reqsk_queue_hash_req(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251106003357.273403-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b4b886647607..90a99a2fc804 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -267,8 +267,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req, struct sock *child); -bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout); +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req); struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child, struct request_sock *req, bool own_req); -- cgit v1.2.3 From 207ce0f6bc131812c96cf4f6db328af5396cebac Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 6 Nov 2025 00:32:43 +0000 Subject: tcp: Remove timeout arg from reqsk_timeout(). reqsk_timeout() is always called with @timeout being TCP_RTO_MAX. Let's remove the arg. As a prep for the next patch, reqsk_timeout() is moved to tcp.h and renamed to tcp_reqsk_timeout(). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251106003357.273403-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 8 -------- include/net/tcp.h | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 90a99a2fc804..fd40af2221b9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -290,14 +290,6 @@ static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req); -static inline unsigned long -reqsk_timeout(struct request_sock *req, unsigned long max_timeout) -{ - u64 timeout = (u64)req->timeout << req->num_timeout; - - return (unsigned long)min_t(u64, timeout, max_timeout); -} - void inet_csk_destroy_sock(struct sock *sk); void inet_csk_prepare_for_destroy_sock(struct sock *sk); void inet_csk_prepare_forced_close(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 0aa1f07d036a..0c7274ac7ed5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -841,6 +841,13 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } +static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) +{ + u64 timeout = (u64)req->timeout << req->num_timeout; + + return (unsigned long)min_t(u64, timeout, TCP_RTO_MAX); +} + u32 tcp_delack_max(const struct sock *sk); /* Compute the actual rto_min value */ -- cgit v1.2.3 From 1e9d3005e02cba82047d49f859982fc73b9a100b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 6 Nov 2025 00:32:44 +0000 Subject: tcp: Apply max RTO to non-TFO SYN+ACK. 
Since commit 54a378f43425 ("tcp: add the ability to control max RTO"), TFO SYN+ACK RTO is capped by the TFO full sk's inet_csk(sk)->icsk_rto_max. The value is inherited from the parent listener. Let's apply the same cap to non-TFO SYN+ACK. Note that req->rsk_listener is always non-NULL when we call tcp_reqsk_timeout() in reqsk_timer_handler() or tcp_check_req(). It could be NULL for SYN cookie req, but we do not use req->timeout then. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251106003357.273403-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 0c7274ac7ed5..4833ec7903ec 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -845,7 +845,8 @@ static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) { u64 timeout = (u64)req->timeout << req->num_timeout; - return (unsigned long)min_t(u64, timeout, TCP_RTO_MAX); + return (unsigned long)min_t(u64, timeout, + tcp_rto_max(req->rsk_listener)); } u32 tcp_delack_max(const struct sock *sk); -- cgit v1.2.3 From 416dd649f3aa3774907c668167a29c668dbc634b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 6 Nov 2025 11:52:36 +0000 Subject: tcp: add net.ipv4.tcp_comp_sack_rtt_percent TCP SACK compression has been added in 2018 in commit 5d9f4262b7ea ("tcp: add SACK compression"). It is working great for WAN flows (with large RTT). Wifi in particular gets a significant boost _when_ ACK are suppressed. Add a new sysctl so that we can tune the very conservative 5 % value that has been used so far in this formula, so that small RTT flows can benefit from this feature. delay = min ( 5 % of RTT, 1 ms) This patch adds new tcp_comp_sack_rtt_percent sysctl to ease experiments and tuning. Given that we cap the delay to 1ms (tcp_comp_sack_delay_ns sysctl), set the default value to 33 %. 
Quoting Neal Cardwell ( https://lore.kernel.org/netdev/CADVnQymZ1tFnEA1Q=vtECs0=Db7zHQ8=+WCQtnhHFVbEOzjVnQ@mail.gmail.com/ ) The rationale for 33% is basically to try to facilitate pipelining, where there are always at least 3 ACKs and 3 GSO/TSO skbs per SRTT, so that the path can maintain a budget for 3 full-sized GSO/TSO skbs "in flight" at all times: + 1 skb in the qdisc waiting to be sent by the NIC next + 1 skb being sent by the NIC (being serialized by the NIC out onto the wire) + 1 skb being received and aggregated by the receiver machine's aggregation mechanism (some combination of LRO, GRO, and sack compression) Note that this is basically the same magic number (3) and the same rationales as: (a) tcp_tso_should_defer() ensuring that we defer sending data for no longer than cwnd/tcp_tso_win_divisor (where tcp_tso_win_divisor = 3), and (b) bbr_quantization_budget() ensuring that cwnd is at least 3 GSO/TSO skbs to maintain pipelining and full throughput at low RTTs Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Link: https://patch.msgid.link/20251106115236.3450026-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 0e96c90e56c6..de9d36acc8e2 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -221,6 +221,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; unsigned int sysctl_tcp_child_ehash_entries; + int sysctl_tcp_comp_sack_rtt_percent; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; -- cgit v1.2.3 From b87ee13e34931779ac1dcd3264beba50b54966fd Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Wed, 5 Nov 2025 10:42:12 +0530 Subject: net: phy: phy-c45: add OATC14 10BASE-T1S PHY cable diagnostic support Add support for Open Alliance TC14 (OATC14) 10BASE-T1S PHYs 
cable diagnostic feature. This patch implements: - genphy_c45_oatc14_cable_test_start() to initiate a cable test - genphy_c45_oatc14_cable_test_get_status() to retrieve test results - Helper function to map PHY cable test status to ethtool result codes - Function declarations and exports for use by PHY drivers This enables ethtool to report ok, open, short, and undetectable cable conditions on OATC14 10Base-T1S PHYs. Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features Specification ref: https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf Signed-off-by: Parthiban Veerasooran Link: https://patch.msgid.link/20251105051213.50443-2-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index d145a200ea21..bf5457341ca8 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2251,6 +2251,9 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); +int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev); +int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev, + bool *finished); /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); -- cgit v1.2.3 From f73e0f46bbfab29b111ff52d047f15aa13623972 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 5 Nov 2025 23:09:17 +0100 Subject: net: phy: fixed_phy: shrink size of struct fixed_phy_status All three members are effectively of type bool, so make this explicit and shrink size of struct fixed_phy_status. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/9eca3d7e-fa64-4724-8fdc-f2c1a8f2ae8f@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy_fixed.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/phy_fixed.h b/include/linux/phy_fixed.h index 8bade999831c..436bff20f324 100644 --- a/include/linux/phy_fixed.h +++ b/include/linux/phy_fixed.h @@ -5,11 +5,11 @@ #include struct fixed_phy_status { - int link; int speed; int duplex; - int pause; - int asym_pause; + bool link:1; + bool pause:1; + bool asym_pause:1; }; struct device_node; -- cgit v1.2.3 From dae4a92399fa8d68aa917db6bb3245f83021e762 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 5 Nov 2025 16:26:02 -0800 Subject: psp: report basic stats from the core Track and report stats common to all psp devices from the core. A 'stale-event' is when the core marks the rx state of an active psp_assoc as incapable of authenticating psp encapsulated data. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20251106002608.1578518-2-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/types.h | 9 +++++++++ include/uapi/linux/psp.h | 10 ++++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 31cee64b7c86..5b0ccaac3882 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -59,6 +59,10 @@ struct psp_dev_config { * device key * @stale_assocs: associations which use a rotated out key * + * @stats: statistics maintained by the core + * @stats.rotations: See stats attr key-rotations + * @stats.stales: See stats attr stale-events + * * @rcu: RCU head for freeing the structure */ struct psp_dev { @@ -81,6 +85,11 @@ struct psp_dev { struct list_head prev_assocs; struct list_head stale_assocs; + struct { + unsigned long rotations; + unsigned long stales; + } stats; + struct rcu_head rcu; }; diff --git 
a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 607c42c39ba5..31592760ad79 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -45,6 +45,15 @@ enum { PSP_A_KEYS_MAX = (__PSP_A_KEYS_MAX - 1) }; +enum { + PSP_A_STATS_DEV_ID = 1, + PSP_A_STATS_KEY_ROTATIONS, + PSP_A_STATS_STALE_EVENTS, + + __PSP_A_STATS_MAX, + PSP_A_STATS_MAX = (__PSP_A_STATS_MAX - 1) +}; + enum { PSP_CMD_DEV_GET = 1, PSP_CMD_DEV_ADD_NTF, @@ -55,6 +64,7 @@ enum { PSP_CMD_KEY_ROTATE_NTF, PSP_CMD_RX_ASSOC, PSP_CMD_TX_ASSOC, + PSP_CMD_GET_STATS, __PSP_CMD_MAX, PSP_CMD_MAX = (__PSP_CMD_MAX - 1) -- cgit v1.2.3 From f05d26198cf2c71f25f6bbe62ca4481c15543922 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 5 Nov 2025 16:26:04 -0800 Subject: psp: add stats from psp spec to driver facing api Provide a driver api for reporting device statistics required by the "Implementation Requirements" section of the PSP Architecture Specification. Use a warning to ensure drivers report stats required by the spec. Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20251106002608.1578518-4-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/psp/types.h | 23 +++++++++++++++++++++++ include/uapi/linux/psp.h | 8 ++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/net/psp/types.h b/include/net/psp/types.h index 5b0ccaac3882..25a9096d4e7d 100644 --- a/include/net/psp/types.h +++ b/include/net/psp/types.h @@ -150,6 +150,22 @@ struct psp_assoc { u8 drv_data[] __aligned(8); }; +struct psp_dev_stats { + union { + struct { + u64 rx_packets; + u64 rx_bytes; + u64 rx_auth_fail; + u64 rx_error; + u64 rx_bad; + u64 tx_packets; + u64 tx_bytes; + u64 tx_error; + }; + DECLARE_FLEX_ARRAY(u64, required); + }; +}; + /** * struct psp_dev_ops - netdev driver facing PSP callbacks */ @@ -188,6 +204,13 @@ struct psp_dev_ops { * Remove an association from the device. 
*/ void (*tx_key_del)(struct psp_dev *psd, struct psp_assoc *pas); + + /** + * @get_stats: get statistics from the device + * Stats required by the spec must be maintained and filled in. + * Stats must be filled in member-by-member, never memset the struct. + */ + void (*get_stats)(struct psp_dev *psd, struct psp_dev_stats *stats); }; #endif /* __NET_PSP_H */ diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index 31592760ad79..d8449c043ba1 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -49,6 +49,14 @@ enum { PSP_A_STATS_DEV_ID = 1, PSP_A_STATS_KEY_ROTATIONS, PSP_A_STATS_STALE_EVENTS, + PSP_A_STATS_RX_PACKETS, + PSP_A_STATS_RX_BYTES, + PSP_A_STATS_RX_AUTH_FAIL, + PSP_A_STATS_RX_ERROR, + PSP_A_STATS_RX_BAD, + PSP_A_STATS_TX_PACKETS, + PSP_A_STATS_TX_BYTES, + PSP_A_STATS_TX_ERROR, __PSP_A_STATS_MAX, PSP_A_STATS_MAX = (__PSP_A_STATS_MAX - 1) -- cgit v1.2.3 From 8fdfdb1488162c195f3f0af10b7bc2b8b42928c5 Mon Sep 17 00:00:00 2001 From: Markus Probst Date: Tue, 4 Nov 2025 14:24:32 +0000 Subject: scsi: sd: Add manage_restart device attribute to scsi_disk In addition to the already existing manage_shutdown, manage_system_start_stop and manage_runtime_start_stop device scsi_disk attributes, add manage_restart, which allows the high-level device driver (sd) to manage the device power state for SYSTEM_RESTART if set to 1. This attribute is necessary for the following commit "ata: stop disk on restart if ACPI power resources are found" to avoid a potential disk power failure in the case the SATA power connector does not retain the power state after a restart. Reviewed-by: Damien Le Moal Signed-off-by: Markus Probst Link: https://patch.msgid.link/20251104142413.322347-2-markus.probst@posteo.de Signed-off-by: Martin K. 
Petersen --- include/scsi/scsi_device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 6d6500148c4b..c7e657ac8b6d 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -178,6 +178,12 @@ struct scsi_device { */ unsigned manage_shutdown:1; + /* + * If true, let the high-level device driver (sd) manage the device + * power state for system restart (reboot) operations. + */ + unsigned manage_restart:1; + /* * If set and if the device is runtime suspended, ask the high-level * device driver (sd) to force a runtime resume of the device. -- cgit v1.2.3 From bf9f0b00bb7fd0470c1255bcc8e76c81d122a609 Mon Sep 17 00:00:00 2001 From: Jai Luthra Date: Wed, 29 Oct 2025 16:00:08 +0530 Subject: include: linux: Destage VCHIQ interface headers Move the VCHIQ headers from drivers/staging/vc04_services/include to include/linux/raspberrypi This is done so that they can be shared between the VCHIQ interface (which is going to be de-staged in a subsequent commit from staging) and the VCHIQ drivers left in the staging/vc04_services (namely bcm2835-audio, bcm2835-camera). The include/linux/raspberrypi/ provides a central location to serve both of these areas. 
Co-developed-by: Umang Jain Signed-off-by: Umang Jain Reviewed-by: Laurent Pinchart Signed-off-by: Jai Luthra Link: https://patch.msgid.link/20251029-vchiq-destage-v3-4-da8d6c83c2c5@ideasonboard.com Signed-off-by: Greg Kroah-Hartman --- include/linux/raspberrypi/vchiq.h | 112 ++++++ include/linux/raspberrypi/vchiq_arm.h | 164 ++++++++ include/linux/raspberrypi/vchiq_bus.h | 60 +++ include/linux/raspberrypi/vchiq_cfg.h | 41 ++ include/linux/raspberrypi/vchiq_core.h | 646 ++++++++++++++++++++++++++++++ include/linux/raspberrypi/vchiq_debugfs.h | 22 + 6 files changed, 1045 insertions(+) create mode 100644 include/linux/raspberrypi/vchiq.h create mode 100644 include/linux/raspberrypi/vchiq_arm.h create mode 100644 include/linux/raspberrypi/vchiq_bus.h create mode 100644 include/linux/raspberrypi/vchiq_cfg.h create mode 100644 include/linux/raspberrypi/vchiq_core.h create mode 100644 include/linux/raspberrypi/vchiq_debugfs.h (limited to 'include') diff --git a/include/linux/raspberrypi/vchiq.h b/include/linux/raspberrypi/vchiq.h new file mode 100644 index 000000000000..ee4469f4fc51 --- /dev/null +++ b/include/linux/raspberrypi/vchiq.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2012 Broadcom. All rights reserved. 
*/ + +#ifndef VCHIQ_H +#define VCHIQ_H + +#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \ + (((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3)) + +enum vchiq_reason { + VCHIQ_SERVICE_OPENED, /* service, -, - */ + VCHIQ_SERVICE_CLOSED, /* service, -, - */ + VCHIQ_MESSAGE_AVAILABLE, /* service, header, - */ + VCHIQ_BULK_TRANSMIT_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_TRANSMIT_ABORTED, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_ABORTED /* service, -, bulk_userdata */ +}; + +enum vchiq_bulk_mode { + VCHIQ_BULK_MODE_CALLBACK, + VCHIQ_BULK_MODE_BLOCKING, + VCHIQ_BULK_MODE_NOCALLBACK, + VCHIQ_BULK_MODE_WAITING /* Reserved for internal use */ +}; + +enum vchiq_service_option { + VCHIQ_SERVICE_OPTION_AUTOCLOSE, + VCHIQ_SERVICE_OPTION_SLOT_QUOTA, + VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA, + VCHIQ_SERVICE_OPTION_SYNCHRONOUS, + VCHIQ_SERVICE_OPTION_TRACE +}; + +struct vchiq_header { + /* The message identifier - opaque to applications. */ + int msgid; + + /* Size of message data. 
*/ + unsigned int size; + + char data[]; /* message */ +}; + +struct vchiq_element { + const void __user *data; + unsigned int size; +}; + +struct vchiq_instance; +struct vchiq_state; + +struct vchiq_service_base { + int fourcc; + int (*callback)(struct vchiq_instance *instance, + enum vchiq_reason reason, + struct vchiq_header *header, + unsigned int handle, + void *cb_data, void __user *cb_userdata); + void *userdata; +}; + +struct vchiq_completion_data_kernel { + enum vchiq_reason reason; + struct vchiq_header *header; + void *service_userdata; + void *cb_data; + void __user *cb_userdata; +}; + +struct vchiq_service_params_kernel { + int fourcc; + int (*callback)(struct vchiq_instance *instance, + enum vchiq_reason reason, + struct vchiq_header *header, + unsigned int handle, + void *cb_data, void __user *cb_userdata); + void *userdata; + short version; /* Increment for non-trivial changes */ + short version_min; /* Update for incompatible changes */ +}; + +extern int vchiq_initialise(struct vchiq_state *state, + struct vchiq_instance **pinstance); +extern int vchiq_shutdown(struct vchiq_instance *instance); +extern int vchiq_connect(struct vchiq_instance *instance); +extern int vchiq_open_service(struct vchiq_instance *instance, + const struct vchiq_service_params_kernel *params, + unsigned int *pservice); +extern int vchiq_close_service(struct vchiq_instance *instance, + unsigned int service); +extern int vchiq_use_service(struct vchiq_instance *instance, unsigned int service); +extern int vchiq_release_service(struct vchiq_instance *instance, + unsigned int service); +extern void vchiq_msg_queue_push(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_header *header); +extern void vchiq_release_message(struct vchiq_instance *instance, unsigned int service, + struct vchiq_header *header); +extern int vchiq_queue_kernel_message(struct vchiq_instance *instance, unsigned int handle, + void *data, unsigned int size); +extern int 
vchiq_bulk_transmit(struct vchiq_instance *instance, unsigned int service, + const void *data, unsigned int size, void *userdata, + enum vchiq_bulk_mode mode); +extern int vchiq_bulk_receive(struct vchiq_instance *instance, unsigned int service, + void *data, unsigned int size, void *userdata, + enum vchiq_bulk_mode mode); +extern void *vchiq_get_service_userdata(struct vchiq_instance *instance, unsigned int service); +extern int vchiq_get_peer_version(struct vchiq_instance *instance, unsigned int handle, + short *peer_version); +extern struct vchiq_header *vchiq_msg_hold(struct vchiq_instance *instance, unsigned int handle); + +#endif /* VCHIQ_H */ diff --git a/include/linux/raspberrypi/vchiq_arm.h b/include/linux/raspberrypi/vchiq_arm.h new file mode 100644 index 000000000000..e32b02f99024 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_arm.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* + * Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. + * Copyright (c) 2010-2012 Broadcom. All rights reserved. 
+ */ + +#ifndef VCHIQ_ARM_H +#define VCHIQ_ARM_H + +#include +#include +#include +#include +#include "vchiq_core.h" +#include "vchiq_debugfs.h" + +/* Some per-instance constants */ +#define MAX_COMPLETIONS 128 +#define MAX_SERVICES 64 +#define MAX_ELEMENTS 8 +#define MSG_QUEUE_SIZE 128 + +#define VCHIQ_DRV_MAX_CALLBACKS 10 + +struct rpi_firmware; +struct vchiq_device; + +enum USE_TYPE_E { + USE_TYPE_SERVICE, + USE_TYPE_VCHIQ +}; + +struct vchiq_platform_info { + unsigned int cache_line_size; +}; + +struct vchiq_drv_mgmt { + struct rpi_firmware *fw; + const struct vchiq_platform_info *info; + + bool connected; + int num_deferred_callbacks; + /* Protects connected and num_deferred_callbacks */ + struct mutex connected_mutex; + + void (*deferred_callback[VCHIQ_DRV_MAX_CALLBACKS])(void); + + struct semaphore free_fragments_sema; + struct semaphore free_fragments_mutex; + char *fragments_base; + char *free_fragments; + unsigned int fragments_size; + + void __iomem *regs; + + struct vchiq_state state; +}; + +struct user_service { + struct vchiq_service *service; + void __user *userdata; + struct vchiq_instance *instance; + char is_vchi; + char dequeue_pending; + char close_pending; + int message_available_pos; + int msg_insert; + int msg_remove; + struct completion insert_event; + struct completion remove_event; + struct completion close_event; + struct vchiq_header *msg_queue[MSG_QUEUE_SIZE]; +}; + +struct bulk_waiter_node { + struct bulk_waiter bulk_waiter; + int pid; + struct list_head list; +}; + +struct vchiq_instance { + struct vchiq_state *state; + struct vchiq_completion_data_kernel completions[MAX_COMPLETIONS]; + int completion_insert; + int completion_remove; + struct completion insert_event; + struct completion remove_event; + struct mutex completion_mutex; + + int connected; + int closing; + int pid; + int mark; + int use_close_delivered; + int trace; + + struct list_head bulk_waiter_list; + struct mutex bulk_waiter_list_mutex; + + struct vchiq_debugfs_node 
debugfs_node; +}; + +int +vchiq_use_service(struct vchiq_instance *instance, unsigned int handle); + +extern int +vchiq_release_service(struct vchiq_instance *instance, unsigned int handle); + +extern int +vchiq_check_service(struct vchiq_service *service); + +extern void +vchiq_dump_service_use_state(struct vchiq_state *state); + +extern int +vchiq_use_internal(struct vchiq_state *state, struct vchiq_service *service, + enum USE_TYPE_E use_type); +extern int +vchiq_release_internal(struct vchiq_state *state, + struct vchiq_service *service); + +extern struct vchiq_debugfs_node * +vchiq_instance_get_debugfs_node(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_use_count(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_pid(struct vchiq_instance *instance); + +extern int +vchiq_instance_get_trace(struct vchiq_instance *instance); + +extern void +vchiq_instance_set_trace(struct vchiq_instance *instance, int trace); + +extern void +vchiq_add_connected_callback(struct vchiq_device *device, + void (*callback)(void)); + +#if IS_ENABLED(CONFIG_VCHIQ_CDEV) + +extern void +vchiq_deregister_chrdev(void); + +extern int +vchiq_register_chrdev(struct device *parent); + +#else + +static inline void vchiq_deregister_chrdev(void) { } +static inline int vchiq_register_chrdev(struct device *parent) { return 0; } + +#endif /* IS_ENABLED(CONFIG_VCHIQ_CDEV) */ + +extern int +service_callback(struct vchiq_instance *vchiq_instance, enum vchiq_reason reason, + struct vchiq_header *header, unsigned int handle, + void *cb_data, void __user *cb_userdata); + +extern void +free_bulk_waiter(struct vchiq_instance *instance); + +#endif /* VCHIQ_ARM_H */ diff --git a/include/linux/raspberrypi/vchiq_bus.h b/include/linux/raspberrypi/vchiq_bus.h new file mode 100644 index 000000000000..9de179b39f85 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_bus.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023 Ideas On Board Oy + */ + 
+#ifndef _VCHIQ_DEVICE_H +#define _VCHIQ_DEVICE_H + +#include +#include + +struct vchiq_drv_mgmt; + +struct vchiq_device { + struct device dev; + struct vchiq_drv_mgmt *drv_mgmt; +}; + +struct vchiq_driver { + int (*probe)(struct vchiq_device *device); + void (*remove)(struct vchiq_device *device); + int (*resume)(struct vchiq_device *device); + int (*suspend)(struct vchiq_device *device, + pm_message_t state); + + const struct vchiq_device_id *id_table; + struct device_driver driver; +}; + +static inline struct vchiq_device *to_vchiq_device(struct device *d) +{ + return container_of(d, struct vchiq_device, dev); +} + +static inline struct vchiq_driver *to_vchiq_driver(struct device_driver *d) +{ + return container_of(d, struct vchiq_driver, driver); +} + +extern const struct bus_type vchiq_bus_type; + +struct vchiq_device * +vchiq_device_register(struct device *parent, const char *name); +void vchiq_device_unregister(struct vchiq_device *dev); + +int vchiq_driver_register(struct vchiq_driver *vchiq_drv); +void vchiq_driver_unregister(struct vchiq_driver *vchiq_drv); + +/** + * module_vchiq_driver() - Helper macro for registering a vchiq driver + * @__vchiq_driver: vchiq driver struct + * + * Helper macro for vchiq drivers which do not do anything special in + * module init/exit. This eliminates a lot of boilerplate. Each module may only + * use this macro once, and calling it replaces module_init() and module_exit() + */ +#define module_vchiq_driver(__vchiq_driver) \ + module_driver(__vchiq_driver, vchiq_driver_register, vchiq_driver_unregister) + +#endif /* _VCHIQ_DEVICE_H */ diff --git a/include/linux/raspberrypi/vchiq_cfg.h b/include/linux/raspberrypi/vchiq_cfg.h new file mode 100644 index 000000000000..a16d0299996c --- /dev/null +++ b/include/linux/raspberrypi/vchiq_cfg.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2014 Broadcom. All rights reserved. 
*/ + +#ifndef VCHIQ_CFG_H +#define VCHIQ_CFG_H + +#define VCHIQ_MAGIC VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I') +/* The version of VCHIQ - change with any non-trivial change */ +#define VCHIQ_VERSION 8 +/* + * The minimum compatible version - update to match VCHIQ_VERSION with any + * incompatible change + */ +#define VCHIQ_VERSION_MIN 3 + +/* The version that introduced the VCHIQ_IOC_LIB_VERSION ioctl */ +#define VCHIQ_VERSION_LIB_VERSION 7 + +/* The version that introduced the VCHIQ_IOC_CLOSE_DELIVERED ioctl */ +#define VCHIQ_VERSION_CLOSE_DELIVERED 7 + +/* The version that made it safe to use SYNCHRONOUS mode */ +#define VCHIQ_VERSION_SYNCHRONOUS_MODE 8 + +#define VCHIQ_MAX_STATES 1 +#define VCHIQ_MAX_SERVICES 4096 +#define VCHIQ_MAX_SLOTS 128 +#define VCHIQ_MAX_SLOTS_PER_SIDE 64 + +#define VCHIQ_NUM_CURRENT_BULKS 32 +#define VCHIQ_NUM_SERVICE_BULKS 4 + +#ifndef VCHIQ_ENABLE_DEBUG +#define VCHIQ_ENABLE_DEBUG 1 +#endif + +#ifndef VCHIQ_ENABLE_STATS +#define VCHIQ_ENABLE_STATS 1 +#endif + +#endif /* VCHIQ_CFG_H */ diff --git a/include/linux/raspberrypi/vchiq_core.h b/include/linux/raspberrypi/vchiq_core.h new file mode 100644 index 000000000000..e7bf7a114985 --- /dev/null +++ b/include/linux/raspberrypi/vchiq_core.h @@ -0,0 +1,646 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2010-2012 Broadcom. All rights reserved. 
*/ + +#ifndef VCHIQ_CORE_H +#define VCHIQ_CORE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vchiq.h" +#include "vchiq_cfg.h" + +/* Do this so that we can test-build the code on non-rpi systems */ +#if IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) + +#else + +#ifndef dsb +#define dsb(a) +#endif + +#endif /* IS_ENABLED(CONFIG_RASPBERRYPI_FIRMWARE) */ + +#define VCHIQ_SERVICE_HANDLE_INVALID 0 + +#define VCHIQ_SLOT_SIZE 4096 +#define VCHIQ_MAX_MSG_SIZE (VCHIQ_SLOT_SIZE - sizeof(struct vchiq_header)) + +#define VCHIQ_SLOT_MASK (VCHIQ_SLOT_SIZE - 1) +#define VCHIQ_SLOT_QUEUE_MASK (VCHIQ_MAX_SLOTS_PER_SIDE - 1) +#define VCHIQ_SLOT_ZERO_SLOTS DIV_ROUND_UP(sizeof(struct vchiq_slot_zero), \ + VCHIQ_SLOT_SIZE) + +#define BITSET_SIZE(b) ((b + 31) >> 5) +#define BITSET_WORD(b) (b >> 5) +#define BITSET_BIT(b) (1 << (b & 31)) +#define BITSET_IS_SET(bs, b) (bs[BITSET_WORD(b)] & BITSET_BIT(b)) +#define BITSET_SET(bs, b) (bs[BITSET_WORD(b)] |= BITSET_BIT(b)) + +enum { + DEBUG_ENTRIES, +#if VCHIQ_ENABLE_DEBUG + DEBUG_SLOT_HANDLER_COUNT, + DEBUG_SLOT_HANDLER_LINE, + DEBUG_PARSE_LINE, + DEBUG_PARSE_HEADER, + DEBUG_PARSE_MSGID, + DEBUG_AWAIT_COMPLETION_LINE, + DEBUG_DEQUEUE_MESSAGE_LINE, + DEBUG_SERVICE_CALLBACK_LINE, + DEBUG_MSG_QUEUE_FULL_COUNT, + DEBUG_COMPLETION_QUEUE_FULL_COUNT, +#endif + DEBUG_MAX +}; + +#if VCHIQ_ENABLE_DEBUG + +#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug +#define DEBUG_TRACE(d) \ + do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(sy); } while (0) +#define DEBUG_VALUE(d, v) \ + do { debug_ptr[DEBUG_ ## d] = (v); dsb(sy); } while (0) +#define DEBUG_COUNT(d) \ + do { debug_ptr[DEBUG_ ## d]++; dsb(sy); } while (0) + +#else /* VCHIQ_ENABLE_DEBUG */ + +#define DEBUG_INITIALISE(local) +#define DEBUG_TRACE(d) +#define DEBUG_VALUE(d, v) +#define DEBUG_COUNT(d) + +#endif /* VCHIQ_ENABLE_DEBUG */ + +enum vchiq_connstate { + VCHIQ_CONNSTATE_DISCONNECTED, + VCHIQ_CONNSTATE_CONNECTING, + 
VCHIQ_CONNSTATE_CONNECTED, + VCHIQ_CONNSTATE_PAUSING, + VCHIQ_CONNSTATE_PAUSE_SENT, + VCHIQ_CONNSTATE_PAUSED, + VCHIQ_CONNSTATE_RESUMING, + VCHIQ_CONNSTATE_PAUSE_TIMEOUT, + VCHIQ_CONNSTATE_RESUME_TIMEOUT +}; + +enum { + VCHIQ_SRVSTATE_FREE, + VCHIQ_SRVSTATE_HIDDEN, + VCHIQ_SRVSTATE_LISTENING, + VCHIQ_SRVSTATE_OPENING, + VCHIQ_SRVSTATE_OPEN, + VCHIQ_SRVSTATE_OPENSYNC, + VCHIQ_SRVSTATE_CLOSESENT, + VCHIQ_SRVSTATE_CLOSERECVD, + VCHIQ_SRVSTATE_CLOSEWAIT, + VCHIQ_SRVSTATE_CLOSED +}; + +enum vchiq_bulk_dir { + VCHIQ_BULK_TRANSMIT, + VCHIQ_BULK_RECEIVE +}; + +struct vchiq_bulk { + short mode; + short dir; + void *cb_data; + void __user *cb_userdata; + struct bulk_waiter *waiter; + dma_addr_t dma_addr; + int size; + void *remote_data; + int remote_size; + int actual; + void *offset; + void __user *uoffset; +}; + +struct vchiq_bulk_queue { + int local_insert; /* Where to insert the next local bulk */ + int remote_insert; /* Where to insert the next remote bulk (master) */ + int process; /* Bulk to transfer next */ + int remote_notify; /* Bulk to notify the remote client of next (mstr) */ + int remove; /* Bulk to notify the local client of, and remove, next */ + struct vchiq_bulk bulks[VCHIQ_NUM_SERVICE_BULKS]; +}; + +/* + * Remote events provide a way of presenting several virtual doorbells to a + * peer (ARM host to VPU) using only one physical doorbell. They can be thought + * of as a way for the peer to signal a semaphore, in this case implemented as + * a workqueue. + * + * Remote events remain signalled until acknowledged by the receiver, and they + * are non-counting. They are designed in such a way as to minimise the number + * of interrupts and avoid unnecessary waiting. + * + * A remote_event is a small data structure that lives in shared memory. It + * comprises two booleans - armed and fired: + * + * The sender sets fired when they signal the receiver. + * If fired is set, the receiver has been signalled and need not wait.
+ * The receiver sets the armed field before they begin to wait. + * If armed is set, the receiver is waiting and wishes to be woken by interrupt. + */ +struct remote_event { + int armed; + int fired; + u32 __unused; +}; + +struct opaque_platform_state; + +struct vchiq_slot { + char data[VCHIQ_SLOT_SIZE]; +}; + +struct vchiq_slot_info { + /* Use two counters rather than one to avoid the need for a mutex. */ + short use_count; + short release_count; +}; + +/* + * VCHIQ is a reliable connection-oriented datagram protocol. + * + * A VCHIQ service is equivalent to a TCP connection, except: + * + FOURCCs are used for the rendezvous, and port numbers are assigned at the + * time the connection is established. + * + There is less of a distinction between server and client sockets, the only + * difference being which end makes the first move. + * + For a multi-client server, the server creates new "listening" services as + * the existing one becomes connected - there is no need to specify the + * maximum number of clients up front. + * + Data transfer is reliable but packetized (messages have defined ends). + * + Messages can be either short (capable of fitting in a slot) and in-band, + * or copied between external buffers (bulk transfers). 
+ */ +struct vchiq_service { + struct vchiq_service_base base; + unsigned int handle; + struct kref ref_count; + struct rcu_head rcu; + int srvstate; + void (*userdata_term)(void *userdata); + unsigned int localport; + unsigned int remoteport; + int public_fourcc; + int client_id; + char auto_close; + char sync; + char closing; + char trace; + atomic_t poll_flags; + short version; + short version_min; + short peer_version; + + struct vchiq_state *state; + struct vchiq_instance *instance; + + int service_use_count; + + struct vchiq_bulk_queue bulk_tx; + struct vchiq_bulk_queue bulk_rx; + + struct completion remove_event; + struct completion bulk_remove_event; + struct mutex bulk_mutex; + + struct service_stats_struct { + int quota_stalls; + int slot_stalls; + int bulk_stalls; + int error_count; + int ctrl_tx_count; + int ctrl_rx_count; + int bulk_tx_count; + int bulk_rx_count; + int bulk_aborted_count; + u64 ctrl_tx_bytes; + u64 ctrl_rx_bytes; + u64 bulk_tx_bytes; + u64 bulk_rx_bytes; + } stats; + + int msg_queue_read; + int msg_queue_write; + struct completion msg_queue_pop; + struct completion msg_queue_push; + struct vchiq_header *msg_queue[VCHIQ_MAX_SLOTS]; +}; + +/* + * The quota information is outside struct vchiq_service so that it can + * be statically allocated, since for accounting reasons a service's slot + * usage is carried over between users of the same port number. + */ +struct vchiq_service_quota { + unsigned short slot_quota; + unsigned short slot_use_count; + unsigned short message_quota; + unsigned short message_use_count; + struct completion quota_event; + int previous_tx_index; +}; + +struct vchiq_shared_state { + /* A non-zero value here indicates that the content is valid. */ + int initialised; + + /* The first and last (inclusive) slots allocated to the owner. */ + int slot_first; + int slot_last; + + /* The slot allocated to synchronous messages from the owner. 
*/ + int slot_sync; + + /* + * Signalling this event indicates that owner's slot handler thread + * should run. + */ + struct remote_event trigger; + + /* + * Indicates the byte position within the stream where the next message + * will be written. The least significant bits are an index into the + * slot. The next bits are the index of the slot in slot_queue. + */ + int tx_pos; + + /* This event should be signalled when a slot is recycled. */ + struct remote_event recycle; + + /* The slot_queue index where the next recycled slot will be written. */ + int slot_queue_recycle; + + /* This event should be signalled when a synchronous message is sent. */ + struct remote_event sync_trigger; + + /* + * This event should be signalled when a synchronous message has been + * released. + */ + struct remote_event sync_release; + + /* A circular buffer of slot indexes. */ + int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE]; + + /* Debugging state */ + int debug[DEBUG_MAX]; +}; + +/* + * vchiq_slot_zero describes the memory shared between the ARM host and the + * VideoCore VPU. The "master" and "slave" states are owned by the respective + * sides but visible to the other; the slots are shared, and the remaining + * fields are read-only. + * + * In the configuration used by this implementation, the memory is allocated + * by the host, the VPU is the master (the side which controls the DMA for bulk + * transfers), and the host is the slave. + * + * The ownership of slots changes with use: + * + When empty they are owned by the sender. + * + When partially filled they are shared with the receiver. + * + When completely full they are owned by the receiver. + * + When the receiver has finished processing the contents, they are recycled + * back to the sender. 
+ */ +struct vchiq_slot_zero { + int magic; + short version; + short version_min; + int slot_zero_size; + int slot_size; + int max_slots; + int max_slots_per_side; + int platform_data[2]; + struct vchiq_shared_state master; + struct vchiq_shared_state slave; + struct vchiq_slot_info slots[VCHIQ_MAX_SLOTS]; +}; + +/* + * This is the private runtime state used by each side. The same structure was + * originally used by both sides, but implementations have since diverged. + */ +struct vchiq_state { + struct device *dev; + int id; + int initialised; + enum vchiq_connstate conn_state; + short version_common; + + struct vchiq_shared_state *local; + struct vchiq_shared_state *remote; + struct vchiq_slot *slot_data; + + unsigned short default_slot_quota; + unsigned short default_message_quota; + + /* Event indicating connect message received */ + struct completion connect; + + /* Mutex protecting services */ + struct mutex mutex; + struct vchiq_instance **instance; + + /* Processes all incoming messages which aren't synchronous */ + struct task_struct *slot_handler_thread; + + /* + * Slots which have been fully processed and released by the (peer) + * receiver are added to the receiver queue, which is asynchronously + * processed by the recycle thread. + */ + struct task_struct *recycle_thread; + + /* + * Processes incoming synchronous messages + * + * The synchronous message channel is shared between all synchronous + * services, and provides a way for urgent messages to bypass + * potentially long queues of asynchronous messages in the normal slots. + * + * There can be only one outstanding synchronous message in + * each direction, and as a precious shared resource synchronous + * services should be used sparingly. 
+ */ + struct task_struct *sync_thread; + + /* Local implementation of the trigger remote event */ + wait_queue_head_t trigger_event; + + /* Local implementation of the recycle remote event */ + wait_queue_head_t recycle_event; + + /* Local implementation of the sync trigger remote event */ + wait_queue_head_t sync_trigger_event; + + /* Local implementation of the sync release remote event */ + wait_queue_head_t sync_release_event; + + char *tx_data; + char *rx_data; + struct vchiq_slot_info *rx_info; + + struct mutex slot_mutex; + + struct mutex recycle_mutex; + + struct mutex sync_mutex; + + spinlock_t msg_queue_spinlock; + + spinlock_t bulk_waiter_spinlock; + + spinlock_t quota_spinlock; + + /* + * Indicates the byte position within the stream from where the next + * message will be read. The least significant bits are an index into + * the slot. The next bits are the index of the slot in + * remote->slot_queue. + */ + int rx_pos; + + /* + * A cached copy of local->tx_pos. Only write to local->tx_pos, and read + * from remote->tx_pos. + */ + int local_tx_pos; + + /* The slot_queue index of the slot to become available next. */ + int slot_queue_available; + + /* A flag to indicate if any poll has been requested */ + int poll_needed; + + /* The index of the previous slot used for data messages. */ + int previous_data_index; + + /* The number of slots occupied by data messages. */ + unsigned short data_use_count; + + /* The maximum number of slots to be occupied by data messages. */ + unsigned short data_quota; + + /* An array of bit sets indicating which services must be polled. */ + atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; + + /* The number of the first unused service */ + int unused_service; + + /* Signalled when a free slot becomes available. */ + struct completion slot_available_event; + + /* Signalled when a free data slot becomes available.
*/ + struct completion data_quota_event; + + struct state_stats_struct { + int slot_stalls; + int data_stalls; + int ctrl_tx_count; + int ctrl_rx_count; + int error_count; + } stats; + + struct vchiq_service __rcu *services[VCHIQ_MAX_SERVICES]; + struct vchiq_service_quota service_quotas[VCHIQ_MAX_SERVICES]; + struct vchiq_slot_info slot_info[VCHIQ_MAX_SLOTS]; + + struct opaque_platform_state *platform_state; +}; + +struct pagelist { + u32 length; + u16 type; + u16 offset; + u32 addrs[1]; /* N.B. 12 LSBs hold the number + * of following pages at consecutive + * addresses. + */ +}; + +struct vchiq_pagelist_info { + struct pagelist *pagelist; + size_t pagelist_buffer_size; + dma_addr_t dma_addr; + enum dma_data_direction dma_dir; + unsigned int num_pages; + unsigned int pages_need_release; + struct page **pages; + struct scatterlist *scatterlist; + unsigned int scatterlist_mapped; +}; + +static inline bool vchiq_remote_initialised(const struct vchiq_state *state) +{ + return state->remote && state->remote->initialised; +} + +struct bulk_waiter { + struct vchiq_bulk *bulk; + struct completion event; + int actual; +}; + +struct vchiq_config { + unsigned int max_msg_size; + unsigned int bulk_threshold; /* The message size above which it + * is better to use a bulk transfer + * (<= max_msg_size) + */ + unsigned int max_outstanding_bulks; + unsigned int max_services; + short version; /* The version of VCHIQ */ + short version_min; /* The minimum compatible version of VCHIQ */ +}; + +extern spinlock_t bulk_waiter_spinlock; + +extern const char * +get_conn_state_name(enum vchiq_connstate conn_state); + +extern struct vchiq_slot_zero * +vchiq_init_slots(struct device *dev, void *mem_base, int mem_size); + +extern int +vchiq_init_state(struct vchiq_state *state, struct vchiq_slot_zero *slot_zero, struct device *dev); + +extern int +vchiq_connect_internal(struct vchiq_state *state, struct vchiq_instance *instance); + +struct vchiq_service * +vchiq_add_service_internal(struct 
vchiq_state *state, + const struct vchiq_service_params_kernel *params, + int srvstate, struct vchiq_instance *instance, + void (*userdata_term)(void *userdata)); + +extern int +vchiq_open_service_internal(struct vchiq_service *service, int client_id); + +extern int +vchiq_close_service_internal(struct vchiq_service *service, int close_recvd); + +extern void +vchiq_terminate_service_internal(struct vchiq_service *service); + +extern void +vchiq_free_service_internal(struct vchiq_service *service); + +extern void +vchiq_shutdown_internal(struct vchiq_state *state, struct vchiq_instance *instance); + +extern void +remote_event_pollall(struct vchiq_state *state); + +extern int +vchiq_bulk_xfer_waiting(struct vchiq_instance *instance, unsigned int handle, + struct bulk_waiter *userdata); + +extern int +vchiq_bulk_xfer_blocking(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_bulk *bulk); + +extern int +vchiq_bulk_xfer_callback(struct vchiq_instance *instance, unsigned int handle, + struct vchiq_bulk *bulk); + +extern void +vchiq_dump_state(struct seq_file *f, struct vchiq_state *state); + +extern void +request_poll(struct vchiq_state *state, struct vchiq_service *service, + int poll_type); + +struct vchiq_service *handle_to_service(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_service_by_handle(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_service_by_port(struct vchiq_state *state, unsigned int localport); + +extern struct vchiq_service * +find_service_for_instance(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +find_closed_service_for_instance(struct vchiq_instance *instance, unsigned int handle); + +extern struct vchiq_service * +__next_service_by_instance(struct vchiq_state *state, + struct vchiq_instance *instance, + int *pidx); + +extern struct vchiq_service * +next_service_by_instance(struct vchiq_state 
*state, + struct vchiq_instance *instance, + int *pidx); + +extern void +vchiq_service_get(struct vchiq_service *service); + +extern void +vchiq_service_put(struct vchiq_service *service); + +extern int +vchiq_queue_message(struct vchiq_instance *instance, unsigned int handle, + ssize_t (*copy_callback)(void *context, void *dest, + size_t offset, size_t maxsize), + void *context, + size_t size); + +void vchiq_dump_platform_state(struct seq_file *f); + +void vchiq_dump_platform_instances(struct vchiq_state *state, struct seq_file *f); + +void vchiq_dump_platform_service_state(struct seq_file *f, struct vchiq_service *service); + +int vchiq_use_service_internal(struct vchiq_service *service); + +int vchiq_release_service_internal(struct vchiq_service *service); + +void vchiq_on_remote_use(struct vchiq_state *state); + +void vchiq_on_remote_release(struct vchiq_state *state); + +int vchiq_platform_init_state(struct vchiq_state *state); + +int vchiq_check_service(struct vchiq_service *service); + +int vchiq_send_remote_use(struct vchiq_state *state); + +int vchiq_send_remote_use_active(struct vchiq_state *state); + +void vchiq_platform_conn_state_changed(struct vchiq_state *state, + enum vchiq_connstate oldstate, + enum vchiq_connstate newstate); + +void vchiq_set_conn_state(struct vchiq_state *state, enum vchiq_connstate newstate); + +void vchiq_log_dump_mem(struct device *dev, const char *label, u32 addr, + const void *void_mem, size_t num_bytes); + +int vchiq_remove_service(struct vchiq_instance *instance, unsigned int service); + +int vchiq_get_client_id(struct vchiq_instance *instance, unsigned int service); + +void vchiq_get_config(struct vchiq_config *config); + +int vchiq_set_service_option(struct vchiq_instance *instance, unsigned int service, + enum vchiq_service_option option, int value); + +#endif diff --git a/include/linux/raspberrypi/vchiq_debugfs.h b/include/linux/raspberrypi/vchiq_debugfs.h new file mode 100644 index 000000000000..b29e6693c949 --- 
/dev/null +++ b/include/linux/raspberrypi/vchiq_debugfs.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2014 Raspberry Pi (Trading) Ltd. All rights reserved. */ + +#ifndef VCHIQ_DEBUGFS_H +#define VCHIQ_DEBUGFS_H + +struct vchiq_state; +struct vchiq_instance; + +struct vchiq_debugfs_node { + struct dentry *dentry; +}; + +void vchiq_debugfs_init(struct vchiq_state *state); + +void vchiq_debugfs_deinit(void); + +void vchiq_debugfs_add_instance(struct vchiq_instance *instance); + +void vchiq_debugfs_remove_instance(struct vchiq_instance *instance); + +#endif /* VCHIQ_DEBUGFS_H */ -- cgit v1.2.3 From 7b8a8ec20cfce2298f6737089f5d17407ea346b4 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 27 Oct 2025 11:34:01 +0200 Subject: PCI/TPH: Expose pcie_tph_get_st_table_loc() Expose pcie_tph_get_st_table_loc() to be used by drivers as will be done in the next patch from the series. Signed-off-by: Yishai Hadas Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251027-st-direct-mode-v1-1-e0ad953866b6@nvidia.com Acked-by: Bjorn Helgaas Signed-off-by: Leon Romanovsky --- include/linux/pci-tph.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h index 9e4e331b1603..ba28140ce670 100644 --- a/include/linux/pci-tph.h +++ b/include/linux/pci-tph.h @@ -29,6 +29,7 @@ int pcie_tph_get_cpu_st(struct pci_dev *dev, void pcie_disable_tph(struct pci_dev *pdev); int pcie_enable_tph(struct pci_dev *pdev, int mode); u16 pcie_tph_get_st_table_size(struct pci_dev *pdev); +u32 pcie_tph_get_st_table_loc(struct pci_dev *pdev); #else static inline int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) -- cgit v1.2.3 From 6948417b3f1fafbeab85c051f8dba5e305a8f9c4 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:53 +0200 Subject: net/mlx5: Add OTHER_ESWITCH HW capabilities Add OTHER_ESWITCH capabilities which includes 
other_eswitch and eswitch_owner_vhca_id to all steering objects. Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-1-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 47 ++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 07614cd95bed..9b8f88987d2f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5251,13 +5251,15 @@ struct mlx5_ifc_set_fte_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8809,13 +8811,15 @@ struct mlx5_ifc_destroy_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8840,13 +8844,15 @@ struct mlx5_ifc_destroy_flow_group_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -8985,13 +8991,15 @@ struct mlx5_ifc_delete_fte_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + 
u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -9535,13 +9543,15 @@ struct mlx5_ifc_create_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x20]; @@ -9580,7 +9590,8 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x20]; @@ -9588,7 +9599,7 @@ struct mlx5_ifc_create_flow_group_in_bits { u8 table_type[0x8]; u8 reserved_at_88[0x4]; u8 group_type[0x4]; - u8 reserved_at_90[0x10]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; @@ -11876,10 +11887,12 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; - u8 reserved_at_60[0x20]; + u8 reserved_at_60[0x10]; + u8 eswitch_owner_vhca_id[0x10]; u8 table_type[0x8]; u8 reserved_at_88[0x7]; @@ -11919,14 +11932,16 @@ struct mlx5_ifc_modify_flow_table_in_bits { u8 op_mod[0x10]; u8 other_vport[0x1]; - u8 reserved_at_41[0xf]; + u8 other_eswitch[0x1]; + u8 reserved_at_42[0xe]; u8 vport_number[0x10]; u8 reserved_at_60[0x10]; u8 modify_field_select[0x10]; u8 table_type[0x8]; - u8 reserved_at_88[0x18]; + u8 reserved_at_88[0x8]; + u8 eswitch_owner_vhca_id[0x10]; u8 reserved_at_a0[0x8]; u8 table_id[0x18]; -- cgit v1.2.3 From 3b848dec7e821bace785b9e405bf1884c077635a Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:54 +0200 Subject: net/mlx5: fs, Add other_eswitch support for steering tables Add other_eswitch support which allows flow tables creation above vports that reside on different 
esw managers. The new flag MLX5_FLOW_TABLE_OTHER_ESWITCH indicates if the esw_owner_vhca_id attribute is supported. Note that this is only supported if the Advanced-RDMA cap- rdma_transport_manager_other_eswitch is set. And it is the caller responsibility to check that. Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-2-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6ac76a0c3827..6325a7fa0df2 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -71,6 +71,7 @@ enum { MLX5_FLOW_TABLE_UNMANAGED = BIT(3), MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4), MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5), + MLX5_FLOW_TABLE_OTHER_ESWITCH = BIT(6), }; #define LEFTOVERS_RULE_NUM 2 @@ -208,6 +209,7 @@ struct mlx5_flow_table_attr { u32 flags; u16 uid; u16 vport; + u16 esw_owner_vhca_id; struct mlx5_flow_table *next_ft; struct { -- cgit v1.2.3 From 583b4fe1c19d978bb787e0adf9ce469cb7f68455 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 29 Oct 2025 17:42:55 +0200 Subject: net/mlx5: fs, set non default device per namespace Add mlx5_fs_set_root_dev() function which swaps the root namespace core device with another one for a given table_type. It is intended for usage only by RDMA_TRANSPORT tables in case of LAG configuration, to allow the creation of tables during LAG always through the LAG master device, which is valid since during LAG the master is allowed to manage the RDMA_TRANSPORT tables of its slaves. In addition move the table_type enum to global include to allow its use in a downstream patch in the RDMA driver. 
Signed-off-by: Patrisious Haddad Signed-off-by: Edward Srouji Link: https://patch.msgid.link/20251029-support-other-eswitch-v1-3-98bb707b5d57@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/fs.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6325a7fa0df2..fe721557bd1d 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -128,6 +128,24 @@ enum { FDB_PER_VPORT, }; +enum fs_flow_table_type { + FS_FT_NIC_RX = 0x0, + FS_FT_NIC_TX = 0x1, + FS_FT_ESW_EGRESS_ACL = 0x2, + FS_FT_ESW_INGRESS_ACL = 0x3, + FS_FT_FDB = 0X4, + FS_FT_SNIFFER_RX = 0X5, + FS_FT_SNIFFER_TX = 0X6, + FS_FT_RDMA_RX = 0X7, + FS_FT_RDMA_TX = 0X8, + FS_FT_PORT_SEL = 0X9, + FS_FT_FDB_RX = 0xa, + FS_FT_FDB_TX = 0xb, + FS_FT_RDMA_TRANSPORT_RX = 0xd, + FS_FT_RDMA_TRANSPORT_TX = 0xe, + FS_FT_MAX_TYPE = FS_FT_RDMA_TRANSPORT_TX, +}; + struct mlx5_pkt_reformat; struct mlx5_modify_hdr; struct mlx5_flow_definer; @@ -355,4 +373,8 @@ u32 mlx5_flow_table_id(struct mlx5_flow_table *ft); struct mlx5_flow_root_namespace * mlx5_get_root_namespace(struct mlx5_core_dev *dev, enum mlx5_flow_namespace_type ns_type); + +int mlx5_fs_set_root_dev(struct mlx5_core_dev *dev, + struct mlx5_core_dev *new_dev, + enum fs_flow_table_type table_type); #endif -- cgit v1.2.3 From 1d165919c8261b927f8dc8cfe61eb04342bedb7e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 25 Oct 2025 19:47:59 -0700 Subject: iio: imu: adis: fix all kernel-doc warnings in header file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct and add to adis.h to resolve all kernel-doc warnings: - add a missing struct member description - change one non-kernel-doc comment to use /* instead of /** - correct function parameter @value to @val (7 locations) - add function return value comments (13 locations) Warning: include/linux/iio/imu/adis.h:97 struct member 'has_fifo' not 
described in 'adis_data' Warning: include/linux/iio/imu/adis.h:139 Incorrect use of kernel-doc format: * The state_lock is meant to be used during operations that require Warning: include/linux/iio/imu/adis.h:158 struct member '"__adis_"' not described in 'adis' Warning: include/linux/iio/imu/adis.h:264 function parameter 'val' not described in 'adis_write_reg' Warning: include/linux/iio/imu/adis.h:371 No description found for return value of 'adis_update_bits_base' Signed-off-by: Randy Dunlap Reviewed-by: Nuno Sá Reviewed-by: Andy Shevchenko Signed-off-by: Jonathan Cameron --- include/linux/iio/imu/adis.h | 45 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index aa160511e265..bfb6df68e6c9 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -57,6 +57,7 @@ struct adis_timeout { * @enable_irq: Hook for ADIS devices that have a special IRQ enable/disable * @unmasked_drdy: True for devices that cannot mask/unmask the data ready pin * @has_paging: True if ADIS device has paged registers + * @has_fifo: True if ADIS device has a hardware FIFO * @burst_reg_cmd: Register command that triggers burst * @burst_len: Burst size in the SPI RX buffer. If @burst_max_len is defined, * this should be the minimum size supported by the device. 
@@ -136,7 +137,7 @@ struct adis { const struct adis_data *data; unsigned int burst_extra_len; const struct adis_ops *ops; - /** + /* * The state_lock is meant to be used during operations that require * a sequence of SPI R/W in order to protect the SPI transfer * information (fields 'xfer', 'msg' & 'current_page') between @@ -166,7 +167,7 @@ int __adis_reset(struct adis *adis); * adis_reset() - Reset the device * @adis: The adis device * - * Returns 0 on success, a negative error code otherwise + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_reset(struct adis *adis) { @@ -183,7 +184,9 @@ int __adis_read_reg(struct adis *adis, unsigned int reg, * __adis_write_reg_8() - Write single byte to a register (unlocked) * @adis: The adis device * @reg: The address of the register to be written - * @value: The value to write + * @val: The value to write + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) @@ -195,7 +198,9 @@ static inline int __adis_write_reg_8(struct adis *adis, unsigned int reg, * __adis_write_reg_16() - Write 2 bytes to a pair of registers (unlocked) * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) @@ -207,7 +212,9 @@ static inline int __adis_write_reg_16(struct adis *adis, unsigned int reg, * __adis_write_reg_32() - write 4 bytes to four registers (unlocked) * @adis: The adis device * @reg: The address of the lower of the four register - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) @@ -220,6 +227,8 @@ static inline int 
__adis_write_reg_32(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, u16 *val) @@ -239,6 +248,8 @@ static inline int __adis_read_reg_16(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, u32 *val) @@ -257,8 +268,10 @@ static inline int __adis_read_reg_32(struct adis *adis, unsigned int reg, * adis_write_reg() - write N bytes to register * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: The value to write to device (up to 4 bytes) + * @val: The value to write to device (up to 4 bytes) * @size: The size of the @value (in bytes) + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg(struct adis *adis, unsigned int reg, unsigned int val, unsigned int size) @@ -273,6 +286,8 @@ static inline int adis_write_reg(struct adis *adis, unsigned int reg, * @reg: The address of the lower of the two registers * @val: The value read back from the device * @size: The size of the @val buffer + * + * Returns: %0 on success, a negative error code otherwise */ static int adis_read_reg(struct adis *adis, unsigned int reg, unsigned int *val, unsigned int size) @@ -285,7 +300,9 @@ static int adis_read_reg(struct adis *adis, unsigned int reg, * adis_write_reg_8() - Write single byte to a register * @adis: The adis device * @reg: The address of the register to be written - * @value: The value to write + * @val: The value to write + * + * Returns: %0 on success, a negative error code otherwise */ static inline int 
adis_write_reg_8(struct adis *adis, unsigned int reg, u8 val) @@ -297,7 +314,9 @@ static inline int adis_write_reg_8(struct adis *adis, unsigned int reg, * adis_write_reg_16() - Write 2 bytes to a pair of registers * @adis: The adis device * @reg: The address of the lower of the two registers - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg, u16 val) @@ -309,7 +328,9 @@ static inline int adis_write_reg_16(struct adis *adis, unsigned int reg, * adis_write_reg_32() - write 4 bytes to four registers * @adis: The adis device * @reg: The address of the lower of the four register - * @value: Value to be written + * @val: Value to be written + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg, u32 val) @@ -322,6 +343,8 @@ static inline int adis_write_reg_32(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg, u16 *val) @@ -341,6 +364,8 @@ static inline int adis_read_reg_16(struct adis *adis, unsigned int reg, * @adis: The adis device * @reg: The address of the lower of the two registers * @val: The value read back from the device + * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_read_reg_32(struct adis *adis, unsigned int reg, u32 *val) @@ -366,6 +391,8 @@ int __adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, * @size: Size of the register to update * * Updates the desired bits of @reg in accordance with @mask and @val. 
+ * + * Returns: %0 on success, a negative error code otherwise */ static inline int adis_update_bits_base(struct adis *adis, unsigned int reg, const u32 mask, const u32 val, u8 size) -- cgit v1.2.3 From aaf46c6a6df6052881c2e75cba65aeb6f1cfa88a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 23 Oct 2025 22:21:02 -0700 Subject: tee: : - add ending ':' to some struct members as needed for kernel-doc - change struct name in kernel-doc to match the actual struct name (2x) - add a @params: kernel-doc entry multiple times Warning: tee.h:265 struct member 'ret_origin' not described in 'tee_ioctl_open_session_arg' Warning: tee.h:265 struct member 'num_params' not described in 'tee_ioctl_open_session_arg' Warning: tee.h:265 struct member 'params' not described in 'tee_ioctl_open_session_arg' Warning: tee.h:351 struct member 'num_params' not described in 'tee_iocl_supp_recv_arg' Warning: tee.h:351 struct member 'params' not described in 'tee_iocl_supp_recv_arg' Warning: tee.h:372 struct member 'num_params' not described in 'tee_iocl_supp_send_arg' Warning: tee.h:372 struct member 'params' not described in 'tee_iocl_supp_send_arg' Warning: tee.h:298: expecting prototype for struct tee_ioctl_invoke_func_arg. Prototype was for struct tee_ioctl_invoke_arg instead Warning: tee.h:473: expecting prototype for struct tee_ioctl_invoke_func_arg. 
Prototype was for struct tee_ioctl_object_invoke_arg instead Signed-off-by: Randy Dunlap Reviewed-by: Sumit Garg Signed-off-by: Jens Wiklander --- include/uapi/linux/tee.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h index 386ad36f1a0a..cab5cadca8ef 100644 --- a/include/uapi/linux/tee.h +++ b/include/uapi/linux/tee.h @@ -249,8 +249,9 @@ struct tee_ioctl_param { * @cancel_id: [in] Cancellation id, a unique value to identify this request * @session: [out] Session id * @ret: [out] return value - * @ret_origin [out] origin of the return value - * @num_params [in] number of parameters following this struct + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters */ struct tee_ioctl_open_session_arg { __u8 uuid[TEE_IOCTL_UUID_LEN]; @@ -276,14 +277,14 @@ struct tee_ioctl_open_session_arg { struct tee_ioctl_buf_data) /** - * struct tee_ioctl_invoke_func_arg - Invokes a function in a Trusted - * Application + * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted Application * @func: [in] Trusted Application function, specific to the TA * @session: [in] Session id * @cancel_id: [in] Cancellation id, a unique value to identify this request * @ret: [out] return value - * @ret_origin [out] origin of the return value - * @num_params [in] number of parameters following this struct + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of parameters following this struct + * @params: array of ioctl parameters */ struct tee_ioctl_invoke_arg { __u32 func; @@ -338,7 +339,8 @@ struct tee_ioctl_close_session_arg { /** * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function * @func: [in] supplicant function - * @num_params [in/out] number of parameters following this struct + * @num_params: [in/out] number 
of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters * * @num_params is the number of params that tee-supplicant has room to * receive when input, @num_params is the number of actual params @@ -363,7 +365,8 @@ struct tee_iocl_supp_recv_arg { /** * struct tee_iocl_supp_send_arg - Send a response to a received request * @ret: [out] return value - * @num_params [in] number of parameters following this struct + * @num_params: [in] number of &struct tee_ioctl_param entries in @params + * @params: array of ioctl parameters */ struct tee_iocl_supp_send_arg { __u32 ret; @@ -454,11 +457,13 @@ struct tee_ioctl_shm_register_fd_data { */ /** - * struct tee_ioctl_invoke_func_arg - Invokes an object in a Trusted Application + * struct tee_ioctl_object_invoke_arg - Invokes an object in a + * Trusted Application * @id: [in] Object id * @op: [in] Object operation, specific to the object * @ret: [out] return value * @num_params: [in] number of parameters following this struct + * @params: array of ioctl parameters */ struct tee_ioctl_object_invoke_arg { __u64 id; -- cgit v1.2.3 From 7cd3d204412b0584df38fd7be20002137f34721a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:23 +0100 Subject: ns: don't increment or decrement initial namespaces There's no need to bump the active reference counts of initial namespaces as they're always active and can simply remain at 1. 
Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-2-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index bd4492ef6ffc..791b18dc77d0 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -141,6 +141,12 @@ static __always_inline bool is_initial_namespace(struct ns_common *ns) IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); } +static __always_inline bool is_ns_init_id(const struct ns_common *ns) +{ + VFS_WARN_ON_ONCE(ns->ns_id == 0); + return ns->ns_id <= NS_LAST_INIT_ID; +} + #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns)->ns, \ @@ -285,14 +291,19 @@ void __ns_ref_active_get_owner(struct ns_common *ns); static __always_inline void __ns_ref_active_get(struct ns_common *ns) { - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); - VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); + /* Initial namespaces are always active. */ + if (!is_ns_init_id(ns)) + WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); } #define ns_ref_active_get(__ns) \ do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) { + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return true; + if (atomic_inc_not_zero(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); return true; @@ -307,6 +318,10 @@ void __ns_ref_active_put_owner(struct ns_common *ns); static __always_inline void __ns_ref_active_put(struct ns_common *ns) { + /* Initial namespaces are always active. 
*/ + if (is_ns_init_id(ns)) + return; + if (atomic_dec_and_test(&ns->__ns_ref_active)) { VFS_WARN_ON_ONCE(is_initial_namespace(ns)); VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); @@ -319,8 +334,10 @@ static __always_inline void __ns_ref_active_put(struct ns_common *ns) static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); - if (!__ns_ref_active_read(ns)) + if (!__ns_ref_active_read(ns)) { + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; + } if (!__ns_ref_get(ns)) return NULL; return ns; -- cgit v1.2.3 From f8d5a8970d2f49411824fb1fdd34bbb3eea22756 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:26 +0100 Subject: ns: handle setns(pidfd, ...) cleanly The setns() system call supports: (1) namespace file descriptors (nsfd) (2) process file descriptors (pidfd) When using nsfds the namespaces will remain active because they are pinned by the vfs. However, when pidfds are used things are more complicated. When the target task exits and passes through exit_nsproxy_namespaces() or is reaped and thus also passes through exit_cred_namespaces() after the setns()'ing task has called prepare_nsset() but before the active reference count of the set of namespaces it wants to setns() to might have been dropped already: P1 P2 pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) pidfd = pidfd_open(pid_p1) setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS) prepare_nsset() exit(0) // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 1 -> exit_nsproxy_namespaces() -> exit_cred_namespaces() // ns_active_ref_put() will also put // the reference on the owner of the // namespace. If the only reason the // owning namespace was alive was // because it was a parent of @ns // its active reference count now goes // to zero...
-------------------------------- // | // ns->__ns_active_ref == 0 | // parent_ns->__ns_active_ref == 0 | | commit_nsset() -----------------> // If setns() // now manages to install the namespaces // it will call ns_active_ref_get() // on them thus bumping the active reference // count from zero again but without also // taking the required reference on the owner. // Thus we get: // // ns->__ns_active_ref == 1 // parent_ns->__ns_active_ref == 0 When later someone does ns_active_ref_put() on @ns it will underflow parent_ns->__ns_active_ref leading to a splat from our asserts thinking there are still active references when in fact the counter just underflowed. So resurrect the ownership chain if necessary as well. If the caller succeeded to grab passive references to the set of namespaces the setns() should simply succeed even if the target task exits or gets reaped in the meantime and thus has dropped all active references to its namespaces. The race is rare and can only be triggered when using pidfds to setns() to namespaces. Also note that active references on initial namespaces are nops. Since we now always handle parent references directly we can drop ns_ref_active_get_owner() when adding a namespace to a namespace tree. This is now all handled uniformly in the places where the new namespaces actually become active.
Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-5-ae8a4ad5a3b3@kernel.org Fixes: 3c9820d5c64a ("ns: add active reference count") Reported-by: syzbot+1957b26299cf3ff7890c@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 47 ++++------------------------------------------- 1 file changed, 4 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 791b18dc77d0..3aaba2ca31d7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -287,47 +287,8 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) -void __ns_ref_active_get_owner(struct ns_common *ns); +void __ns_ref_active_put(struct ns_common *ns); -static __always_inline void __ns_ref_active_get(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (!is_ns_init_id(ns)) - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); -} -#define ns_ref_active_get(__ns) \ - do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) - -static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) -{ - /* Initial namespaces are always active. */ - if (is_ns_init_id(ns)) - return true; - - if (atomic_inc_not_zero(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - return true; - } - return false; -} - -#define ns_ref_active_get_owner(__ns) \ - do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) - -void __ns_ref_active_put_owner(struct ns_common *ns); - -static __always_inline void __ns_ref_active_put(struct ns_common *ns) -{ - /* Initial namespaces are always active. 
*/ - if (is_ns_init_id(ns)) - return; - - if (atomic_dec_and_test(&ns->__ns_ref_active)) { - VFS_WARN_ON_ONCE(is_initial_namespace(ns)); - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); - __ns_ref_active_put_owner(ns); - } -} #define ns_ref_active_put(__ns) \ do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) @@ -343,9 +304,9 @@ static __always_inline struct ns_common *__must_check ns_get_unless_inactive(str return ns; } -void __ns_ref_active_resurrect(struct ns_common *ns); +void __ns_ref_active_get(struct ns_common *ns); -#define ns_ref_active_resurrect(__ns) \ - do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) +#define ns_ref_active_get(__ns) \ + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) #endif -- cgit v1.2.3 From 57b39aabb99ea69b9046df2915404a931d9d6695 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 9 Nov 2025 22:11:27 +0100 Subject: ns: add asserts for active refcount underflow Add a few more asserts to detect active reference count underflows.
Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-6-ae8a4ad5a3b3@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 3aaba2ca31d7..66ea09b48377 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -294,7 +294,6 @@ void __ns_ref_active_put(struct ns_common *ns); static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) { - VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && !__ns_ref_read(ns)); if (!__ns_ref_active_read(ns)) { VFS_WARN_ON_ONCE(is_ns_init_id(ns)); return NULL; -- cgit v1.2.3 From 69674282fc97fffd98a85ab5b4837edbc5898145 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:49 +0100 Subject: wifi: ieee80211: split mesh definitions out The ieee80211.h file has gotten very long, start splitting it by putting mesh definitions into a separate file. Link: https://patch.msgid.link/20251105153843.489713ca8b34.I3befb4bf6ace0315758a1794224ddd18c4652e32@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-mesh.h | 230 +++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 211 +------------------------------------ 2 files changed, 232 insertions(+), 209 deletions(-) create mode 100644 include/linux/ieee80211-mesh.h (limited to 'include') diff --git a/include/linux/ieee80211-mesh.h b/include/linux/ieee80211-mesh.h new file mode 100644 index 000000000000..4b829bcb38b6 --- /dev/null +++ b/include/linux/ieee80211-mesh.h @@ -0,0 +1,230 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 mesh definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_MESH_H +#define LINUX_IEEE80211_MESH_H + +#include <linux/types.h> +#include <linux/if_ether.h> + +#define IEEE80211_MAX_MESH_ID_LEN 32 + +struct ieee80211s_hdr { + u8 flags; + u8 ttl; + __le32 seqnum; + u8 eaddr1[ETH_ALEN]; + u8 eaddr2[ETH_ALEN]; +} __packed __aligned(2); + +/* Mesh flags */ +#define MESH_FLAGS_AE_A4 0x1 +#define MESH_FLAGS_AE_A5_A6 0x2 +#define MESH_FLAGS_AE 0x3 +#define MESH_FLAGS_PS_DEEP 0x4 + +/** + * enum ieee80211_preq_flags - mesh PREQ element flags + * + * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield + */ +enum ieee80211_preq_flags { + IEEE80211_PREQ_PROACTIVE_PREP_FLAG = 1<<2, +}; + +/** + * enum ieee80211_preq_target_flags - mesh PREQ element per target flags + * + * @IEEE80211_PREQ_TO_FLAG: target only subfield + * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield + */ +enum ieee80211_preq_target_flags { + IEEE80211_PREQ_TO_FLAG = 1<<0, + IEEE80211_PREQ_USN_FLAG = 1<<2, +}; + +/** + * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE + * @mesh_ttl: Time To Live + * @mesh_flags: Flags + * @mesh_reason: Reason Code + * @mesh_pre_value: Precedence Value + * + * This structure represents the payload of the "Mesh Channel Switch + * Parameters element" as described in IEEE Std 802.11-2020 section + * 9.4.2.102.
+ */ +struct ieee80211_mesh_chansw_params_ie { + u8 mesh_ttl; + u8 mesh_flags; + __le16 mesh_reason; + __le16 mesh_pre_value; +} __packed; + +/** + * struct ieee80211_meshconf_ie - Mesh Configuration element + * @meshconf_psel: Active Path Selection Protocol Identifier + * @meshconf_pmetric: Active Path Selection Metric Identifier + * @meshconf_congest: Congestion Control Mode Identifier + * @meshconf_synch: Synchronization Method Identifier + * @meshconf_auth: Authentication Protocol Identifier + * @meshconf_form: Mesh Formation Info + * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags) + * + * This structure represents the payload of the "Mesh Configuration + * element" as described in IEEE Std 802.11-2020 section 9.4.2.97. + */ +struct ieee80211_meshconf_ie { + u8 meshconf_psel; + u8 meshconf_pmetric; + u8 meshconf_congest; + u8 meshconf_synch; + u8 meshconf_auth; + u8 meshconf_form; + u8 meshconf_cap; +} __packed; + +/** + * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags + * + * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish + * additional mesh peerings with other mesh STAs + * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs + * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure + * is ongoing + * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has + * neighbors in deep sleep mode + * + * Enumerates the "Mesh Capability" as described in IEEE Std + * 802.11-2020 section 9.4.2.97.7. 
+ */ +enum mesh_config_capab_flags { + IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS = 0x01, + IEEE80211_MESHCONF_CAPAB_FORWARDING = 0x08, + IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING = 0x20, + IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL = 0x40, +}; + +#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1 + +/* + * mesh channel switch parameters element's flag indicator + * + */ +#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0) +#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1) +#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2) + +/** + * struct ieee80211_rann_ie - RANN (root announcement) element + * @rann_flags: Flags + * @rann_hopcount: Hop Count + * @rann_ttl: Element TTL + * @rann_addr: Root Mesh STA Address + * @rann_seq: HWMP Sequence Number + * @rann_interval: Interval + * @rann_metric: Metric + * + * This structure represents the payload of the "RANN element" as + * described in IEEE Std 802.11-2020 section 9.4.2.111. + */ +struct ieee80211_rann_ie { + u8 rann_flags; + u8 rann_hopcount; + u8 rann_ttl; + u8 rann_addr[ETH_ALEN]; + __le32 rann_seq; + __le32 rann_interval; + __le32 rann_metric; +} __packed; + +enum ieee80211_rann_flags { + RANN_FLAG_IS_GATE = 1 << 0, +}; + +/* Mesh action codes */ +enum ieee80211_mesh_actioncode { + WLAN_MESH_ACTION_LINK_METRIC_REPORT, + WLAN_MESH_ACTION_HWMP_PATH_SELECTION, + WLAN_MESH_ACTION_GATE_ANNOUNCEMENT, + WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION, + WLAN_MESH_ACTION_MCCA_SETUP_REQUEST, + WLAN_MESH_ACTION_MCCA_SETUP_REPLY, + WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST, + WLAN_MESH_ACTION_MCCA_ADVERTISEMENT, + WLAN_MESH_ACTION_MCCA_TEARDOWN, + WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST, + WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE, +}; + +/** + * enum ieee80211_mesh_sync_method - mesh synchronization method identifier + * + * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method + * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method + * that will be specified in a vendor specific 
information element + */ +enum ieee80211_mesh_sync_method { + IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1, + IEEE80211_SYNC_METHOD_VENDOR = 255, +}; + +/** + * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier + * + * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol + * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will + * be specified in a vendor specific information element + */ +enum ieee80211_mesh_path_protocol { + IEEE80211_PATH_PROTOCOL_HWMP = 1, + IEEE80211_PATH_PROTOCOL_VENDOR = 255, +}; + +/** + * enum ieee80211_mesh_path_metric - mesh path selection metric identifier + * + * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric + * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be + * specified in a vendor specific information element + */ +enum ieee80211_mesh_path_metric { + IEEE80211_PATH_METRIC_AIRTIME = 1, + IEEE80211_PATH_METRIC_VENDOR = 255, +}; + +/** + * enum ieee80211_root_mode_identifier - root mesh STA mode identifier + * + * These attributes are used by dot11MeshHWMPRootMode to set root mesh STA mode + * + * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default) + * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if greater than + * this value + * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA supports + * the proactive PREQ with proactive PREP subfield set to 0 + * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA + * supports the proactive PREQ with proactive PREP subfield set to 1 + * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA supports + * the proactive RANN + */ +enum ieee80211_root_mode_identifier { + IEEE80211_ROOTMODE_NO_ROOT = 0, + IEEE80211_ROOTMODE_ROOT = 1, + IEEE80211_PROACTIVE_PREQ_NO_PREP = 2, + IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3, + IEEE80211_PROACTIVE_RANN = 4, +}; + +#endif /* LINUX_IEEE80211_MESH_H */ diff --git a/include/linux/ieee80211.h
b/include/linux/ieee80211.h index ddff9102f633..fe78b150ab45 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -252,8 +252,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_SSID_LEN 32 -#define IEEE80211_MAX_MESH_ID_LEN 32 - #define IEEE80211_FIRST_TSPEC_TSID 8 #define IEEE80211_NUM_TIDS 16 @@ -881,40 +879,6 @@ static inline u16 ieee80211_get_sn(struct ieee80211_hdr *hdr) return le16_get_bits(hdr->seq_ctrl, IEEE80211_SCTL_SEQ); } -struct ieee80211s_hdr { - u8 flags; - u8 ttl; - __le32 seqnum; - u8 eaddr1[ETH_ALEN]; - u8 eaddr2[ETH_ALEN]; -} __packed __aligned(2); - -/* Mesh flags */ -#define MESH_FLAGS_AE_A4 0x1 -#define MESH_FLAGS_AE_A5_A6 0x2 -#define MESH_FLAGS_AE 0x3 -#define MESH_FLAGS_PS_DEEP 0x4 - -/** - * enum ieee80211_preq_flags - mesh PREQ element flags - * - * @IEEE80211_PREQ_PROACTIVE_PREP_FLAG: proactive PREP subfield - */ -enum ieee80211_preq_flags { - IEEE80211_PREQ_PROACTIVE_PREP_FLAG = 1<<2, -}; - -/** - * enum ieee80211_preq_target_flags - mesh PREQ element per target flags - * - * @IEEE80211_PREQ_TO_FLAG: target only subfield - * @IEEE80211_PREQ_USN_FLAG: unknown target HWMP sequence number subfield - */ -enum ieee80211_preq_target_flags { - IEEE80211_PREQ_TO_FLAG = 1<<0, - IEEE80211_PREQ_USN_FLAG = 1<<2, -}; - /** * struct ieee80211_quiet_ie - Quiet element * @count: Quiet Count @@ -993,24 +957,6 @@ struct ieee80211_sec_chan_offs_ie { u8 sec_chan_offs; } __packed; -/** - * struct ieee80211_mesh_chansw_params_ie - mesh channel switch parameters IE - * @mesh_ttl: Time To Live - * @mesh_flags: Flags - * @mesh_reason: Reason Code - * @mesh_pre_value: Precedence Value - * - * This structure represents the payload of the "Mesh Channel Switch - * Parameters element" as described in IEEE Std 802.11-2020 section - * 9.4.2.102. 
- */ -struct ieee80211_mesh_chansw_params_ie { - u8 mesh_ttl; - u8 mesh_flags; - __le16 mesh_reason; - __le16 mesh_pre_value; -} __packed; - /** * struct ieee80211_wide_bw_chansw_ie - wide bandwidth channel switch IE * @new_channel_width: New Channel Width @@ -1051,87 +997,6 @@ struct ieee80211_tim_ie { }; } __packed; -/** - * struct ieee80211_meshconf_ie - Mesh Configuration element - * @meshconf_psel: Active Path Selection Protocol Identifier - * @meshconf_pmetric: Active Path Selection Metric Identifier - * @meshconf_congest: Congestion Control Mode Identifier - * @meshconf_synch: Synchronization Method Identifier - * @meshconf_auth: Authentication Protocol Identifier - * @meshconf_form: Mesh Formation Info - * @meshconf_cap: Mesh Capability (see &enum mesh_config_capab_flags) - * - * This structure represents the payload of the "Mesh Configuration - * element" as described in IEEE Std 802.11-2020 section 9.4.2.97. - */ -struct ieee80211_meshconf_ie { - u8 meshconf_psel; - u8 meshconf_pmetric; - u8 meshconf_congest; - u8 meshconf_synch; - u8 meshconf_auth; - u8 meshconf_form; - u8 meshconf_cap; -} __packed; - -/** - * enum mesh_config_capab_flags - Mesh Configuration IE capability field flags - * - * @IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS: STA is willing to establish - * additional mesh peerings with other mesh STAs - * @IEEE80211_MESHCONF_CAPAB_FORWARDING: the STA forwards MSDUs - * @IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING: TBTT adjustment procedure - * is ongoing - * @IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL: STA is in deep sleep mode or has - * neighbors in deep sleep mode - * - * Enumerates the "Mesh Capability" as described in IEEE Std - * 802.11-2020 section 9.4.2.97.7. 
- */ -enum mesh_config_capab_flags { - IEEE80211_MESHCONF_CAPAB_ACCEPT_PLINKS = 0x01, - IEEE80211_MESHCONF_CAPAB_FORWARDING = 0x08, - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING = 0x20, - IEEE80211_MESHCONF_CAPAB_POWER_SAVE_LEVEL = 0x40, -}; - -#define IEEE80211_MESHCONF_FORM_CONNECTED_TO_GATE 0x1 - -/* - * mesh channel switch parameters element's flag indicator - * - */ -#define WLAN_EID_CHAN_SWITCH_PARAM_TX_RESTRICT BIT(0) -#define WLAN_EID_CHAN_SWITCH_PARAM_INITIATOR BIT(1) -#define WLAN_EID_CHAN_SWITCH_PARAM_REASON BIT(2) - -/** - * struct ieee80211_rann_ie - RANN (root announcement) element - * @rann_flags: Flags - * @rann_hopcount: Hop Count - * @rann_ttl: Element TTL - * @rann_addr: Root Mesh STA Address - * @rann_seq: HWMP Sequence Number - * @rann_interval: Interval - * @rann_metric: Metric - * - * This structure represents the payload of the "RANN element" as - * described in IEEE Std 802.11-2020 section 9.4.2.111. - */ -struct ieee80211_rann_ie { - u8 rann_flags; - u8 rann_hopcount; - u8 rann_ttl; - u8 rann_addr[ETH_ALEN]; - __le32 rann_seq; - __le32 rann_interval; - __le32 rann_metric; -} __packed; - -enum ieee80211_rann_flags { - RANN_FLAG_IS_GATE = 1 << 0, -}; - enum ieee80211_ht_chanwidth_values { IEEE80211_HT_CHANWIDTH_20MHZ = 0, IEEE80211_HT_CHANWIDTH_ANY = 1, @@ -3971,21 +3836,6 @@ enum ieee80211_self_protected_actioncode { WLAN_SP_MGK_ACK = 5, }; -/* Mesh action codes */ -enum ieee80211_mesh_actioncode { - WLAN_MESH_ACTION_LINK_METRIC_REPORT, - WLAN_MESH_ACTION_HWMP_PATH_SELECTION, - WLAN_MESH_ACTION_GATE_ANNOUNCEMENT, - WLAN_MESH_ACTION_CONGESTION_CONTROL_NOTIFICATION, - WLAN_MESH_ACTION_MCCA_SETUP_REQUEST, - WLAN_MESH_ACTION_MCCA_SETUP_REPLY, - WLAN_MESH_ACTION_MCCA_ADVERTISEMENT_REQUEST, - WLAN_MESH_ACTION_MCCA_ADVERTISEMENT, - WLAN_MESH_ACTION_MCCA_TEARDOWN, - WLAN_MESH_ACTION_TBTT_ADJUSTMENT_REQUEST, - WLAN_MESH_ACTION_TBTT_ADJUSTMENT_RESPONSE, -}; - /* Unprotected WNM action codes */ enum ieee80211_unprotected_wnm_actioncode { 
WLAN_UNPROTECTED_WNM_ACTION_TIM = 0, @@ -4198,65 +4048,6 @@ enum ieee80211_tdls_actioncode { /* BSS Coex IE information field bits */ #define WLAN_BSS_COEX_INFORMATION_REQUEST BIT(0) -/** - * enum ieee80211_mesh_sync_method - mesh synchronization method identifier - * - * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method - * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method - * that will be specified in a vendor specific information element - */ -enum ieee80211_mesh_sync_method { - IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1, - IEEE80211_SYNC_METHOD_VENDOR = 255, -}; - -/** - * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier - * - * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol - * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will - * be specified in a vendor specific information element - */ -enum ieee80211_mesh_path_protocol { - IEEE80211_PATH_PROTOCOL_HWMP = 1, - IEEE80211_PATH_PROTOCOL_VENDOR = 255, -}; - -/** - * enum ieee80211_mesh_path_metric - mesh path selection metric identifier - * - * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric - * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be - * specified in a vendor specific information element - */ -enum ieee80211_mesh_path_metric { - IEEE80211_PATH_METRIC_AIRTIME = 1, - IEEE80211_PATH_METRIC_VENDOR = 255, -}; - -/** - * enum ieee80211_root_mode_identifier - root mesh STA mode identifier - * - * These attribute are used by dot11MeshHWMPRootMode to set root mesh STA mode - * - * @IEEE80211_ROOTMODE_NO_ROOT: the mesh STA is not a root mesh STA (default) - * @IEEE80211_ROOTMODE_ROOT: the mesh STA is a root mesh STA if greater than - * this value - * @IEEE80211_PROACTIVE_PREQ_NO_PREP: the mesh STA is a root mesh STA supports - * the proactive PREQ with proactive PREP subfield set to 0 - * @IEEE80211_PROACTIVE_PREQ_WITH_PREP: the mesh STA is a root mesh STA - * 
supports the proactive PREQ with proactive PREP subfield set to 1 - * @IEEE80211_PROACTIVE_RANN: the mesh STA is a root mesh STA supports - * the proactive RANN - */ -enum ieee80211_root_mode_identifier { - IEEE80211_ROOTMODE_NO_ROOT = 0, - IEEE80211_ROOTMODE_ROOT = 1, - IEEE80211_PROACTIVE_PREQ_NO_PREP = 2, - IEEE80211_PROACTIVE_PREQ_WITH_PREP = 3, - IEEE80211_PROACTIVE_RANN = 4, -}; - /* * IEEE 802.11-2007 7.3.2.9 Country information element * @@ -6098,4 +5889,6 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 +#include "ieee80211-mesh.h" + #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From fdc1c141f3ef4dc94e3880e973061681843f62c0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:50 +0100 Subject: wifi: ieee80211: split HT definitions out The ieee80211.h file has gotten very long, continue splitting it by putting HT definitions into a separate file. Link: https://patch.msgid.link/20251105153843.7532471178d0.Id956a5433ad8658e4e5c0272dbcbb59587206142@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-ht.h | 292 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 272 +--------------------------------------- 2 files changed, 293 insertions(+), 271 deletions(-) create mode 100644 include/linux/ieee80211-ht.h (limited to 'include') diff --git a/include/linux/ieee80211-ht.h b/include/linux/ieee80211-ht.h new file mode 100644 index 000000000000..21bbf470540f --- /dev/null +++ b/include/linux/ieee80211-ht.h @@ -0,0 +1,292 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 HT definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_HT_H +#define LINUX_IEEE80211_HT_H + +#include <linux/types.h> +#include <linux/if_ether.h> + +/* Maximal size of an A-MSDU that can be transported in a HT BA session */ +#define IEEE80211_MAX_MPDU_LEN_HT_BA 4095 + +/* Maximal size of an A-MSDU */ +#define IEEE80211_MAX_MPDU_LEN_HT_3839 3839 +#define IEEE80211_MAX_MPDU_LEN_HT_7935 7935 + +#define IEEE80211_HT_CTL_LEN 4 + +enum ieee80211_ht_chanwidth_values { + IEEE80211_HT_CHANWIDTH_20MHZ = 0, + IEEE80211_HT_CHANWIDTH_ANY = 1, +}; + +/** + * struct ieee80211_bar - Block Ack Request frame format + * @frame_control: Frame Control + * @duration: Duration + * @ra: RA + * @ta: TA + * @control: BAR Control + * @start_seq_num: Starting Sequence Number (see Figure 9-37) + * + * This structure represents the "BlockAckReq frame format" + * as described in IEEE Std 802.11-2020 section 9.3.1.7. +*/ +struct ieee80211_bar { + __le16 frame_control; + __le16 duration; + __u8 ra[ETH_ALEN]; + __u8 ta[ETH_ALEN]; + __le16 control; + __le16 start_seq_num; +} __packed; + +/* 802.11 BAR control masks */ +#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL 0x0000 +#define IEEE80211_BAR_CTRL_MULTI_TID 0x0002 +#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA 0x0004 +#define IEEE80211_BAR_CTRL_TID_INFO_MASK 0xf000 +#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT 12 + +#define IEEE80211_HT_MCS_MASK_LEN 10 + +/** + * struct ieee80211_mcs_info - Supported MCS Set field + * @rx_mask: RX mask + * @rx_highest: highest supported RX rate. If set represents + * the highest supported RX data rate in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest RX data rate supported.
+ * @tx_params: TX parameters + * @reserved: Reserved bits + * + * This structure represents the "Supported MCS Set field" as + * described in IEEE Std 802.11-2020 section 9.4.2.55.4. + */ +struct ieee80211_mcs_info { + u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN]; + __le16 rx_highest; + u8 tx_params; + u8 reserved[3]; +} __packed; + +/* 802.11n HT capability MCS set */ +#define IEEE80211_HT_MCS_RX_HIGHEST_MASK 0x3ff +#define IEEE80211_HT_MCS_TX_DEFINED 0x01 +#define IEEE80211_HT_MCS_TX_RX_DIFF 0x02 +/* value 0 == 1 stream etc */ +#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK 0x0C +#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT 2 +#define IEEE80211_HT_MCS_TX_MAX_STREAMS 4 +#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION 0x10 + +#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3))) + +/* + * 802.11n D5.0 20.3.5 / 20.6 says: + * - indices 0 to 7 and 32 are single spatial stream + * - 8 to 31 are multiple spatial streams using equal modulation + * [8..15 for two streams, 16..23 for three and 24..31 for four] + * - remainder are multiple spatial streams using unequal modulation + */ +#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33 +#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \ + (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8) + +/** + * struct ieee80211_ht_cap - HT capabilities element + * @cap_info: HT Capability Information + * @ampdu_params_info: A-MPDU Parameters + * @mcs: Supported MCS Set + * @extended_ht_cap_info: HT Extended Capabilities + * @tx_BF_cap_info: Transmit Beamforming Capabilities + * @antenna_selection_info: ASEL Capability + * + * This structure represents the payload of the "HT Capabilities + * element" as described in IEEE Std 802.11-2020 section 9.4.2.55.
+ */ +struct ieee80211_ht_cap { + __le16 cap_info; + u8 ampdu_params_info; + + /* 16 bytes MCS information */ + struct ieee80211_mcs_info mcs; + + __le16 extended_ht_cap_info; + __le32 tx_BF_cap_info; + u8 antenna_selection_info; +} __packed; + +/* 802.11n HT capabilities masks (for cap_info) */ +#define IEEE80211_HT_CAP_LDPC_CODING 0x0001 +#define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 +#define IEEE80211_HT_CAP_SM_PS 0x000C +#define IEEE80211_HT_CAP_SM_PS_SHIFT 2 +#define IEEE80211_HT_CAP_GRN_FLD 0x0010 +#define IEEE80211_HT_CAP_SGI_20 0x0020 +#define IEEE80211_HT_CAP_SGI_40 0x0040 +#define IEEE80211_HT_CAP_TX_STBC 0x0080 +#define IEEE80211_HT_CAP_RX_STBC 0x0300 +#define IEEE80211_HT_CAP_RX_STBC_SHIFT 8 +#define IEEE80211_HT_CAP_DELAY_BA 0x0400 +#define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 +#define IEEE80211_HT_CAP_DSSSCCK40 0x1000 +#define IEEE80211_HT_CAP_RESERVED 0x2000 +#define IEEE80211_HT_CAP_40MHZ_INTOLERANT 0x4000 +#define IEEE80211_HT_CAP_LSIG_TXOP_PROT 0x8000 + +/* 802.11n HT extended capabilities masks (for extended_ht_cap_info) */ +#define IEEE80211_HT_EXT_CAP_PCO 0x0001 +#define IEEE80211_HT_EXT_CAP_PCO_TIME 0x0006 +#define IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT 1 +#define IEEE80211_HT_EXT_CAP_MCS_FB 0x0300 +#define IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT 8 +#define IEEE80211_HT_EXT_CAP_HTC_SUP 0x0400 +#define IEEE80211_HT_EXT_CAP_RD_RESPONDER 0x0800 + +/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ +#define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 +#define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C +#define IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT 2 + +/* + * Maximum length of AMPDU that the STA can receive in high-throughput (HT). 
+ * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) + */ +enum ieee80211_max_ampdu_length_exp { + IEEE80211_HT_MAX_AMPDU_8K = 0, + IEEE80211_HT_MAX_AMPDU_16K = 1, + IEEE80211_HT_MAX_AMPDU_32K = 2, + IEEE80211_HT_MAX_AMPDU_64K = 3 +}; + +#define IEEE80211_HT_MAX_AMPDU_FACTOR 13 + +/* Minimum MPDU start spacing */ +enum ieee80211_min_mpdu_spacing { + IEEE80211_HT_MPDU_DENSITY_NONE = 0, /* No restriction */ + IEEE80211_HT_MPDU_DENSITY_0_25 = 1, /* 1/4 usec */ + IEEE80211_HT_MPDU_DENSITY_0_5 = 2, /* 1/2 usec */ + IEEE80211_HT_MPDU_DENSITY_1 = 3, /* 1 usec */ + IEEE80211_HT_MPDU_DENSITY_2 = 4, /* 2 usec */ + IEEE80211_HT_MPDU_DENSITY_4 = 5, /* 4 usec */ + IEEE80211_HT_MPDU_DENSITY_8 = 6, /* 8 usec */ + IEEE80211_HT_MPDU_DENSITY_16 = 7 /* 16 usec */ +}; + +/** + * struct ieee80211_ht_operation - HT operation IE + * @primary_chan: Primary Channel + * @ht_param: HT Operation Information parameters + * @operation_mode: HT Operation Information operation mode + * @stbc_param: HT Operation Information STBC params + * @basic_set: Basic HT-MCS Set + * + * This structure represents the payload of the "HT Operation + * element" as described in IEEE Std 802.11-2020 section 9.4.2.56. 
+ */ +struct ieee80211_ht_operation { + u8 primary_chan; + u8 ht_param; + __le16 operation_mode; + __le16 stbc_param; + u8 basic_set[16]; +} __packed; + +/* for ht_param */ +#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET 0x03 +#define IEEE80211_HT_PARAM_CHA_SEC_NONE 0x00 +#define IEEE80211_HT_PARAM_CHA_SEC_ABOVE 0x01 +#define IEEE80211_HT_PARAM_CHA_SEC_BELOW 0x03 +#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY 0x04 +#define IEEE80211_HT_PARAM_RIFS_MODE 0x08 + +/* for operation_mode */ +#define IEEE80211_HT_OP_MODE_PROTECTION 0x0003 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONE 0 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER 1 +#define IEEE80211_HT_OP_MODE_PROTECTION_20MHZ 2 +#define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 +#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 +#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 +#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT 5 +#define IEEE80211_HT_OP_MODE_CCFS2_MASK 0x1fe0 + +/* for stbc_param */ +#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 +#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT 0x0080 +#define IEEE80211_HT_STBC_PARAM_STBC_BEACON 0x0100 +#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT 0x0200 +#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE 0x0400 +#define IEEE80211_HT_STBC_PARAM_PCO_PHASE 0x0800 + + +/* block-ack parameters */ +#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001 +#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 +#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C +#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0 +#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 +#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 + +/* + * A-MPDU buffer sizes + * According to HT size varies from 8 to 64 frames + * HE adds the ability to have up to 256 frames. + * EHT adds the ability to have up to 1K frames. 
+ */ +#define IEEE80211_MIN_AMPDU_BUF 0x8 +#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 +#define IEEE80211_MAX_AMPDU_BUF_HE 0x100 +#define IEEE80211_MAX_AMPDU_BUF_EHT 0x400 + + +/* Spatial Multiplexing Power Save Modes (for capability) */ +#define WLAN_HT_CAP_SM_PS_STATIC 0 +#define WLAN_HT_CAP_SM_PS_DYNAMIC 1 +#define WLAN_HT_CAP_SM_PS_INVALID 2 +#define WLAN_HT_CAP_SM_PS_DISABLED 3 + +/* for SM power control field lower two bits */ +#define WLAN_HT_SMPS_CONTROL_DISABLED 0 +#define WLAN_HT_SMPS_CONTROL_STATIC 1 +#define WLAN_HT_SMPS_CONTROL_DYNAMIC 3 + +/* HT action codes */ +enum ieee80211_ht_actioncode { + WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0, + WLAN_HT_ACTION_SMPS = 1, + WLAN_HT_ACTION_PSMP = 2, + WLAN_HT_ACTION_PCO_PHASE = 3, + WLAN_HT_ACTION_CSI = 4, + WLAN_HT_ACTION_NONCOMPRESSED_BF = 5, + WLAN_HT_ACTION_COMPRESSED_BF = 6, + WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7, +}; + +/* BACK action code */ +enum ieee80211_back_actioncode { + WLAN_ACTION_ADDBA_REQ = 0, + WLAN_ACTION_ADDBA_RESP = 1, + WLAN_ACTION_DELBA = 2, +}; + +/* BACK (block-ack) parties */ +enum ieee80211_back_parties { + WLAN_BACK_RECIPIENT = 0, + WLAN_BACK_INITIATOR = 1, +}; + +#endif /* LINUX_IEEE80211_HT_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fe78b150ab45..0a9b4a8025cd 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -239,13 +239,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */ #define IEEE80211_MAX_FRAME_LEN 2352 -/* Maximal size of an A-MSDU that can be transported in a HT BA session */ -#define IEEE80211_MAX_MPDU_LEN_HT_BA 4095 - -/* Maximal size of an A-MSDU */ -#define IEEE80211_MAX_MPDU_LEN_HT_3839 3839 -#define IEEE80211_MAX_MPDU_LEN_HT_7935 7935 - #define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 #define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 #define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 @@ -302,8 +295,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, 
u16 sn2) #define IEEE80211_WMM_IE_STA_QOSINFO_SP_MASK 0x03 #define IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT 5 -#define IEEE80211_HT_CTL_LEN 4 - /* trigger type within common_info of trigger frame */ #define IEEE80211_TRIGGER_TYPE_MASK 0xf #define IEEE80211_TRIGGER_TYPE_BASIC 0x0 @@ -997,11 +988,6 @@ struct ieee80211_tim_ie { }; } __packed; -enum ieee80211_ht_chanwidth_values { - IEEE80211_HT_CHANWIDTH_20MHZ = 0, - IEEE80211_HT_CHANWIDTH_ANY = 1, -}; - /** * enum ieee80211_vht_opmode_bits - VHT operating mode field bits * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask @@ -1677,146 +1663,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/** - * struct ieee80211_bar - Block Ack Request frame format - * @frame_control: Frame Control - * @duration: Duration - * @ra: RA - * @ta: TA - * @control: BAR Control - * @start_seq_num: Starting Sequence Number (see Figure 9-37) - * - * This structure represents the "BlockAckReq frame format" - * as described in IEEE Std 802.11-2020 section 9.3.1.7. -*/ -struct ieee80211_bar { - __le16 frame_control; - __le16 duration; - __u8 ra[ETH_ALEN]; - __u8 ta[ETH_ALEN]; - __le16 control; - __le16 start_seq_num; -} __packed; - -/* 802.11 BAR control masks */ -#define IEEE80211_BAR_CTRL_ACK_POLICY_NORMAL 0x0000 -#define IEEE80211_BAR_CTRL_MULTI_TID 0x0002 -#define IEEE80211_BAR_CTRL_CBMTID_COMPRESSED_BA 0x0004 -#define IEEE80211_BAR_CTRL_TID_INFO_MASK 0xf000 -#define IEEE80211_BAR_CTRL_TID_INFO_SHIFT 12 - -#define IEEE80211_HT_MCS_MASK_LEN 10 - -/** - * struct ieee80211_mcs_info - Supported MCS Set field - * @rx_mask: RX mask - * @rx_highest: highest supported RX rate. If set represents - * the highest supported RX data rate in units of 1 Mbps. - * If this field is 0 this value should not be used to - * consider the highest RX data rate supported. 
- * @tx_params: TX parameters - * @reserved: Reserved bits - * - * This structure represents the "Supported MCS Set field" as - * described in IEEE Std 802.11-2020 section 9.4.2.55.4. - */ -struct ieee80211_mcs_info { - u8 rx_mask[IEEE80211_HT_MCS_MASK_LEN]; - __le16 rx_highest; - u8 tx_params; - u8 reserved[3]; -} __packed; - -/* 802.11n HT capability MSC set */ -#define IEEE80211_HT_MCS_RX_HIGHEST_MASK 0x3ff -#define IEEE80211_HT_MCS_TX_DEFINED 0x01 -#define IEEE80211_HT_MCS_TX_RX_DIFF 0x02 -/* value 0 == 1 stream etc */ -#define IEEE80211_HT_MCS_TX_MAX_STREAMS_MASK 0x0C -#define IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT 2 -#define IEEE80211_HT_MCS_TX_MAX_STREAMS 4 -#define IEEE80211_HT_MCS_TX_UNEQUAL_MODULATION 0x10 - -#define IEEE80211_HT_MCS_CHAINS(mcs) ((mcs) == 32 ? 1 : (1 + ((mcs) >> 3))) - -/* - * 802.11n D5.0 20.3.5 / 20.6 says: - * - indices 0 to 7 and 32 are single spatial stream - * - 8 to 31 are multiple spatial streams using equal modulation - * [8..15 for two streams, 16..23 for three and 24..31 for four] - * - remainder are multiple spatial streams using unequal modulation - */ -#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START 33 -#define IEEE80211_HT_MCS_UNEQUAL_MODULATION_START_BYTE \ - (IEEE80211_HT_MCS_UNEQUAL_MODULATION_START / 8) - -/** - * struct ieee80211_ht_cap - HT capabilities element - * @cap_info: HT Capability Information - * @ampdu_params_info: A-MPDU Parameters - * @mcs: Supported MCS Set - * @extended_ht_cap_info: HT Extended Capabilities - * @tx_BF_cap_info: Transmit Beamforming Capabilities - * @antenna_selection_info: ASEL Capability - * - * This structure represents the payload of the "HT Capabilities - * element" as described in IEEE Std 802.11-2020 section 9.4.2.55. 
- */ -struct ieee80211_ht_cap { - __le16 cap_info; - u8 ampdu_params_info; - - /* 16 bytes MCS information */ - struct ieee80211_mcs_info mcs; - - __le16 extended_ht_cap_info; - __le32 tx_BF_cap_info; - u8 antenna_selection_info; -} __packed; - -/* 802.11n HT capabilities masks (for cap_info) */ -#define IEEE80211_HT_CAP_LDPC_CODING 0x0001 -#define IEEE80211_HT_CAP_SUP_WIDTH_20_40 0x0002 -#define IEEE80211_HT_CAP_SM_PS 0x000C -#define IEEE80211_HT_CAP_SM_PS_SHIFT 2 -#define IEEE80211_HT_CAP_GRN_FLD 0x0010 -#define IEEE80211_HT_CAP_SGI_20 0x0020 -#define IEEE80211_HT_CAP_SGI_40 0x0040 -#define IEEE80211_HT_CAP_TX_STBC 0x0080 -#define IEEE80211_HT_CAP_RX_STBC 0x0300 -#define IEEE80211_HT_CAP_RX_STBC_SHIFT 8 -#define IEEE80211_HT_CAP_DELAY_BA 0x0400 -#define IEEE80211_HT_CAP_MAX_AMSDU 0x0800 -#define IEEE80211_HT_CAP_DSSSCCK40 0x1000 -#define IEEE80211_HT_CAP_RESERVED 0x2000 -#define IEEE80211_HT_CAP_40MHZ_INTOLERANT 0x4000 -#define IEEE80211_HT_CAP_LSIG_TXOP_PROT 0x8000 - -/* 802.11n HT extended capabilities masks (for extended_ht_cap_info) */ -#define IEEE80211_HT_EXT_CAP_PCO 0x0001 -#define IEEE80211_HT_EXT_CAP_PCO_TIME 0x0006 -#define IEEE80211_HT_EXT_CAP_PCO_TIME_SHIFT 1 -#define IEEE80211_HT_EXT_CAP_MCS_FB 0x0300 -#define IEEE80211_HT_EXT_CAP_MCS_FB_SHIFT 8 -#define IEEE80211_HT_EXT_CAP_HTC_SUP 0x0400 -#define IEEE80211_HT_EXT_CAP_RD_RESPONDER 0x0800 - -/* 802.11n HT capability AMPDU settings (for ampdu_params_info) */ -#define IEEE80211_HT_AMPDU_PARM_FACTOR 0x03 -#define IEEE80211_HT_AMPDU_PARM_DENSITY 0x1C -#define IEEE80211_HT_AMPDU_PARM_DENSITY_SHIFT 2 - -/* - * Maximum length of AMPDU that the STA can receive in high-throughput (HT). - * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) - */ -enum ieee80211_max_ampdu_length_exp { - IEEE80211_HT_MAX_AMPDU_8K = 0, - IEEE80211_HT_MAX_AMPDU_16K = 1, - IEEE80211_HT_MAX_AMPDU_32K = 2, - IEEE80211_HT_MAX_AMPDU_64K = 3 -}; - /* * Maximum length of AMPDU that the STA can receive in VHT. 
* Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) @@ -1832,98 +1678,6 @@ enum ieee80211_vht_max_ampdu_length_exp { IEEE80211_VHT_MAX_AMPDU_1024K = 7 }; -#define IEEE80211_HT_MAX_AMPDU_FACTOR 13 - -/* Minimum MPDU start spacing */ -enum ieee80211_min_mpdu_spacing { - IEEE80211_HT_MPDU_DENSITY_NONE = 0, /* No restriction */ - IEEE80211_HT_MPDU_DENSITY_0_25 = 1, /* 1/4 usec */ - IEEE80211_HT_MPDU_DENSITY_0_5 = 2, /* 1/2 usec */ - IEEE80211_HT_MPDU_DENSITY_1 = 3, /* 1 usec */ - IEEE80211_HT_MPDU_DENSITY_2 = 4, /* 2 usec */ - IEEE80211_HT_MPDU_DENSITY_4 = 5, /* 4 usec */ - IEEE80211_HT_MPDU_DENSITY_8 = 6, /* 8 usec */ - IEEE80211_HT_MPDU_DENSITY_16 = 7 /* 16 usec */ -}; - -/** - * struct ieee80211_ht_operation - HT operation IE - * @primary_chan: Primary Channel - * @ht_param: HT Operation Information parameters - * @operation_mode: HT Operation Information operation mode - * @stbc_param: HT Operation Information STBC params - * @basic_set: Basic HT-MCS Set - * - * This structure represents the payload of the "HT Operation - * element" as described in IEEE Std 802.11-2020 section 9.4.2.56. 
- */ -struct ieee80211_ht_operation { - u8 primary_chan; - u8 ht_param; - __le16 operation_mode; - __le16 stbc_param; - u8 basic_set[16]; -} __packed; - -/* for ht_param */ -#define IEEE80211_HT_PARAM_CHA_SEC_OFFSET 0x03 -#define IEEE80211_HT_PARAM_CHA_SEC_NONE 0x00 -#define IEEE80211_HT_PARAM_CHA_SEC_ABOVE 0x01 -#define IEEE80211_HT_PARAM_CHA_SEC_BELOW 0x03 -#define IEEE80211_HT_PARAM_CHAN_WIDTH_ANY 0x04 -#define IEEE80211_HT_PARAM_RIFS_MODE 0x08 - -/* for operation_mode */ -#define IEEE80211_HT_OP_MODE_PROTECTION 0x0003 -#define IEEE80211_HT_OP_MODE_PROTECTION_NONE 0 -#define IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER 1 -#define IEEE80211_HT_OP_MODE_PROTECTION_20MHZ 2 -#define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 -#define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 -#define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 -#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT 5 -#define IEEE80211_HT_OP_MODE_CCFS2_MASK 0x1fe0 - -/* for stbc_param */ -#define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 -#define IEEE80211_HT_STBC_PARAM_DUAL_CTS_PROT 0x0080 -#define IEEE80211_HT_STBC_PARAM_STBC_BEACON 0x0100 -#define IEEE80211_HT_STBC_PARAM_LSIG_TXOP_FULLPROT 0x0200 -#define IEEE80211_HT_STBC_PARAM_PCO_ACTIVE 0x0400 -#define IEEE80211_HT_STBC_PARAM_PCO_PHASE 0x0800 - - -/* block-ack parameters */ -#define IEEE80211_ADDBA_PARAM_AMSDU_MASK 0x0001 -#define IEEE80211_ADDBA_PARAM_POLICY_MASK 0x0002 -#define IEEE80211_ADDBA_PARAM_TID_MASK 0x003C -#define IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK 0xFFC0 -#define IEEE80211_DELBA_PARAM_TID_MASK 0xF000 -#define IEEE80211_DELBA_PARAM_INITIATOR_MASK 0x0800 - -/* - * A-MPDU buffer sizes - * According to HT size varies from 8 to 64 frames - * HE adds the ability to have up to 256 frames. - * EHT adds the ability to have up to 1K frames. 
- */ -#define IEEE80211_MIN_AMPDU_BUF 0x8 -#define IEEE80211_MAX_AMPDU_BUF_HT 0x40 -#define IEEE80211_MAX_AMPDU_BUF_HE 0x100 -#define IEEE80211_MAX_AMPDU_BUF_EHT 0x400 - - -/* Spatial Multiplexing Power Save Modes (for capability) */ -#define WLAN_HT_CAP_SM_PS_STATIC 0 -#define WLAN_HT_CAP_SM_PS_DYNAMIC 1 -#define WLAN_HT_CAP_SM_PS_INVALID 2 -#define WLAN_HT_CAP_SM_PS_DISABLED 3 - -/* for SM power control field lower two bits */ -#define WLAN_HT_SMPS_CONTROL_DISABLED 0 -#define WLAN_HT_SMPS_CONTROL_STATIC 1 -#define WLAN_HT_SMPS_CONTROL_DYNAMIC 3 - /** * struct ieee80211_vht_mcs_info - VHT MCS information * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams @@ -3807,18 +3561,6 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; -/* HT action codes */ -enum ieee80211_ht_actioncode { - WLAN_HT_ACTION_NOTIFY_CHANWIDTH = 0, - WLAN_HT_ACTION_SMPS = 1, - WLAN_HT_ACTION_PSMP = 2, - WLAN_HT_ACTION_PCO_PHASE = 3, - WLAN_HT_ACTION_CSI = 4, - WLAN_HT_ACTION_NONCOMPRESSED_BF = 5, - WLAN_HT_ACTION_COMPRESSED_BF = 6, - WLAN_HT_ACTION_ASEL_IDX_FEEDBACK = 7, -}; - /* VHT action codes */ enum ieee80211_vht_actioncode { WLAN_VHT_ACTION_COMPRESSED_BF = 0, @@ -4155,19 +3897,6 @@ struct ieee80211_bss_max_idle_period_ie { u8 idle_options; } __packed; -/* BACK action code */ -enum ieee80211_back_actioncode { - WLAN_ACTION_ADDBA_REQ = 0, - WLAN_ACTION_ADDBA_RESP = 1, - WLAN_ACTION_DELBA = 2, -}; - -/* BACK (block-ack) parties */ -enum ieee80211_back_parties { - WLAN_BACK_RECIPIENT = 0, - WLAN_BACK_INITIATOR = 1, -}; - /* SA Query action */ enum ieee80211_sa_query_action { WLAN_ACTION_SA_QUERY_REQUEST = 0, @@ -5889,6 +5618,7 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 +#include "ieee80211-ht.h" #include "ieee80211-mesh.h" #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 7cb14da1d7bbfa4a6417ed7f1bc07dd77bcd9c83 Mon Sep 17 00:00:00 2001 From: 
Johannes Berg Date: Wed, 5 Nov 2025 15:36:51 +0100 Subject: wifi: ieee80211: split VHT definitions out The ieee80211.h file has gotten very long, continue splitting it by putting VHT definitions into a separate file. Link: https://patch.msgid.link/20251105153843.c31cb771a250.I787a13064db7d80440101de3445be17881daf1b6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-vht.h | 236 ++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 216 +------------------------------------- 2 files changed, 237 insertions(+), 215 deletions(-) create mode 100644 include/linux/ieee80211-vht.h (limited to 'include') diff --git a/include/linux/ieee80211-vht.h b/include/linux/ieee80211-vht.h new file mode 100644 index 000000000000..898dfb561fef --- /dev/null +++ b/include/linux/ieee80211-vht.h @@ -0,0 +1,236 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 VHT definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_VHT_H +#define LINUX_IEEE80211_VHT_H + +#include +#include + +#define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 +#define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 +#define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 + +/** + * enum ieee80211_vht_opmode_bits - VHT operating mode field bits + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel width + * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag + * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask + * (the NSS value is the value of this field + 1) + * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift + * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU + * using a beamforming steering matrix + */ +enum ieee80211_vht_opmode_bits { + IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, + IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, + IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, + IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, + IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, + IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, +}; + +/* + * Maximum length of AMPDU that the STA can receive in VHT. 
+ * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) + */ +enum ieee80211_vht_max_ampdu_length_exp { + IEEE80211_VHT_MAX_AMPDU_8K = 0, + IEEE80211_VHT_MAX_AMPDU_16K = 1, + IEEE80211_VHT_MAX_AMPDU_32K = 2, + IEEE80211_VHT_MAX_AMPDU_64K = 3, + IEEE80211_VHT_MAX_AMPDU_128K = 4, + IEEE80211_VHT_MAX_AMPDU_256K = 5, + IEEE80211_VHT_MAX_AMPDU_512K = 6, + IEEE80211_VHT_MAX_AMPDU_1024K = 7 +}; + +/** + * struct ieee80211_vht_mcs_info - VHT MCS information + * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams + * @rx_highest: Indicates highest long GI VHT PPDU data rate + * STA can receive. Rate expressed in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest RX data rate supported. + * The top 3 bits of this field indicate the Maximum NSTS,total + * (a beamformee capability.) + * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams + * @tx_highest: Indicates highest long GI VHT PPDU data rate + * STA can transmit. Rate expressed in units of 1 Mbps. + * If this field is 0 this value should not be used to + * consider the highest TX data rate supported. + * The top 2 bits of this field are reserved, the + * 3rd bit from the top indicates VHT Extended NSS BW + * Capability. 
+ */ +struct ieee80211_vht_mcs_info { + __le16 rx_mcs_map; + __le16 rx_highest; + __le16 tx_mcs_map; + __le16 tx_highest; +} __packed; + +/* for rx_highest */ +#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT 13 +#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK (7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT) + +/* for tx_highest */ +#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE (1 << 13) + +/** + * enum ieee80211_vht_mcs_support - VHT MCS support definitions + * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported + * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the @rx_mcs_map + * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. 
+ */ +enum ieee80211_vht_mcs_support { + IEEE80211_VHT_MCS_SUPPORT_0_7 = 0, + IEEE80211_VHT_MCS_SUPPORT_0_8 = 1, + IEEE80211_VHT_MCS_SUPPORT_0_9 = 2, + IEEE80211_VHT_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_vht_cap - VHT capabilities + * + * This structure is the "VHT capabilities element" as + * described in 802.11ac D3.0 8.4.2.160 + * @vht_cap_info: VHT capability info + * @supp_mcs: VHT MCS supported rates + */ +struct ieee80211_vht_cap { + __le32 vht_cap_info; + struct ieee80211_vht_mcs_info supp_mcs; +} __packed; + +/** + * enum ieee80211_vht_chanwidth - VHT channel width + * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to + * determine the channel width (20 or 40 MHz) + * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth + * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth + * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth + */ +enum ieee80211_vht_chanwidth { + IEEE80211_VHT_CHANWIDTH_USE_HT = 0, + IEEE80211_VHT_CHANWIDTH_80MHZ = 1, + IEEE80211_VHT_CHANWIDTH_160MHZ = 2, + IEEE80211_VHT_CHANWIDTH_80P80MHZ = 3, +}; + +/** + * struct ieee80211_vht_operation - VHT operation IE + * + * This structure is the "VHT operation element" as + * described in 802.11ac D3.0 8.4.2.161 + * @chan_width: Operating channel width + * @center_freq_seg0_idx: center freq segment 0 index + * @center_freq_seg1_idx: center freq segment 1 index + * @basic_mcs_set: VHT Basic MCS rate set + */ +struct ieee80211_vht_operation { + u8 chan_width; + u8 center_freq_seg0_idx; + u8 center_freq_seg1_idx; + __le16 basic_mcs_set; +} __packed; + +/* 802.11ac VHT Capabilities */ +#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 +#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 +#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 0x00000002 +#define IEEE80211_VHT_CAP_MAX_MPDU_MASK 0x00000003 +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ 0x00000004 +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ 0x00000008 +#define 
IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK 0x0000000C +#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT 2 +#define IEEE80211_VHT_CAP_RXLDPC 0x00000010 +#define IEEE80211_VHT_CAP_SHORT_GI_80 0x00000020 +#define IEEE80211_VHT_CAP_SHORT_GI_160 0x00000040 +#define IEEE80211_VHT_CAP_TXSTBC 0x00000080 +#define IEEE80211_VHT_CAP_RXSTBC_1 0x00000100 +#define IEEE80211_VHT_CAP_RXSTBC_2 0x00000200 +#define IEEE80211_VHT_CAP_RXSTBC_3 0x00000300 +#define IEEE80211_VHT_CAP_RXSTBC_4 0x00000400 +#define IEEE80211_VHT_CAP_RXSTBC_MASK 0x00000700 +#define IEEE80211_VHT_CAP_RXSTBC_SHIFT 8 +#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE 0x00000800 +#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE 0x00001000 +#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT 13 +#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK \ + (7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT) +#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT 16 +#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK \ + (7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT) +#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE 0x00080000 +#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE 0x00100000 +#define IEEE80211_VHT_CAP_VHT_TXOP_PS 0x00200000 +#define IEEE80211_VHT_CAP_HTC_VHT 0x00400000 +#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT 23 +#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK \ + (7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT) +#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB 0x08000000 +#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB 0x0c000000 +#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 +#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 +#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT 30 +#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK 0xc0000000 + +/** + * ieee80211_get_vht_max_nss - return max NSS for a given bandwidth/MCS + * @cap: VHT capabilities of the peer + * @bw: bandwidth to use + * @mcs: MCS index to use + * @ext_nss_bw_capable: indicates whether or not the local 
transmitter + * (rate scaling algorithm) can deal with the new logic + * (dot11VHTExtendedNSSBWCapable) + * @max_vht_nss: current maximum NSS as advertised by the STA in + * operating mode notification, can be 0 in which case the + * capability data will be used to derive this (from MCS support) + * Return: The maximum NSS that can be used for the given bandwidth/MCS + * combination + * + * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can + * vary for a given BW/MCS. This function parses the data. + * + * Note: This function is exported by cfg80211. + */ +int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, + enum ieee80211_vht_chanwidth bw, + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss); + +/* VHT action codes */ +enum ieee80211_vht_actioncode { + WLAN_VHT_ACTION_COMPRESSED_BF = 0, + WLAN_VHT_ACTION_GROUPID_MGMT = 1, + WLAN_VHT_ACTION_OPMODE_NOTIF = 2, +}; + +#endif /* LINUX_IEEE80211_VHT_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0a9b4a8025cd..0b247b28c661 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -239,10 +239,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) /* 30 byte 4 addr hdr, 2 byte QoS, 2304 byte MSDU, 12 byte crypt, 4 byte FCS */ #define IEEE80211_MAX_FRAME_LEN 2352 -#define IEEE80211_MAX_MPDU_LEN_VHT_3895 3895 -#define IEEE80211_MAX_MPDU_LEN_VHT_7991 7991 -#define IEEE80211_MAX_MPDU_LEN_VHT_11454 11454 - #define IEEE80211_MAX_SSID_LEN 32 #define IEEE80211_FIRST_TSPEC_TSID 8 @@ -988,32 +984,6 @@ struct ieee80211_tim_ie { }; } __packed; -/** - * enum ieee80211_vht_opmode_bits - VHT operating mode field bits - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK: channel width mask - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: 20 MHz channel width - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: 40 MHz channel width - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: 80 MHz channel width - * @IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: 160 MHz or 80+80 MHz channel 
width - * @IEEE80211_OPMODE_NOTIF_BW_160_80P80: 160 / 80+80 MHz indicator flag - * @IEEE80211_OPMODE_NOTIF_RX_NSS_MASK: number of spatial streams mask - * (the NSS value is the value of this field + 1) - * @IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT: number of spatial streams shift - * @IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF: indicates streams in SU-MIMO PPDU - * using a beamforming steering matrix - */ -enum ieee80211_vht_opmode_bits { - IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK = 0x03, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ = 0, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ = 1, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ = 2, - IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ = 3, - IEEE80211_OPMODE_NOTIF_BW_160_80P80 = 0x04, - IEEE80211_OPMODE_NOTIF_RX_NSS_MASK = 0x70, - IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT = 4, - IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF = 0x80, -}; - /** * enum ieee80211_s1g_chanwidth - S1G channel widths * These are defined in IEEE802.11-2016ah Table 10-20 @@ -1663,119 +1633,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/* - * Maximum length of AMPDU that the STA can receive in VHT. - * Length = 2 ^ (13 + max_ampdu_length_exp) - 1 (octets) - */ -enum ieee80211_vht_max_ampdu_length_exp { - IEEE80211_VHT_MAX_AMPDU_8K = 0, - IEEE80211_VHT_MAX_AMPDU_16K = 1, - IEEE80211_VHT_MAX_AMPDU_32K = 2, - IEEE80211_VHT_MAX_AMPDU_64K = 3, - IEEE80211_VHT_MAX_AMPDU_128K = 4, - IEEE80211_VHT_MAX_AMPDU_256K = 5, - IEEE80211_VHT_MAX_AMPDU_512K = 6, - IEEE80211_VHT_MAX_AMPDU_1024K = 7 -}; - -/** - * struct ieee80211_vht_mcs_info - VHT MCS information - * @rx_mcs_map: RX MCS map 2 bits for each stream, total 8 streams - * @rx_highest: Indicates highest long GI VHT PPDU data rate - * STA can receive. Rate expressed in units of 1 Mbps. - * If this field is 0 this value should not be used to - * consider the highest RX data rate supported. 
- * The top 3 bits of this field indicate the Maximum NSTS,total - * (a beamformee capability.) - * @tx_mcs_map: TX MCS map 2 bits for each stream, total 8 streams - * @tx_highest: Indicates highest long GI VHT PPDU data rate - * STA can transmit. Rate expressed in units of 1 Mbps. - * If this field is 0 this value should not be used to - * consider the highest TX data rate supported. - * The top 2 bits of this field are reserved, the - * 3rd bit from the top indiciates VHT Extended NSS BW - * Capability. - */ -struct ieee80211_vht_mcs_info { - __le16 rx_mcs_map; - __le16 rx_highest; - __le16 tx_mcs_map; - __le16 tx_highest; -} __packed; - -/* for rx_highest */ -#define IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT 13 -#define IEEE80211_VHT_MAX_NSTS_TOTAL_MASK (7 << IEEE80211_VHT_MAX_NSTS_TOTAL_SHIFT) - -/* for tx_highest */ -#define IEEE80211_VHT_EXT_NSS_BW_CAPABLE (1 << 13) - -/** - * enum ieee80211_vht_mcs_support - VHT MCS support definitions - * @IEEE80211_VHT_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the - * number of streams - * @IEEE80211_VHT_MCS_SUPPORT_0_8: MCSes 0-8 are supported - * @IEEE80211_VHT_MCS_SUPPORT_0_9: MCSes 0-9 are supported - * @IEEE80211_VHT_MCS_NOT_SUPPORTED: This number of streams isn't supported - * - * These definitions are used in each 2-bit subfield of the @rx_mcs_map - * and @tx_mcs_map fields of &struct ieee80211_vht_mcs_info, which are - * both split into 8 subfields by number of streams. These values indicate - * which MCSes are supported for the number of streams the value appears - * for. 
- */ -enum ieee80211_vht_mcs_support { - IEEE80211_VHT_MCS_SUPPORT_0_7 = 0, - IEEE80211_VHT_MCS_SUPPORT_0_8 = 1, - IEEE80211_VHT_MCS_SUPPORT_0_9 = 2, - IEEE80211_VHT_MCS_NOT_SUPPORTED = 3, -}; - -/** - * struct ieee80211_vht_cap - VHT capabilities - * - * This structure is the "VHT capabilities element" as - * described in 802.11ac D3.0 8.4.2.160 - * @vht_cap_info: VHT capability info - * @supp_mcs: VHT MCS supported rates - */ -struct ieee80211_vht_cap { - __le32 vht_cap_info; - struct ieee80211_vht_mcs_info supp_mcs; -} __packed; - -/** - * enum ieee80211_vht_chanwidth - VHT channel width - * @IEEE80211_VHT_CHANWIDTH_USE_HT: use the HT operation IE to - * determine the channel width (20 or 40 MHz) - * @IEEE80211_VHT_CHANWIDTH_80MHZ: 80 MHz bandwidth - * @IEEE80211_VHT_CHANWIDTH_160MHZ: 160 MHz bandwidth - * @IEEE80211_VHT_CHANWIDTH_80P80MHZ: 80+80 MHz bandwidth - */ -enum ieee80211_vht_chanwidth { - IEEE80211_VHT_CHANWIDTH_USE_HT = 0, - IEEE80211_VHT_CHANWIDTH_80MHZ = 1, - IEEE80211_VHT_CHANWIDTH_160MHZ = 2, - IEEE80211_VHT_CHANWIDTH_80P80MHZ = 3, -}; - -/** - * struct ieee80211_vht_operation - VHT operation IE - * - * This structure is the "VHT operation element" as - * described in 802.11ac D3.0 8.4.2.161 - * @chan_width: Operating channel width - * @center_freq_seg0_idx: center freq segment 0 index - * @center_freq_seg1_idx: center freq segment 1 index - * @basic_mcs_set: VHT Basic MCS rate set - */ -struct ieee80211_vht_operation { - u8 chan_width; - u8 center_freq_seg0_idx; - u8 center_freq_seg1_idx; - __le16 basic_mcs_set; -} __packed; - /** * struct ieee80211_he_cap_elem - HE capabilities element * @mac_cap_info: HE MAC Capabilities Information @@ -2045,71 +1902,6 @@ struct ieee80211_eht_operation_info { u8 optional[]; } __packed; -/* 802.11ac VHT Capabilities */ -#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 -#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 -#define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 0x00000002 -#define 
IEEE80211_VHT_CAP_MAX_MPDU_MASK 0x00000003 -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ 0x00000004 -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ 0x00000008 -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK 0x0000000C -#define IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_SHIFT 2 -#define IEEE80211_VHT_CAP_RXLDPC 0x00000010 -#define IEEE80211_VHT_CAP_SHORT_GI_80 0x00000020 -#define IEEE80211_VHT_CAP_SHORT_GI_160 0x00000040 -#define IEEE80211_VHT_CAP_TXSTBC 0x00000080 -#define IEEE80211_VHT_CAP_RXSTBC_1 0x00000100 -#define IEEE80211_VHT_CAP_RXSTBC_2 0x00000200 -#define IEEE80211_VHT_CAP_RXSTBC_3 0x00000300 -#define IEEE80211_VHT_CAP_RXSTBC_4 0x00000400 -#define IEEE80211_VHT_CAP_RXSTBC_MASK 0x00000700 -#define IEEE80211_VHT_CAP_RXSTBC_SHIFT 8 -#define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE 0x00000800 -#define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE 0x00001000 -#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT 13 -#define IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK \ - (7 << IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT) -#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT 16 -#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK \ - (7 << IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT) -#define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE 0x00080000 -#define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE 0x00100000 -#define IEEE80211_VHT_CAP_VHT_TXOP_PS 0x00200000 -#define IEEE80211_VHT_CAP_HTC_VHT 0x00400000 -#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT 23 -#define IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK \ - (7 << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT) -#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB 0x08000000 -#define IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB 0x0c000000 -#define IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN 0x10000000 -#define IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN 0x20000000 -#define IEEE80211_VHT_CAP_EXT_NSS_BW_SHIFT 30 -#define IEEE80211_VHT_CAP_EXT_NSS_BW_MASK 0xc0000000 - -/** - * ieee80211_get_vht_max_nss - return max NSS 
for a given bandwidth/MCS - * @cap: VHT capabilities of the peer - * @bw: bandwidth to use - * @mcs: MCS index to use - * @ext_nss_bw_capable: indicates whether or not the local transmitter - * (rate scaling algorithm) can deal with the new logic - * (dot11VHTExtendedNSSBWCapable) - * @max_vht_nss: current maximum NSS as advertised by the STA in - * operating mode notification, can be 0 in which case the - * capability data will be used to derive this (from MCS support) - * Return: The maximum NSS that can be used for the given bandwidth/MCS - * combination - * - * Due to the VHT Extended NSS Bandwidth Support, the maximum NSS can - * vary for a given BW/MCS. This function parses the data. - * - * Note: This function is exported by cfg80211. - */ -int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, - enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable, - unsigned int max_vht_nss); - /* 802.11ax HE MAC capabilities */ #define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 #define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 @@ -3561,13 +3353,6 @@ enum ieee80211_spectrum_mgmt_actioncode { WLAN_ACTION_SPCT_CHL_SWITCH = 4, }; -/* VHT action codes */ -enum ieee80211_vht_actioncode { - WLAN_VHT_ACTION_COMPRESSED_BF = 0, - WLAN_VHT_ACTION_GROUPID_MGMT = 1, - WLAN_VHT_ACTION_OPMODE_NOTIF = 2, -}; - /* Self Protected Action codes */ enum ieee80211_self_protected_actioncode { WLAN_SP_RESERVED = 0, @@ -5619,6 +5404,7 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #define NAN_DEV_CAPA_S3_SUPPORTED 0x10 #include "ieee80211-ht.h" +#include "ieee80211-vht.h" #include "ieee80211-mesh.h" #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 02a2cf302557eb59794bba0b05d6755f44928d78 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:52 +0100 Subject: wifi: ieee80211: split HE definitions out The ieee80211.h file has gotten very long, continue splitting it by putting HE definitions into a separate file. 
Link: https://patch.msgid.link/20251105153843.6998c0802104.I3dd7cfea6abbd118b999ecdedd48437d39cb0533@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-he.h | 824 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 806 +----------------------------------------- 2 files changed, 827 insertions(+), 803 deletions(-) create mode 100644 include/linux/ieee80211-he.h (limited to 'include') diff --git a/include/linux/ieee80211-he.h b/include/linux/ieee80211-he.h new file mode 100644 index 000000000000..904d50db5bb8 --- /dev/null +++ b/include/linux/ieee80211-he.h @@ -0,0 +1,824 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 HE definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_HE_H +#define LINUX_IEEE80211_HE_H + +#include +#include + +#define IEEE80211_TWT_CONTROL_NDP BIT(0) +#define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) +#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) +#define IEEE80211_TWT_CONTROL_RX_DISABLED BIT(4) +#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT BIT(5) + +#define IEEE80211_TWT_REQTYPE_REQUEST BIT(0) +#define IEEE80211_TWT_REQTYPE_SETUP_CMD GENMASK(3, 1) +#define IEEE80211_TWT_REQTYPE_TRIGGER BIT(4) +#define IEEE80211_TWT_REQTYPE_IMPLICIT BIT(5) +#define IEEE80211_TWT_REQTYPE_FLOWTYPE BIT(6) +#define IEEE80211_TWT_REQTYPE_FLOWID GENMASK(9, 7) +#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP GENMASK(14, 10) +#define IEEE80211_TWT_REQTYPE_PROTECTION BIT(15) + +enum ieee80211_twt_setup_cmd { + TWT_SETUP_CMD_REQUEST, + TWT_SETUP_CMD_SUGGEST, + TWT_SETUP_CMD_DEMAND, + TWT_SETUP_CMD_GROUPING, + TWT_SETUP_CMD_ACCEPT, + 
TWT_SETUP_CMD_ALTERNATE, + TWT_SETUP_CMD_DICTATE, + TWT_SETUP_CMD_REJECT, +}; + +struct ieee80211_twt_params { + __le16 req_type; + __le64 twt; + u8 min_twt_dur; + __le16 mantissa; + u8 channel; +} __packed; + +struct ieee80211_twt_setup { + u8 dialog_token; + u8 element_id; + u8 length; + u8 control; + u8 params[]; +} __packed; + +/** + * struct ieee80211_he_cap_elem - HE capabilities element + * @mac_cap_info: HE MAC Capabilities Information + * @phy_cap_info: HE PHY Capabilities Information + * + * This structure represents the fixed fields of the payload of the + * "HE capabilities element" as described in IEEE Std 802.11ax-2021 + * sections 9.4.2.248.2 and 9.4.2.248.3. + */ +struct ieee80211_he_cap_elem { + u8 mac_cap_info[6]; + u8 phy_cap_info[11]; +} __packed; + +#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 + +/** + * enum ieee80211_he_mcs_support - HE MCS support definitions + * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the + * number of streams + * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported + * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported + * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported + * + * These definitions are used in each 2-bit subfield of the rx_mcs_* + * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are + * both split into 8 subfields by number of streams. These values indicate + * which MCSes are supported for the number of streams the value appears + * for. 
+ */ +enum ieee80211_he_mcs_support { + IEEE80211_HE_MCS_SUPPORT_0_7 = 0, + IEEE80211_HE_MCS_SUPPORT_0_9 = 1, + IEEE80211_HE_MCS_SUPPORT_0_11 = 2, + IEEE80211_HE_MCS_NOT_SUPPORTED = 3, +}; + +/** + * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field + * + * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field + * described in P802.11ax_D2.0 section 9.4.2.237.4 + * + * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * widths less than 80MHz. + * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel + * width 160MHz. + * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for + * channel width 80p80MHz. + */ +struct ieee80211_he_mcs_nss_supp { + __le16 rx_mcs_80; + __le16 tx_mcs_80; + __le16 rx_mcs_160; + __le16 tx_mcs_160; + __le16 rx_mcs_80p80; + __le16 tx_mcs_80p80; +} __packed; + +/** + * struct ieee80211_he_operation - HE Operation element + * @he_oper_params: HE Operation Parameters + BSS Color Information + * @he_mcs_nss_set: Basic HE-MCS And NSS Set + * @optional: Optional fields VHT Operation Information, Max Co-Hosted + * BSSID Indicator, and 6 GHz Operation Information + * + * This structure represents the payload of the "HE Operation + * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249. 
+ */ +struct ieee80211_he_operation { + __le32 he_oper_params; + __le16 he_mcs_nss_set; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_he_spr - Spatial Reuse Parameter Set element + * @he_sr_control: SR Control + * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD + * Min Offset, SRG OBSS PD Max Offset, SRG BSS Color + * Bitmap, and SRG Partial BSSID Bitmap + * + * This structure represents the payload of the "Spatial Reuse + * Parameter Set element" as described in IEEE Std 802.11ax-2021 + * section 9.4.2.252. + */ +struct ieee80211_he_spr { + u8 he_sr_control; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field + * @aifsn: ACI/AIFSN + * @ecw_min_max: ECWmin/ECWmax + * @mu_edca_timer: MU EDCA Timer + * + * This structure represents the "MU AC Parameter Record" as described + * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p. + */ +struct ieee80211_he_mu_edca_param_ac_rec { + u8 aifsn; + u8 ecw_min_max; + u8 mu_edca_timer; +} __packed; + +/** + * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element + * @mu_qos_info: QoS Info + * @ac_be: MU AC_BE Parameter Record + * @ac_bk: MU AC_BK Parameter Record + * @ac_vi: MU AC_VI Parameter Record + * @ac_vo: MU AC_VO Parameter Record + * + * This structure represents the payload of the "MU EDCA Parameter Set + * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251. 
+ */ +struct ieee80211_mu_edca_param_set { + u8 mu_qos_info; + struct ieee80211_he_mu_edca_param_ac_rec ac_be; + struct ieee80211_he_mu_edca_param_ac_rec ac_bk; + struct ieee80211_he_mu_edca_param_ac_rec ac_vi; + struct ieee80211_he_mu_edca_param_ac_rec ac_vo; +} __packed; + +/* 802.11ax HE MAC capabilities */ +#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 +#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 +#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 +#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 +#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 + +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 +#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 +#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1 0x00 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2 0x10 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3 0x20 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4 0x30 +#define 
IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5 0x40 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6 0x50 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7 0x60 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8 0x70 +#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK 0x70 + +/* Link adaptation is split between byte HE_MAC_CAP1 and + * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE + * is set, in which case the following values apply: + * 0 = No feedback. + * 1 = reserved. + * 2 = Unsolicited feedback. + * 3 = both + */ +#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 + +#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 +#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 +#define IEEE80211_HE_MAC_CAP2_TRS 0x04 +#define IEEE80211_HE_MAC_CAP2_BSR 0x08 +#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 +#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 +#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 +#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 + +#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 +#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 + +/* The maximum length of an A-MPDU is defined by the combination of the Maximum + * A-MPDU Length Exponent field in the HT capabilities, VHT capabilities and the + * same field in the HE capabilities. 
+ */ +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0 0x00 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1 0x08 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2 0x10 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3 0x18 +#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK 0x18 +#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG 0x20 +#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 +#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 + +#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 +#define IEEE80211_HE_MAC_CAP4_QTP 0x02 +#define IEEE80211_HE_MAC_CAP4_BQR 0x04 +#define IEEE80211_HE_MAC_CAP4_PSR_RESP 0x08 +#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 +#define IEEE80211_HE_MAC_CAP4_OPS 0x20 +#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU 0x40 +/* Multi TID agg TX is split between byte #4 and #5 + * The value is a combination of B39,B40,B41 + */ +#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39 0x80 + +#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 0x01 +#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41 0x02 +#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION 0x04 +#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU 0x08 +#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX 0x10 +#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS 0x20 +#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING 0x40 +#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX 0x80 + +#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 +#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 +#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR 13 + +/* 802.11ax HE PHY capabilities */ +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e + +#define 
IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 +#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe + +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 +#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f +#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 +#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 +#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 +/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */ +#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS 0x80 + +#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS 0x01 +#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 +#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 +#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 +#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 + +/* Note that the meaning of UL MU below is different between an AP and a non-AP + * sta, where in the AP case it indicates support for Rx and in the non-AP sta + * case it indicates support for Tx. 
+ */ +#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 +#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 + +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 +#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 +#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU 0x40 +#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 + +#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 +#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 + +/* Minimal allowed value of Max STS under 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c + +/* Minimal allowed value of Max STS above 80MHz is 3 */ +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 +#define 
IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 +#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 + +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 +#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 + +#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 +#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 + +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 +#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 +#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB 0x04 +#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB 0x08 +#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 +#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 +#define 
IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 + +#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR 0x01 +#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP 0x02 +#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 +#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 +#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 +#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 + +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 +#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 +#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 +#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 +#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF 0x20 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242 0x00 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484 0x40 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996 0x80 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996 0xc0 +#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK 0xc0 + +#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM 0x01 +#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK 0x02 +#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU 0x04 +#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU 0x08 +#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB 0x10 +#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB 0x20 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US 0x0 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US 0x1 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US 0x2 +#define 
IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED 0x3 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS 6 +#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK 0xc0 + +#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF 0x01 + +/* 802.11ax HE TX/RX MCS NSS Support */ +#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) +#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 +#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 + +/* TX/RX HE MCS Support field Highest MCS subfield encoding */ +enum ieee80211_he_highest_mcs_supported_subfield_enc { + HIGHEST_MCS_SUPPORTED_MCS7 = 0, + HIGHEST_MCS_SUPPORTED_MCS8, + HIGHEST_MCS_SUPPORTED_MCS9, + HIGHEST_MCS_SUPPORTED_MCS10, + HIGHEST_MCS_SUPPORTED_MCS11, +}; + +/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ +static inline u8 +ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) +{ + u8 count = 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 4; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) + count += 4; + + return count; +} + +/* 802.11ax HE PPE Thresholds */ +#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) +#define IEEE80211_PPE_THRES_NSS_POS (0) +#define IEEE80211_PPE_THRES_NSS_MASK (7) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ + (BIT(5) | BIT(6)) +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 +#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) +#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) +#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE (7) + +/* + * Calculate 802.11ax HE capabilities IE PPE field size + * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* + */ +static inline u8 +ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u8 n; + + if 
((phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) + return 0; + + n = hweight8(ppe_thres_hdr & + IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> + IEEE80211_PPE_THRES_NSS_POS)); + + /* + * Each pair is 6 bits, and we need to add the 7 "header" bits to the + * total size. + */ + n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; + n = DIV_ROUND_UP(n, 8); + + return n; +} + +static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; + u8 needed = sizeof(*he_cap_ie_elem); + + if (len < needed) + return false; + + needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); + if (len < needed) + return false; + + if (he_cap_ie_elem->phy_cap_info[6] & + IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { + if (len < needed + 1) + return false; + needed += ieee80211_he_ppe_size(data[needed], + he_cap_ie_elem->phy_cap_info); + } + + return len >= needed; +} + +/* HE Operation defines */ +#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 +#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x00003ff0 +#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 4 +#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x00004000 +#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS 0x00008000 +#define IEEE80211_HE_OPERATION_ER_SU_DISABLE 0x00010000 +#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO 0x00020000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x3f000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 +#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 +#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 + +#define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 +#define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 +#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 +#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP 3 +#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD 4 +#define 
IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP 8 + +/** + * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field + * @primary: primary channel + * @control: control flags + * @ccfs0: channel center frequency segment 0 + * @ccfs1: channel center frequency segment 1 + * @minrate: minimum rate (in 1 Mbps units) + */ +struct ieee80211_he_6ghz_oper { + u8 primary; +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH 0x3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ 0 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ 1 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 +#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 +#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 +#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO 0x78 + u8 control; + u8 ccfs0; + u8 ccfs1; + u8 minrate; +} __packed; + +/** + * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits. + * + * This enumeration defines bit flags used to represent regulatory connectivity + * field bits. + * + * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid. + * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit. + * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid. + * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit. 
+ */ +enum ieee80211_reg_conn_bits { + IEEE80211_REG_CONN_LPI_VALID = BIT(0), + IEEE80211_REG_CONN_LPI_VALUE = BIT(1), + IEEE80211_REG_CONN_SP_VALID = BIT(2), + IEEE80211_REG_CONN_SP_VALUE = BIT(3), +}; + +/* transmit power interpretation type of transmit power envelope element */ +enum ieee80211_tx_power_intrpt_type { + IEEE80211_TPE_LOCAL_EIRP, + IEEE80211_TPE_LOCAL_EIRP_PSD, + IEEE80211_TPE_REG_CLIENT_EIRP, + IEEE80211_TPE_REG_CLIENT_EIRP_PSD, +}; + +/* category type of transmit power envelope element */ +enum ieee80211_tx_power_category_6ghz { + IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0, + IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1, +}; + +/* + * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP, + * setting to 63.5 dBm means no constraint. + */ +#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT 127 + +/* + * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD, + * setting to 127 indicates no PSD limit for the 20 MHz channel. + */ +#define IEEE80211_TPE_PSD_NO_LIMIT 127 + +/** + * struct ieee80211_tx_pwr_env - Transmit Power Envelope + * @info: Transmit Power Information field + * @variable: Maximum Transmit Power field + * + * This structure represents the payload of the "Transmit Power + * Envelope element" as described in IEEE Std 802.11ax-2021 section + * 9.4.2.161 + */ +struct ieee80211_tx_pwr_env { + u8 info; + u8 variable[]; +} __packed; + +#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7 +#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38 +#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0 + +#define IEEE80211_TX_PWR_ENV_EXT_COUNT 0xF + +static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len) +{ + const struct ieee80211_tx_pwr_env *env = (const void *)data; + u8 count, interpret, category; + u8 needed = sizeof(*env); + u8 N; /* also called N in the spec */ + + if (len < needed) + return false; + + count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); + interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); 
+ category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); + + switch (category) { + case IEEE80211_TPE_CAT_6GHZ_DEFAULT: + case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE: + break; + default: + return false; + } + + switch (interpret) { + case IEEE80211_TPE_LOCAL_EIRP: + case IEEE80211_TPE_REG_CLIENT_EIRP: + if (count > 3) + return false; + + /* count == 0 encodes 1 value for 20 MHz, etc. */ + needed += count + 1; + + if (len < needed) + return false; + + /* there can be extension fields not accounted for in 'count' */ + + return true; + case IEEE80211_TPE_LOCAL_EIRP_PSD: + case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: + if (count > 4) + return false; + + N = count ? 1 << (count - 1) : 1; + needed += N; + + if (len < needed) + return false; + + if (len > needed) { + u8 K = u8_get_bits(env->variable[N], + IEEE80211_TX_PWR_ENV_EXT_COUNT); + + needed += 1 + K; + if (len < needed) + return false; + } + + return true; + } + + return false; +} + +/* + * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size + * @he_oper_ie: byte data of the He Operations IE, stating from the byte + * after the ext ID byte. It is assumed that he_oper_ie has at least + * sizeof(struct ieee80211_he_operation) bytes, the caller must have + * validated this. 
+ * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_oper_size(const u8 *he_oper_ie) +{ + const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie; + u8 oper_len = sizeof(struct ieee80211_he_operation); + u32 he_oper_params; + + /* Make sure the input is not NULL */ + if (!he_oper_ie) + return 0; + + /* Calc required length */ + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + oper_len += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) + oper_len++; + if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) + oper_len += sizeof(struct ieee80211_he_6ghz_oper); + + /* Add the first byte (extension ID) to the total length */ + oper_len++; + + return oper_len; +} + +/** + * ieee80211_he_6ghz_oper - obtain 6 GHz operation field + * @he_oper: HE operation element (must be pre-validated for size) + * but may be %NULL + * + * Return: a pointer to the 6 GHz operation field, or %NULL + */ +static inline const struct ieee80211_he_6ghz_oper * +ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) +{ + const u8 *ret; + u32 he_oper_params; + + if (!he_oper) + return NULL; + + ret = (const void *)&he_oper->optional; + + he_oper_params = le32_to_cpu(he_oper->he_oper_params); + + if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) + return NULL; + if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) + ret += 3; + if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) + ret++; + + return (const void *)ret; +} + +/* HE Spatial Reuse defines */ +#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) +#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) +#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) +#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) +#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) + +/* + * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse 
IE size + * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte + * after the ext ID byte. It is assumed that he_spr_ie has at least + * sizeof(struct ieee80211_he_spr) bytes, the caller must have validated + * this + * @return the actual size of the IE data (not including header), or 0 on error + */ +static inline u8 +ieee80211_he_spr_size(const u8 *he_spr_ie) +{ + const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie; + u8 spr_len = sizeof(struct ieee80211_he_spr); + u8 he_spr_params; + + /* Make sure the input is not NULL */ + if (!he_spr_ie) + return 0; + + /* Calc required length */ + he_spr_params = he_spr->he_sr_control; + if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) + spr_len++; + if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) + spr_len += 18; + + /* Add the first byte (extension ID) to the total length */ + spr_len++; + + return spr_len; +} + +struct ieee80211_he_6ghz_capa { + /* uses IEEE80211_HE_6GHZ_CAP_* below */ + __le16 capa; +} __packed; + +/* HE 6 GHz band capabilities */ +/* uses enum ieee80211_min_mpdu_spacing values */ +#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START 0x0007 +/* uses enum ieee80211_vht_max_ampdu_length_exp values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP 0x0038 +/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */ +#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN 0x00c0 +/* WLAN_HT_CAP_SM_PS_* values */ +#define IEEE80211_HE_6GHZ_CAP_SM_PS 0x0600 +#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER 0x0800 +#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS 0x1000 +#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS 0x2000 + +#endif /* LINUX_IEEE80211_HE_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0b247b28c661..a3dbbcee00ee 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1141,48 +1141,6 @@ ieee80211_s1g_optional_len(__le16 fc) return len; } -#define IEEE80211_TWT_CONTROL_NDP BIT(0) -#define IEEE80211_TWT_CONTROL_RESP_MODE BIT(1) 
-#define IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST BIT(3) -#define IEEE80211_TWT_CONTROL_RX_DISABLED BIT(4) -#define IEEE80211_TWT_CONTROL_WAKE_DUR_UNIT BIT(5) - -#define IEEE80211_TWT_REQTYPE_REQUEST BIT(0) -#define IEEE80211_TWT_REQTYPE_SETUP_CMD GENMASK(3, 1) -#define IEEE80211_TWT_REQTYPE_TRIGGER BIT(4) -#define IEEE80211_TWT_REQTYPE_IMPLICIT BIT(5) -#define IEEE80211_TWT_REQTYPE_FLOWTYPE BIT(6) -#define IEEE80211_TWT_REQTYPE_FLOWID GENMASK(9, 7) -#define IEEE80211_TWT_REQTYPE_WAKE_INT_EXP GENMASK(14, 10) -#define IEEE80211_TWT_REQTYPE_PROTECTION BIT(15) - -enum ieee80211_twt_setup_cmd { - TWT_SETUP_CMD_REQUEST, - TWT_SETUP_CMD_SUGGEST, - TWT_SETUP_CMD_DEMAND, - TWT_SETUP_CMD_GROUPING, - TWT_SETUP_CMD_ACCEPT, - TWT_SETUP_CMD_ALTERNATE, - TWT_SETUP_CMD_DICTATE, - TWT_SETUP_CMD_REJECT, -}; - -struct ieee80211_twt_params { - __le16 req_type; - __le64 twt; - u8 min_twt_dur; - __le16 mantissa; - u8 channel; -} __packed; - -struct ieee80211_twt_setup { - u8 dialog_token; - u8 element_id; - u8 length; - u8 control; - u8 params[]; -} __packed; - #define IEEE80211_TTLM_MAX_CNT 2 #define IEEE80211_TTLM_CONTROL_DIRECTION 0x03 #define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP 0x04 @@ -1633,137 +1591,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/** - * struct ieee80211_he_cap_elem - HE capabilities element - * @mac_cap_info: HE MAC Capabilities Information - * @phy_cap_info: HE PHY Capabilities Information - * - * This structure represents the fixed fields of the payload of the - * "HE capabilities element" as described in IEEE Std 802.11ax-2021 - * sections 9.4.2.248.2 and 9.4.2.248.3. 
- */ -struct ieee80211_he_cap_elem { - u8 mac_cap_info[6]; - u8 phy_cap_info[11]; -} __packed; - -#define IEEE80211_TX_RX_MCS_NSS_DESC_MAX_LEN 5 - -/** - * enum ieee80211_he_mcs_support - HE MCS support definitions - * @IEEE80211_HE_MCS_SUPPORT_0_7: MCSes 0-7 are supported for the - * number of streams - * @IEEE80211_HE_MCS_SUPPORT_0_9: MCSes 0-9 are supported - * @IEEE80211_HE_MCS_SUPPORT_0_11: MCSes 0-11 are supported - * @IEEE80211_HE_MCS_NOT_SUPPORTED: This number of streams isn't supported - * - * These definitions are used in each 2-bit subfield of the rx_mcs_* - * and tx_mcs_* fields of &struct ieee80211_he_mcs_nss_supp, which are - * both split into 8 subfields by number of streams. These values indicate - * which MCSes are supported for the number of streams the value appears - * for. - */ -enum ieee80211_he_mcs_support { - IEEE80211_HE_MCS_SUPPORT_0_7 = 0, - IEEE80211_HE_MCS_SUPPORT_0_9 = 1, - IEEE80211_HE_MCS_SUPPORT_0_11 = 2, - IEEE80211_HE_MCS_NOT_SUPPORTED = 3, -}; - -/** - * struct ieee80211_he_mcs_nss_supp - HE Tx/Rx HE MCS NSS Support Field - * - * This structure holds the data required for the Tx/Rx HE MCS NSS Support Field - * described in P802.11ax_D2.0 section 9.4.2.237.4 - * - * @rx_mcs_80: Rx MCS map 2 bits for each stream, total 8 streams, for channel - * widths less than 80MHz. - * @tx_mcs_80: Tx MCS map 2 bits for each stream, total 8 streams, for channel - * widths less than 80MHz. - * @rx_mcs_160: Rx MCS map 2 bits for each stream, total 8 streams, for channel - * width 160MHz. - * @tx_mcs_160: Tx MCS map 2 bits for each stream, total 8 streams, for channel - * width 160MHz. - * @rx_mcs_80p80: Rx MCS map 2 bits for each stream, total 8 streams, for - * channel width 80p80MHz. - * @tx_mcs_80p80: Tx MCS map 2 bits for each stream, total 8 streams, for - * channel width 80p80MHz. 
- */ -struct ieee80211_he_mcs_nss_supp { - __le16 rx_mcs_80; - __le16 tx_mcs_80; - __le16 rx_mcs_160; - __le16 tx_mcs_160; - __le16 rx_mcs_80p80; - __le16 tx_mcs_80p80; -} __packed; - -/** - * struct ieee80211_he_operation - HE Operation element - * @he_oper_params: HE Operation Parameters + BSS Color Information - * @he_mcs_nss_set: Basic HE-MCS And NSS Set - * @optional: Optional fields VHT Operation Information, Max Co-Hosted - * BSSID Indicator, and 6 GHz Operation Information - * - * This structure represents the payload of the "HE Operation - * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.249. - */ -struct ieee80211_he_operation { - __le32 he_oper_params; - __le16 he_mcs_nss_set; - u8 optional[]; -} __packed; - -/** - * struct ieee80211_he_spr - Spatial Reuse Parameter Set element - * @he_sr_control: SR Control - * @optional: Optional fields Non-SRG OBSS PD Max Offset, SRG OBSS PD - * Min Offset, SRG OBSS PD Max Offset, SRG BSS Color - * Bitmap, and SRG Partial BSSID Bitmap - * - * This structure represents the payload of the "Spatial Reuse - * Parameter Set element" as described in IEEE Std 802.11ax-2021 - * section 9.4.2.252. - */ -struct ieee80211_he_spr { - u8 he_sr_control; - u8 optional[]; -} __packed; - -/** - * struct ieee80211_he_mu_edca_param_ac_rec - MU AC Parameter Record field - * @aifsn: ACI/AIFSN - * @ecw_min_max: ECWmin/ECWmax - * @mu_edca_timer: MU EDCA Timer - * - * This structure represents the "MU AC Parameter Record" as described - * in IEEE Std 802.11ax-2021 section 9.4.2.251, Figure 9-788p. 
- */ -struct ieee80211_he_mu_edca_param_ac_rec { - u8 aifsn; - u8 ecw_min_max; - u8 mu_edca_timer; -} __packed; - -/** - * struct ieee80211_mu_edca_param_set - MU EDCA Parameter Set element - * @mu_qos_info: QoS Info - * @ac_be: MU AC_BE Parameter Record - * @ac_bk: MU AC_BK Parameter Record - * @ac_vi: MU AC_VI Parameter Record - * @ac_vo: MU AC_VO Parameter Record - * - * This structure represents the payload of the "MU EDCA Parameter Set - * element" as described in IEEE Std 802.11ax-2021 section 9.4.2.251. - */ -struct ieee80211_mu_edca_param_set { - u8 mu_qos_info; - struct ieee80211_he_mu_edca_param_ac_rec ac_be; - struct ieee80211_he_mu_edca_param_ac_rec ac_bk; - struct ieee80211_he_mu_edca_param_ac_rec ac_vi; - struct ieee80211_he_mu_edca_param_ac_rec ac_vo; -} __packed; - #define IEEE80211_EHT_MCS_NSS_RX 0x0f #define IEEE80211_EHT_MCS_NSS_TX 0xf0 @@ -1902,618 +1729,6 @@ struct ieee80211_eht_operation_info { u8 optional[]; } __packed; -/* 802.11ax HE MAC capabilities */ -#define IEEE80211_HE_MAC_CAP0_HTC_HE 0x01 -#define IEEE80211_HE_MAC_CAP0_TWT_REQ 0x02 -#define IEEE80211_HE_MAC_CAP0_TWT_RES 0x04 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_NOT_SUPP 0x00 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_1 0x08 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_2 0x10 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_LEVEL_3 0x18 -#define IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK 0x18 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_1 0x00 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_2 0x20 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_4 0x40 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_8 0x60 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_16 0x80 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_32 0xa0 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_64 0xc0 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_UNLIMITED 0xe0 -#define IEEE80211_HE_MAC_CAP0_MAX_NUM_FRAG_MSDU_MASK 0xe0 - -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_UNLIMITED 0x00 -#define 
IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_128 0x01 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_256 0x02 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_512 0x03 -#define IEEE80211_HE_MAC_CAP1_MIN_FRAG_SIZE_MASK 0x03 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_0US 0x00 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_8US 0x04 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US 0x08 -#define IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_MASK 0x0c -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_1 0x00 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_2 0x10 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_3 0x20 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_4 0x30 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_5 0x40 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_6 0x50 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_7 0x60 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8 0x70 -#define IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_MASK 0x70 - -/* Link adaptation is split between byte HE_MAC_CAP1 and - * HE_MAC_CAP2. It should be set only if IEEE80211_HE_MAC_CAP0_HTC_HE - * in which case the following values apply: - * 0 = No feedback. - * 1 = reserved. - * 2 = Unsolicited feedback. - * 3 = both - */ -#define IEEE80211_HE_MAC_CAP1_LINK_ADAPTATION 0x80 - -#define IEEE80211_HE_MAC_CAP2_LINK_ADAPTATION 0x01 -#define IEEE80211_HE_MAC_CAP2_ALL_ACK 0x02 -#define IEEE80211_HE_MAC_CAP2_TRS 0x04 -#define IEEE80211_HE_MAC_CAP2_BSR 0x08 -#define IEEE80211_HE_MAC_CAP2_BCAST_TWT 0x10 -#define IEEE80211_HE_MAC_CAP2_32BIT_BA_BITMAP 0x20 -#define IEEE80211_HE_MAC_CAP2_MU_CASCADING 0x40 -#define IEEE80211_HE_MAC_CAP2_ACK_EN 0x80 - -#define IEEE80211_HE_MAC_CAP3_OMI_CONTROL 0x02 -#define IEEE80211_HE_MAC_CAP3_OFDMA_RA 0x04 - -/* The maximum length of an A-MDPU is defined by the combination of the Maximum - * A-MDPU Length Exponent field in the HT capabilities, VHT capabilities and the - * same field in the HE capabilities. 
- */ -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_0 0x00 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_1 0x08 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_2 0x10 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3 0x18 -#define IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_MASK 0x18 -#define IEEE80211_HE_MAC_CAP3_AMSDU_FRAG 0x20 -#define IEEE80211_HE_MAC_CAP3_FLEX_TWT_SCHED 0x40 -#define IEEE80211_HE_MAC_CAP3_RX_CTRL_FRAME_TO_MULTIBSS 0x80 - -#define IEEE80211_HE_MAC_CAP4_BSRP_BQRP_A_MPDU_AGG 0x01 -#define IEEE80211_HE_MAC_CAP4_QTP 0x02 -#define IEEE80211_HE_MAC_CAP4_BQR 0x04 -#define IEEE80211_HE_MAC_CAP4_PSR_RESP 0x08 -#define IEEE80211_HE_MAC_CAP4_NDP_FB_REP 0x10 -#define IEEE80211_HE_MAC_CAP4_OPS 0x20 -#define IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU 0x40 -/* Multi TID agg TX is split between byte #4 and #5 - * The value is a combination of B39,B40,B41 - */ -#define IEEE80211_HE_MAC_CAP4_MULTI_TID_AGG_TX_QOS_B39 0x80 - -#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B40 0x01 -#define IEEE80211_HE_MAC_CAP5_MULTI_TID_AGG_TX_QOS_B41 0x02 -#define IEEE80211_HE_MAC_CAP5_SUBCHAN_SELECTIVE_TRANSMISSION 0x04 -#define IEEE80211_HE_MAC_CAP5_UL_2x996_TONE_RU 0x08 -#define IEEE80211_HE_MAC_CAP5_OM_CTRL_UL_MU_DATA_DIS_RX 0x10 -#define IEEE80211_HE_MAC_CAP5_HE_DYNAMIC_SM_PS 0x20 -#define IEEE80211_HE_MAC_CAP5_PUNCTURED_SOUNDING 0x40 -#define IEEE80211_HE_MAC_CAP5_HT_VHT_TRIG_FRAME_RX 0x80 - -#define IEEE80211_HE_VHT_MAX_AMPDU_FACTOR 20 -#define IEEE80211_HE_HT_MAX_AMPDU_FACTOR 16 -#define IEEE80211_HE_6GHZ_MAX_AMPDU_FACTOR 13 - -/* 802.11ax HE PHY capabilities */ -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G 0x02 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G 0x04 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G 0x08 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G 0x10 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK_ALL 0x1e - -#define 
IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_2G 0x20 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_RU_MAPPING_IN_5G 0x40 -#define IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_MASK 0xfe - -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_20MHZ 0x01 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_80MHZ_ONLY_SECOND_40MHZ 0x02 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_20MHZ 0x04 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_160MHZ_ONLY_SECOND_40MHZ 0x08 -#define IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK 0x0f -#define IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A 0x10 -#define IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD 0x20 -#define IEEE80211_HE_PHY_CAP1_HE_LTF_AND_GI_FOR_HE_PPDUS_0_8US 0x40 -/* Midamble RX/TX Max NSTS is split between byte #2 and byte #3 */ -#define IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS 0x80 - -#define IEEE80211_HE_PHY_CAP2_MIDAMBLE_RX_TX_MAX_NSTS 0x01 -#define IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US 0x02 -#define IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ 0x04 -#define IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ 0x08 -#define IEEE80211_HE_PHY_CAP2_DOPPLER_TX 0x10 -#define IEEE80211_HE_PHY_CAP2_DOPPLER_RX 0x20 - -/* Note that the meaning of UL MU below is different between an AP and a non-AP - * sta, where in the AP case it indicates support for Rx and in the non-AP sta - * case it indicates support for Tx. 
- */ -#define IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO 0x40 -#define IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO 0x80 - -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_NO_DCM 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_BPSK 0x01 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_QPSK 0x02 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_16_QAM 0x03 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_TX_MASK 0x03 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_1 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_TX_NSS_2 0x04 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_NO_DCM 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_BPSK 0x08 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_QPSK 0x10 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_16_QAM 0x18 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_CONST_RX_MASK 0x18 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_1 0x00 -#define IEEE80211_HE_PHY_CAP3_DCM_MAX_RX_NSS_2 0x20 -#define IEEE80211_HE_PHY_CAP3_RX_PARTIAL_BW_SU_IN_20MHZ_MU 0x40 -#define IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER 0x80 - -#define IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE 0x01 -#define IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER 0x02 - -/* Minimal allowed value of Max STS under 80MHz is 3 */ -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_4 0x0c -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_5 0x10 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_6 0x14 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_7 0x18 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_8 0x1c -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_UNDER_80MHZ_MASK 0x1c - -/* Minimal allowed value of Max STS above 80MHz is 3 */ -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_4 0x60 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_5 0x80 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_6 0xa0 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_7 0xc0 -#define 
IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_8 0xe0 -#define IEEE80211_HE_PHY_CAP4_BEAMFORMEE_MAX_STS_ABOVE_80MHZ_MASK 0xe0 - -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_1 0x00 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_2 0x01 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_3 0x02 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_4 0x03 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_5 0x04 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_6 0x05 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_7 0x06 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_8 0x07 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_UNDER_80MHZ_MASK 0x07 - -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_1 0x00 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_2 0x08 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_3 0x10 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_4 0x18 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_5 0x20 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_6 0x28 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_7 0x30 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_8 0x38 -#define IEEE80211_HE_PHY_CAP5_BEAMFORMEE_NUM_SND_DIM_ABOVE_80MHZ_MASK 0x38 - -#define IEEE80211_HE_PHY_CAP5_NG16_SU_FEEDBACK 0x40 -#define IEEE80211_HE_PHY_CAP5_NG16_MU_FEEDBACK 0x80 - -#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_42_SU 0x01 -#define IEEE80211_HE_PHY_CAP6_CODEBOOK_SIZE_75_MU 0x02 -#define IEEE80211_HE_PHY_CAP6_TRIG_SU_BEAMFORMING_FB 0x04 -#define IEEE80211_HE_PHY_CAP6_TRIG_MU_BEAMFORMING_PARTIAL_BW_FB 0x08 -#define IEEE80211_HE_PHY_CAP6_TRIG_CQI_FB 0x10 -#define IEEE80211_HE_PHY_CAP6_PARTIAL_BW_EXT_RANGE 0x20 -#define IEEE80211_HE_PHY_CAP6_PARTIAL_BANDWIDTH_DL_MUMIMO 0x40 -#define 
IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT 0x80 - -#define IEEE80211_HE_PHY_CAP7_PSR_BASED_SR 0x01 -#define IEEE80211_HE_PHY_CAP7_POWER_BOOST_FACTOR_SUPP 0x02 -#define IEEE80211_HE_PHY_CAP7_HE_SU_MU_PPDU_4XLTF_AND_08_US_GI 0x04 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_1 0x08 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_2 0x10 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_3 0x18 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_4 0x20 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_5 0x28 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_6 0x30 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_7 0x38 -#define IEEE80211_HE_PHY_CAP7_MAX_NC_MASK 0x38 -#define IEEE80211_HE_PHY_CAP7_STBC_TX_ABOVE_80MHZ 0x40 -#define IEEE80211_HE_PHY_CAP7_STBC_RX_ABOVE_80MHZ 0x80 - -#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_PPDU_4XLTF_AND_08_US_GI 0x01 -#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_40MHZ_HE_PPDU_IN_2G 0x02 -#define IEEE80211_HE_PHY_CAP8_20MHZ_IN_160MHZ_HE_PPDU 0x04 -#define IEEE80211_HE_PHY_CAP8_80MHZ_IN_160MHZ_HE_PPDU 0x08 -#define IEEE80211_HE_PHY_CAP8_HE_ER_SU_1XLTF_AND_08_US_GI 0x10 -#define IEEE80211_HE_PHY_CAP8_MIDAMBLE_RX_TX_2X_AND_1XLTF 0x20 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_242 0x00 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_484 0x40 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_996 0x80 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_2x996 0xc0 -#define IEEE80211_HE_PHY_CAP8_DCM_MAX_RU_MASK 0xc0 - -#define IEEE80211_HE_PHY_CAP9_LONGER_THAN_16_SIGB_OFDM_SYM 0x01 -#define IEEE80211_HE_PHY_CAP9_NON_TRIGGERED_CQI_FEEDBACK 0x02 -#define IEEE80211_HE_PHY_CAP9_TX_1024_QAM_LESS_THAN_242_TONE_RU 0x04 -#define IEEE80211_HE_PHY_CAP9_RX_1024_QAM_LESS_THAN_242_TONE_RU 0x08 -#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_COMP_SIGB 0x10 -#define IEEE80211_HE_PHY_CAP9_RX_FULL_BW_SU_USING_MU_WITH_NON_COMP_SIGB 0x20 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_0US 0x0 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_8US 0x1 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_16US 0x2 -#define 
IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_RESERVED 0x3 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_POS 6 -#define IEEE80211_HE_PHY_CAP9_NOMINAL_PKT_PADDING_MASK 0xc0 - -#define IEEE80211_HE_PHY_CAP10_HE_MU_M1RU_MAX_LTF 0x01 - -/* 802.11ax HE TX/RX MCS NSS Support */ -#define IEEE80211_TX_RX_MCS_NSS_SUPP_HIGHEST_MCS_POS (3) -#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_POS (6) -#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_POS (11) -#define IEEE80211_TX_RX_MCS_NSS_SUPP_TX_BITMAP_MASK 0x07c0 -#define IEEE80211_TX_RX_MCS_NSS_SUPP_RX_BITMAP_MASK 0xf800 - -/* TX/RX HE MCS Support field Highest MCS subfield encoding */ -enum ieee80211_he_highest_mcs_supported_subfield_enc { - HIGHEST_MCS_SUPPORTED_MCS7 = 0, - HIGHEST_MCS_SUPPORTED_MCS8, - HIGHEST_MCS_SUPPORTED_MCS9, - HIGHEST_MCS_SUPPORTED_MCS10, - HIGHEST_MCS_SUPPORTED_MCS11, -}; - -/* Calculate 802.11ax HE capabilities IE Tx/Rx HE MCS NSS Support Field size */ -static inline u8 -ieee80211_he_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap) -{ - u8 count = 4; - - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) - count += 4; - - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) - count += 4; - - return count; -} - -/* 802.11ax HE PPE Thresholds */ -#define IEEE80211_PPE_THRES_NSS_SUPPORT_2NSS (1) -#define IEEE80211_PPE_THRES_NSS_POS (0) -#define IEEE80211_PPE_THRES_NSS_MASK (7) -#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_2x966_AND_966_RU \ - (BIT(5) | BIT(6)) -#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK 0x78 -#define IEEE80211_PPE_THRES_RU_INDEX_BITMASK_POS (3) -#define IEEE80211_PPE_THRES_INFO_PPET_SIZE (3) -#define IEEE80211_HE_PPE_THRES_INFO_HEADER_SIZE (7) - -/* - * Calculate 802.11ax HE capabilities IE PPE field size - * Input: Header byte of ppe_thres (first byte), and HE capa IE's PHY cap u8* - */ -static inline u8 -ieee80211_he_ppe_size(u8 ppe_thres_hdr, const u8 *phy_cap_info) -{ - u8 n; - - if 
((phy_cap_info[6] & - IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) == 0) - return 0; - - n = hweight8(ppe_thres_hdr & - IEEE80211_PPE_THRES_RU_INDEX_BITMASK_MASK); - n *= (1 + ((ppe_thres_hdr & IEEE80211_PPE_THRES_NSS_MASK) >> - IEEE80211_PPE_THRES_NSS_POS)); - - /* - * Each pair is 6 bits, and we need to add the 7 "header" bits to the - * total size. - */ - n = (n * IEEE80211_PPE_THRES_INFO_PPET_SIZE * 2) + 7; - n = DIV_ROUND_UP(n, 8); - - return n; -} - -static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) -{ - const struct ieee80211_he_cap_elem *he_cap_ie_elem = (const void *)data; - u8 needed = sizeof(*he_cap_ie_elem); - - if (len < needed) - return false; - - needed += ieee80211_he_mcs_nss_size(he_cap_ie_elem); - if (len < needed) - return false; - - if (he_cap_ie_elem->phy_cap_info[6] & - IEEE80211_HE_PHY_CAP6_PPE_THRESHOLD_PRESENT) { - if (len < needed + 1) - return false; - needed += ieee80211_he_ppe_size(data[needed], - he_cap_ie_elem->phy_cap_info); - } - - return len >= needed; -} - -/* HE Operation defines */ -#define IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK 0x00000007 -#define IEEE80211_HE_OPERATION_TWT_REQUIRED 0x00000008 -#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK 0x00003ff0 -#define IEEE80211_HE_OPERATION_RTS_THRESHOLD_OFFSET 4 -#define IEEE80211_HE_OPERATION_VHT_OPER_INFO 0x00004000 -#define IEEE80211_HE_OPERATION_CO_HOSTED_BSS 0x00008000 -#define IEEE80211_HE_OPERATION_ER_SU_DISABLE 0x00010000 -#define IEEE80211_HE_OPERATION_6GHZ_OP_INFO 0x00020000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_MASK 0x3f000000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_OFFSET 24 -#define IEEE80211_HE_OPERATION_PARTIAL_BSS_COLOR 0x40000000 -#define IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED 0x80000000 - -#define IEEE80211_6GHZ_CTRL_REG_LPI_AP 0 -#define IEEE80211_6GHZ_CTRL_REG_SP_AP 1 -#define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 -#define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP 3 -#define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD 4 -#define 
IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP 8 - -/** - * struct ieee80211_he_6ghz_oper - HE 6 GHz operation Information field - * @primary: primary channel - * @control: control flags - * @ccfs0: channel center frequency segment 0 - * @ccfs1: channel center frequency segment 1 - * @minrate: minimum rate (in 1 Mbps units) - */ -struct ieee80211_he_6ghz_oper { - u8 primary; -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH 0x3 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ 0 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ 1 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ 2 -#define IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ 3 -#define IEEE80211_HE_6GHZ_OPER_CTRL_DUP_BEACON 0x4 -#define IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO 0x78 - u8 control; - u8 ccfs0; - u8 ccfs1; - u8 minrate; -} __packed; - -/** - * enum ieee80211_reg_conn_bits - represents Regulatory connectivity field bits. - * - * This enumeration defines bit flags used to represent regulatory connectivity - * field bits. - * - * @IEEE80211_REG_CONN_LPI_VALID: Indicates whether the LPI bit is valid. - * @IEEE80211_REG_CONN_LPI_VALUE: Represents the value of the LPI bit. - * @IEEE80211_REG_CONN_SP_VALID: Indicates whether the SP bit is valid. - * @IEEE80211_REG_CONN_SP_VALUE: Represents the value of the SP bit. 
- */ -enum ieee80211_reg_conn_bits { - IEEE80211_REG_CONN_LPI_VALID = BIT(0), - IEEE80211_REG_CONN_LPI_VALUE = BIT(1), - IEEE80211_REG_CONN_SP_VALID = BIT(2), - IEEE80211_REG_CONN_SP_VALUE = BIT(3), -}; - -/* transmit power interpretation type of transmit power envelope element */ -enum ieee80211_tx_power_intrpt_type { - IEEE80211_TPE_LOCAL_EIRP, - IEEE80211_TPE_LOCAL_EIRP_PSD, - IEEE80211_TPE_REG_CLIENT_EIRP, - IEEE80211_TPE_REG_CLIENT_EIRP_PSD, -}; - -/* category type of transmit power envelope element */ -enum ieee80211_tx_power_category_6ghz { - IEEE80211_TPE_CAT_6GHZ_DEFAULT = 0, - IEEE80211_TPE_CAT_6GHZ_SUBORDINATE = 1, -}; - -/* - * For IEEE80211_TPE_LOCAL_EIRP / IEEE80211_TPE_REG_CLIENT_EIRP, - * setting to 63.5 dBm means no constraint. - */ -#define IEEE80211_TPE_MAX_TX_PWR_NO_CONSTRAINT 127 - -/* - * For IEEE80211_TPE_LOCAL_EIRP_PSD / IEEE80211_TPE_REG_CLIENT_EIRP_PSD, - * setting to 127 indicates no PSD limit for the 20 MHz channel. - */ -#define IEEE80211_TPE_PSD_NO_LIMIT 127 - -/** - * struct ieee80211_tx_pwr_env - Transmit Power Envelope - * @info: Transmit Power Information field - * @variable: Maximum Transmit Power field - * - * This structure represents the payload of the "Transmit Power - * Envelope element" as described in IEEE Std 802.11ax-2021 section - * 9.4.2.161 - */ -struct ieee80211_tx_pwr_env { - u8 info; - u8 variable[]; -} __packed; - -#define IEEE80211_TX_PWR_ENV_INFO_COUNT 0x7 -#define IEEE80211_TX_PWR_ENV_INFO_INTERPRET 0x38 -#define IEEE80211_TX_PWR_ENV_INFO_CATEGORY 0xC0 - -#define IEEE80211_TX_PWR_ENV_EXT_COUNT 0xF - -static inline bool ieee80211_valid_tpe_element(const u8 *data, u8 len) -{ - const struct ieee80211_tx_pwr_env *env = (const void *)data; - u8 count, interpret, category; - u8 needed = sizeof(*env); - u8 N; /* also called N in the spec */ - - if (len < needed) - return false; - - count = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_COUNT); - interpret = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_INTERPRET); 
- category = u8_get_bits(env->info, IEEE80211_TX_PWR_ENV_INFO_CATEGORY); - - switch (category) { - case IEEE80211_TPE_CAT_6GHZ_DEFAULT: - case IEEE80211_TPE_CAT_6GHZ_SUBORDINATE: - break; - default: - return false; - } - - switch (interpret) { - case IEEE80211_TPE_LOCAL_EIRP: - case IEEE80211_TPE_REG_CLIENT_EIRP: - if (count > 3) - return false; - - /* count == 0 encodes 1 value for 20 MHz, etc. */ - needed += count + 1; - - if (len < needed) - return false; - - /* there can be extension fields not accounted for in 'count' */ - - return true; - case IEEE80211_TPE_LOCAL_EIRP_PSD: - case IEEE80211_TPE_REG_CLIENT_EIRP_PSD: - if (count > 4) - return false; - - N = count ? 1 << (count - 1) : 1; - needed += N; - - if (len < needed) - return false; - - if (len > needed) { - u8 K = u8_get_bits(env->variable[N], - IEEE80211_TX_PWR_ENV_EXT_COUNT); - - needed += 1 + K; - if (len < needed) - return false; - } - - return true; - } - - return false; -} - -/* - * ieee80211_he_oper_size - calculate 802.11ax HE Operations IE size - * @he_oper_ie: byte data of the He Operations IE, stating from the byte - * after the ext ID byte. It is assumed that he_oper_ie has at least - * sizeof(struct ieee80211_he_operation) bytes, the caller must have - * validated this. 
- * @return the actual size of the IE data (not including header), or 0 on error - */ -static inline u8 -ieee80211_he_oper_size(const u8 *he_oper_ie) -{ - const struct ieee80211_he_operation *he_oper = (const void *)he_oper_ie; - u8 oper_len = sizeof(struct ieee80211_he_operation); - u32 he_oper_params; - - /* Make sure the input is not NULL */ - if (!he_oper_ie) - return 0; - - /* Calc required length */ - he_oper_params = le32_to_cpu(he_oper->he_oper_params); - if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) - oper_len += 3; - if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) - oper_len++; - if (he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO) - oper_len += sizeof(struct ieee80211_he_6ghz_oper); - - /* Add the first byte (extension ID) to the total length */ - oper_len++; - - return oper_len; -} - -/** - * ieee80211_he_6ghz_oper - obtain 6 GHz operation field - * @he_oper: HE operation element (must be pre-validated for size) - * but may be %NULL - * - * Return: a pointer to the 6 GHz operation field, or %NULL - */ -static inline const struct ieee80211_he_6ghz_oper * -ieee80211_he_6ghz_oper(const struct ieee80211_he_operation *he_oper) -{ - const u8 *ret; - u32 he_oper_params; - - if (!he_oper) - return NULL; - - ret = (const void *)&he_oper->optional; - - he_oper_params = le32_to_cpu(he_oper->he_oper_params); - - if (!(he_oper_params & IEEE80211_HE_OPERATION_6GHZ_OP_INFO)) - return NULL; - if (he_oper_params & IEEE80211_HE_OPERATION_VHT_OPER_INFO) - ret += 3; - if (he_oper_params & IEEE80211_HE_OPERATION_CO_HOSTED_BSS) - ret++; - - return (const void *)ret; -} - -/* HE Spatial Reuse defines */ -#define IEEE80211_HE_SPR_PSR_DISALLOWED BIT(0) -#define IEEE80211_HE_SPR_NON_SRG_OBSS_PD_SR_DISALLOWED BIT(1) -#define IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT BIT(2) -#define IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT BIT(3) -#define IEEE80211_HE_SPR_HESIGA_SR_VAL15_ALLOWED BIT(4) - -/* - * ieee80211_he_spr_size - calculate 802.11ax HE Spatial Reuse 
IE size - * @he_spr_ie: byte data of the He Spatial Reuse IE, stating from the byte - * after the ext ID byte. It is assumed that he_spr_ie has at least - * sizeof(struct ieee80211_he_spr) bytes, the caller must have validated - * this - * @return the actual size of the IE data (not including header), or 0 on error - */ -static inline u8 -ieee80211_he_spr_size(const u8 *he_spr_ie) -{ - const struct ieee80211_he_spr *he_spr = (const void *)he_spr_ie; - u8 spr_len = sizeof(struct ieee80211_he_spr); - u8 he_spr_params; - - /* Make sure the input is not NULL */ - if (!he_spr_ie) - return 0; - - /* Calc required length */ - he_spr_params = he_spr->he_sr_control; - if (he_spr_params & IEEE80211_HE_SPR_NON_SRG_OFFSET_PRESENT) - spr_len++; - if (he_spr_params & IEEE80211_HE_SPR_SRG_INFORMATION_PRESENT) - spr_len += 18; - - /* Add the first byte (extension ID) to the total length */ - spr_len++; - - return spr_len; -} - /* S1G Capabilities Information field */ #define IEEE80211_S1G_CAPABILITY_LEN 15 @@ -2697,6 +1912,9 @@ ieee80211_he_spr_size(const u8 *he_spr_ie) #define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 #define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 +/* need HE definitions for EHT functions */ +#include "ieee80211-he.h" + /* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ static inline u8 ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, @@ -3815,24 +3033,6 @@ struct ieee80211_tspec_ie { __le16 medium_time; } __packed; -struct ieee80211_he_6ghz_capa { - /* uses IEEE80211_HE_6GHZ_CAP_* below */ - __le16 capa; -} __packed; - -/* HE 6 GHz band capabilities */ -/* uses enum ieee80211_min_mpdu_spacing values */ -#define IEEE80211_HE_6GHZ_CAP_MIN_MPDU_START 0x0007 -/* uses enum ieee80211_vht_max_ampdu_length_exp values */ -#define IEEE80211_HE_6GHZ_CAP_MAX_AMPDU_LEN_EXP 0x0038 -/* uses IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_* values */ -#define IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN 0x00c0 -/* WLAN_HT_CAP_SM_PS_* values */ -#define 
IEEE80211_HE_6GHZ_CAP_SM_PS 0x0600 -#define IEEE80211_HE_6GHZ_CAP_RD_RESPONDER 0x0800 -#define IEEE80211_HE_6GHZ_CAP_RX_ANTPAT_CONS 0x1000 -#define IEEE80211_HE_6GHZ_CAP_TX_ANTPAT_CONS 0x2000 - /** * ieee80211_get_qos_ctl - get pointer to qos control bytes * @hdr: the frame -- cgit v1.2.3 From 86bc0c662322b4749cd666678d2fdce7015bcae3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:53 +0100 Subject: wifi: ieee80211: split EHT definitions out The ieee80211.h file has gotten very long, continue splitting it by putting EHT definitions into a separate file. Link: https://patch.msgid.link/20251105153843.bf77fe169140.I691267e0edd914c604a5bfd447d33be00044c9b4@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-eht.h | 1182 +++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 1164 +--------------------------------------- 2 files changed, 1184 insertions(+), 1162 deletions(-) create mode 100644 include/linux/ieee80211-eht.h (limited to 'include') diff --git a/include/linux/ieee80211-eht.h b/include/linux/ieee80211-eht.h new file mode 100644 index 000000000000..f9782e46c5e5 --- /dev/null +++ b/include/linux/ieee80211-eht.h @@ -0,0 +1,1182 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 EHT definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. 
+ * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_EHT_H +#define LINUX_IEEE80211_EHT_H + +#include +#include +/* need HE definitions for the inlines here */ +#include + +#define IEEE80211_TTLM_MAX_CNT 2 +#define IEEE80211_TTLM_CONTROL_DIRECTION 0x03 +#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP 0x04 +#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT 0x08 +#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT 0x10 +#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE 0x20 + +#define IEEE80211_TTLM_DIRECTION_DOWN 0 +#define IEEE80211_TTLM_DIRECTION_UP 1 +#define IEEE80211_TTLM_DIRECTION_BOTH 2 + +/** + * struct ieee80211_ttlm_elem - TID-To-Link Mapping element + * + * Defined in section 9.4.2.314 in P802.11be_D4 + * + * @control: the first part of control field + * @optional: the second part of control field + */ +struct ieee80211_ttlm_elem { + u8 control; + u8 optional[]; +} __packed; + +#define IEEE80211_EHT_MCS_NSS_RX 0x0f +#define IEEE80211_EHT_MCS_NSS_TX 0xf0 + +/** + * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max + * supported NSS for per MCS. + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 7. + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 8 - 9. 
+ * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. + * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + * @rx_tx_max_nss: array of the previous fields for easier loop access + */ +struct ieee80211_eht_mcs_nss_supp_20mhz_only { + union { + struct { + u8 rx_tx_mcs7_max_nss; + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; + }; + u8 rx_tx_max_nss[4]; + }; +}; + +/** + * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except + * 20MHz only stations). + * + * For each field below, bits 0 - 3 indicate the maximal number of spatial + * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams + * for Tx. + * + * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 0 - 9. + * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 10 - 11. + * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams + * supported for reception and the maximum number of spatial streams + * supported for transmission for MCS 12 - 13. + * @rx_tx_max_nss: array of the previous fields for easier loop access + */ +struct ieee80211_eht_mcs_nss_supp_bw { + union { + struct { + u8 rx_tx_mcs9_max_nss; + u8 rx_tx_mcs11_max_nss; + u8 rx_tx_mcs13_max_nss; + }; + u8 rx_tx_max_nss[3]; + }; +}; + +/** + * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data + * + * This structure is the "EHT Capabilities element" fixed fields as + * described in P802.11be_D2.0 section 9.4.2.313. 
+ * + * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* + * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* + */ +struct ieee80211_eht_cap_elem_fixed { + u8 mac_cap_info[2]; + u8 phy_cap_info[9]; +} __packed; + +/** + * struct ieee80211_eht_cap_elem - EHT capabilities element + * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed + * @optional: optional parts + */ +struct ieee80211_eht_cap_elem { + struct ieee80211_eht_cap_elem_fixed fixed; + + /* + * Followed by: + * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. + * EHT PPE Thresholds field: variable length. + */ + u8 optional[]; +} __packed; + +#define IEEE80211_EHT_OPER_INFO_PRESENT 0x01 +#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x02 +#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 +#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 +#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 +#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 + +/** + * struct ieee80211_eht_operation - eht operation element + * + * This structure is the "EHT Operation Element" fields as + * described in P802.11be_D2.0 section 9.4.2.311 + * + * @params: EHT operation element parameters. See &IEEE80211_EHT_OPER_* + * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in + * EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and + * receive. + * @optional: optional parts + */ +struct ieee80211_eht_operation { + u8 params; + struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss; + u8 optional[]; +} __packed; + +/** + * struct ieee80211_eht_operation_info - eht operation information + * + * @control: EHT operation information control. + * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz + * EHT BSS. + * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS. 
+ * @optional: optional parts + */ +struct ieee80211_eht_operation_info { + u8 control; + u8 ccfs0; + u8 ccfs1; + u8 optional[]; +} __packed; + +/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ +#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 +#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 +#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 +#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 +#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK 0xc0 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895 0 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991 1 +#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454 2 + +#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK 0x01 +#define IEEE80211_EHT_MAC_CAP1_EHT_TRS 0x02 +#define IEEE80211_EHT_MAC_CAP1_TXOP_RET 0x04 +#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS 0x08 +#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK 0x30 +#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS 0x40 + +/* EHT PHY capabilities as defined in P802.11be_D2.0 section 9.4.2.313.3 */ +#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 +#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 +#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 +#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 + +/* EHT beamformee number of spatial streams <= 80MHz is split */ +#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 + +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c +#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 + +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 +#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 + +/* EHT number of sounding dimensions for 320MHz is split */ +#define 
IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 +#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 +#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 +#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 +#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 +#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 +#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 + +#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 +#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 +#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 + +#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 +#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 +#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 +#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 +#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 + +/* Maximum number of supported EHT LTF is split */ +#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 +#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 +#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 + +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 +#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 + +#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 +#define 
IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 +#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 +#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 +#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 + +#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 +#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 + +/* + * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311 + */ +#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 +#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 + +/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ +static inline u8 +ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, + const struct ieee80211_eht_cap_elem_fixed *eht_cap, + bool from_ap) +{ + u8 count = 0; + + /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) + return 3; + + /* on 2.4 GHz, these three bits are reserved, so should be 0 */ + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) + count += 3; + + if (he_cap->phy_cap_info[0] & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) + count += 3; + + if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) + count += 3; + + if (count) + return count; + + return from_ap ? 
3 : 4; +} + +/* 802.11be EHT PPE Thresholds */ +#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 +#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf +#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 +#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 +#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 + +/* + * Calculate 802.11be EHT capabilities IE EHT field size + */ +static inline u8 +ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) +{ + u32 n; + + if (!(phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) + return 0; + + n = hweight16(ppe_thres_hdr & + IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); + n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); + + /* + * Each pair is 6 bits, and we need to add the 9 "header" bits to the + * total size. + */ + n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + + IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; + return DIV_ROUND_UP(n, 8); +} + +static inline bool +ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, + bool from_ap) +{ + const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; + u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); + + if (len < needed || !he_capa) + return false; + + needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, + (const void *)data, + from_ap); + if (len < needed) + return false; + + if (elem->phy_cap_info[5] & + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { + u16 ppe_thres_hdr; + + if (len < needed + sizeof(ppe_thres_hdr)) + return false; + + ppe_thres_hdr = get_unaligned_le16(data + needed); + needed += ieee80211_eht_ppe_size(ppe_thres_hdr, + elem->phy_cap_info); + } + + return len >= needed; +} + +static inline bool +ieee80211_eht_oper_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_eht_operation *elem = (const void *)data; + u8 needed = sizeof(*elem); + + if (len < needed) + return false; + + if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) { + needed += 3; + + if (elem->params 
& + IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) + needed += 2; + } + + return len >= needed; +} + +/* must validate ieee80211_eht_oper_size_ok() first */ +static inline u16 +ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper) +{ + const struct ieee80211_eht_operation_info *info = + (const void *)eht_oper->optional; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) + return 0; + + if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) + return 0; + + return get_unaligned_le16(info->optional); +} + +#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT BIT(1) + +struct ieee80211_bandwidth_indication { + u8 params; + struct ieee80211_eht_operation_info info; +} __packed; + +static inline bool +ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len) +{ + const struct ieee80211_bandwidth_indication *bwi = (const void *)data; + + if (len < sizeof(*bwi)) + return false; + + if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT && + len < sizeof(*bwi) + 2) + return false; + + return true; +} + +/* Protected EHT action codes */ +enum ieee80211_protected_eht_actioncode { + WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, + WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, + WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4, + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5, + WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6, + WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8, + WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11, + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12, +}; + +/* multi-link device */ +#define IEEE80211_MLD_MAX_NUM_LINKS 15 + +#define IEEE80211_ML_CONTROL_TYPE 0x0007 +#define IEEE80211_ML_CONTROL_TYPE_BASIC 0 +#define IEEE80211_ML_CONTROL_TYPE_PREQ 1 
+#define IEEE80211_ML_CONTROL_TYPE_RECONF 2 +#define IEEE80211_ML_CONTROL_TYPE_TDLS 3 +#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS 4 +#define IEEE80211_ML_CONTROL_PRESENCE_MASK 0xfff0 + +struct ieee80211_multi_link_elem { + __le16 control; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_BASIC_PRES_LINK_ID 0x0010 +#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT 0x0020 +#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY 0x0040 +#define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 +#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 +#define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 +#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP 0x0400 + +#define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff +#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 +#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS 0xf000 + +/* + * Described in P802.11be_D3.0 + * dot11MSDTimerDuration should default to 5484 (i.e. 171.375) + * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0) + * dot11MSDTXOPMAX defaults to 1 + */ +#define IEEE80211_MED_SYNC_DELAY_DEFAULT 0x10ac + +#define IEEE80211_EML_CAP_EMLSR_SUPP 0x0001 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY 0x000e +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US 1 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US 2 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US 3 +#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US 4 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY 0x0070 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US 0 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US 1 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US 2 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US 3 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US 4 +#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US 5 +#define IEEE80211_EML_CAP_EMLMR_SUPPORT 0x0080 +#define IEEE80211_EML_CAP_EMLMR_DELAY 0x0700 +#define IEEE80211_EML_CAP_EMLMR_DELAY_0US 0 +#define 
IEEE80211_EML_CAP_EMLMR_DELAY_32US 1 +#define IEEE80211_EML_CAP_EMLMR_DELAY_64US 2 +#define IEEE80211_EML_CAP_EMLMR_DELAY_128US 3 +#define IEEE80211_EML_CAP_EMLMR_DELAY_256US 4 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT 0x7800 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0 0 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US 1 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US 2 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US 3 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU 4 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU 5 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU 6 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU 7 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU 8 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU 9 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU 10 +#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU 11 + +#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS 0x000f +#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT 0x0010 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP 0x0060 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP 0 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME 1 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED 2 +#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 +#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 +#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 +#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT 0x2000 +#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT 0x4000 + +struct ieee80211_mle_basic_common_info { + u8 len; + u8 mld_mac_addr[ETH_ALEN]; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_PREQ_PRES_MLD_ID 0x0010 + +struct ieee80211_mle_preq_common_info { + u8 len; + u8 variable[]; +} __packed; + +#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 +#define IEEE80211_MLC_RECONF_PRES_EML_CAPA 0x0020 +#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP 0x0040 +#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP 0x0080 + +/* no fixed fields in RECONF */ + 
+struct ieee80211_mle_tdls_common_info { + u8 len; + u8 ap_mld_mac_addr[ETH_ALEN]; +} __packed; + +#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR 0x0010 + +/* no fixed fields in PRIO_ACCESS */ + +/** + * ieee80211_mle_common_size - check multi-link element common size + * @data: multi-link element, must already be checked for size using + * ieee80211_mle_size_ok() + * Return: the size of the multi-link element's "common" subfield + */ +static inline u8 ieee80211_mle_common_size(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + + switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + case IEEE80211_ML_CONTROL_TYPE_PREQ: + case IEEE80211_ML_CONTROL_TYPE_TDLS: + case IEEE80211_ML_CONTROL_TYPE_RECONF: + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + /* + * The length is the first octet pointed by mle->variable so no + * need to add anything + */ + break; + default: + WARN_ON(1); + return 0; + } + + return sizeof(*mle) + mle->variable[0]; +} + +/** + * ieee80211_mle_get_link_id - returns the link ID + * @data: the basic multi link element + * Return: the link ID, or -1 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). 
+ */ +static inline int ieee80211_mle_get_link_id(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID)) + return -1; + + return *common; +} + +/** + * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count + * @data: pointer to the basic multi link element + * Return: the BSS Parameter Change Count field value, or -1 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline int +ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)) + return -1; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + + return *common; +} + +/** + * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay + * @data: pointer to the multi-link element + * Return: the medium synchronization delay field value from the multi-link + * element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT) + * if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). 
+ */ +static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)) + return IEEE80211_MED_SYNC_DELAY_DEFAULT; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + + return get_unaligned_le16(common); +} + +/** + * ieee80211_mle_get_eml_cap - returns the EML capability + * @data: pointer to the multi-link element + * Return: the EML capability field value from the multi-link element, + * or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* common points now at the beginning of ieee80211_mle_basic_common_info */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + + return get_unaligned_le16(common); +} + +/** + * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations. 
+ * @data: pointer to the multi-link element + * Return: the MLD capabilities and operations field value from the multi-link + * element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). + */ +static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + + return get_unaligned_le16(common); +} + +/* Defined in Figure 9-1074t in P802.11be_D7.0 */ +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE 0x0001 +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK 0x001e +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE 0x0020 +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK 0x0040 +#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP 0x0080 + +/** + * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities + * and operations. + * @data: pointer to the multi-link element + * Return: the extended MLD capabilities and operations field value from + * the multi-link element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). 
+ */ +static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + + return get_unaligned_le16(common); +} + +/** + * ieee80211_mle_get_mld_id - returns the MLD ID + * @data: pointer to the multi-link element + * Return: The MLD ID in the given multi-link element, or 0 if not present + * + * The element is assumed to be of the correct type (BASIC) and big enough, + * this must be checked using ieee80211_mle_type_ok(). 
+ */ +static inline u8 ieee80211_mle_get_mld_id(const u8 *data) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control = le16_to_cpu(mle->control); + const u8 *common = mle->variable; + + /* + * common points now at the beginning of + * ieee80211_mle_basic_common_info + */ + common += sizeof(struct ieee80211_mle_basic_common_info); + + if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID)) + return 0; + + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + + return *common; +} + +/** + * ieee80211_mle_size_ok - validate multi-link element size + * @data: pointer to the element data + * @len: length of the containing element + * Return: whether or not the multi-link element size is OK + */ +static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u8 fixed = sizeof(*mle); + u8 common = 0; + bool check_common_len = false; + u16 control; + + if (!data || len < fixed) + return false; + + control = le16_to_cpu(mle->control); + + switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { + case IEEE80211_ML_CONTROL_TYPE_BASIC: + common += sizeof(struct ieee80211_mle_basic_common_info); + check_common_len = true; + if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) + common += 1; + if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP) + 
common += 2; + break; + case IEEE80211_ML_CONTROL_TYPE_PREQ: + common += sizeof(struct ieee80211_mle_preq_common_info); + if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID) + common += 1; + check_common_len = true; + break; + case IEEE80211_ML_CONTROL_TYPE_RECONF: + if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) + common += ETH_ALEN; + if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP) + common += 2; + if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP) + common += 2; + break; + case IEEE80211_ML_CONTROL_TYPE_TDLS: + common += sizeof(struct ieee80211_mle_tdls_common_info); + check_common_len = true; + break; + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + common = ETH_ALEN + 1; + break; + default: + /* we don't know this type */ + return true; + } + + if (len < fixed + common) + return false; + + if (!check_common_len) + return true; + + /* if present, common length is the first octet there */ + return mle->variable[0] >= common; +} + +/** + * ieee80211_mle_type_ok - validate multi-link element type and size + * @data: pointer to the element data + * @type: expected type of the element + * @len: length of the containing element + * Return: whether or not the multi-link element type matches and size is OK + */ +static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len) +{ + const struct ieee80211_multi_link_elem *mle = (const void *)data; + u16 control; + + if (!ieee80211_mle_size_ok(data, len)) + return false; + + control = le16_to_cpu(mle->control); + + if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) == type) + return true; + + return false; +} + +enum ieee80211_mle_subelems { + IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, + IEEE80211_MLE_SUBELEM_FRAGMENT = 254, +}; + +#define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f +#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE 0x0010 +#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 +#define 
IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT 0x0040 +#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT 0x0080 +#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT 0x0100 +#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT 0x0200 +#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE 0x0400 +#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT 0x0800 + +struct ieee80211_mle_per_sta_profile { + __le16 control; + u8 sta_info_len; + u8 variable[]; +} __packed; + +/** + * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta + * profile size + * @data: pointer to the sub element data + * @len: length of the containing sub element + * Return: %true if the STA profile is large enough, %false otherwise + */ +static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data, + size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + info_len += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + info_len += 2; + else + info_len += 1; + } + if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) + info_len += 1; + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len - 1 <= len; +} + +/** + * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS + * parameter change count + * @prof: the per-STA profile, having been checked with + * ieee80211_mle_basic_sta_prof_size_ok() for the 
correct length + * + * Return: The BSS parameter change count value if present, 0 otherwise. + */ +static inline u8 +ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof) +{ + u16 control = le16_to_cpu(prof->control); + const u8 *pos = prof->variable; + + if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)) + return 0; + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + pos += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + pos += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + pos += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + pos += 2; + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + pos += 2; + else + pos += 1; + } + + return *pos; +} + +#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID 0x000f +#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE 0x0010 +#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 +#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT 0x0040 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE 0x0780 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM 0 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK 2 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK 3 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS 4 +#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 + +/** + * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link + * element sta profile size. 
+ * @data: pointer to the sub element data + * @len: length of the containing sub element + * Return: %true if the STA profile is large enough, %false otherwise + */ +static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, + size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += ETH_ALEN; + if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT) + info_len += 2; + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len - 1 <= len; +} + +#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID 0x000f +#define IEEE80211_EPCS_ENA_RESP_BODY_LEN 3 + +static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_ttlm_elem *t2l = (const void *)data; + u8 control, fixed = sizeof(*t2l), elem_len = 0; + + if (len < fixed) + return false; + + control = t2l->control; + + if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT) + elem_len += 2; + if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) + elem_len += 3; + + if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) { + u8 bm_size; + + elem_len += 1; + if (len < fixed + elem_len) + return false; + + if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) + bm_size = 1; + else + bm_size = 2; + + elem_len += hweight8(t2l->optional[0]) * bm_size; + } + + return len >= fixed + elem_len; +} + +/** + * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay + * in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Padding delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline 
u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR + * Padding Delay subfield. + */ + u32 pad_delay = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + + if (!pad_delay || + pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return 0; + + return 32 * (1 << (pad_delay - 1)); +} + +/** + * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition + * delay in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition delay (in microseconds) encoded in the + * EML Capabilities field + */ + +static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR + * Transition Delay subfield. + */ + u32 trans_delay = + u16_get_bits(eml_cap, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + + /* invalid values also just use 0 */ + if (!trans_delay || + trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return 0; + + return 16 * (1 << (trans_delay - 1)); +} + +/** + * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition + * timeout value in microseconds + * @eml_cap: EML capabilities field value from common info field of + * the Multi-link element + * Return: the EMLSR Transition timeout (in microseconds) encoded in + * the EML Capabilities field + */ + +static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) +{ + /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the + * Transition Timeout subfield. 
+ */ + u8 timeout = u16_get_bits(eml_cap, + IEEE80211_EML_CAP_TRANSITION_TIMEOUT); + + /* invalid values also just use 0 */ + if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) + return 0; + + return 128 * (1 << (timeout - 1)); +} + +#define for_each_mle_subelement(_elem, _data, _len) \ + if (ieee80211_mle_size_ok(_data, _len)) \ + for_each_element(_elem, \ + _data + ieee80211_mle_common_size(_data),\ + _len - ieee80211_mle_common_size(_data)) + +#endif /* LINUX_IEEE80211_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a3dbbcee00ee..63a9775b059d 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1141,30 +1141,6 @@ ieee80211_s1g_optional_len(__le16 fc) return len; } -#define IEEE80211_TTLM_MAX_CNT 2 -#define IEEE80211_TTLM_CONTROL_DIRECTION 0x03 -#define IEEE80211_TTLM_CONTROL_DEF_LINK_MAP 0x04 -#define IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT 0x08 -#define IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT 0x10 -#define IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE 0x20 - -#define IEEE80211_TTLM_DIRECTION_DOWN 0 -#define IEEE80211_TTLM_DIRECTION_UP 1 -#define IEEE80211_TTLM_DIRECTION_BOTH 2 - -/** - * struct ieee80211_ttlm_elem - TID-To-Link Mapping element - * - * Defined in section 9.4.2.314 in P802.11be_D4 - * - * @control: the first part of control field - * @optional: the second part of control field - */ -struct ieee80211_ttlm_elem { - u8 control; - u8 optional[]; -} __packed; - /** * struct ieee80211_bss_load_elem - BSS Load elemen * @@ -1591,144 +1567,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -#define IEEE80211_EHT_MCS_NSS_RX 0x0f -#define IEEE80211_EHT_MCS_NSS_TX 0xf0 - -/** - * struct ieee80211_eht_mcs_nss_supp_20mhz_only - EHT 20MHz only station max - * supported NSS for per MCS. 
- * - * For each field below, bits 0 - 3 indicate the maximal number of spatial - * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams - * for Tx. - * - * @rx_tx_mcs7_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 0 - 7. - * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 8 - 9. - * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 10 - 11. - * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 12 - 13. - * @rx_tx_max_nss: array of the previous fields for easier loop access - */ -struct ieee80211_eht_mcs_nss_supp_20mhz_only { - union { - struct { - u8 rx_tx_mcs7_max_nss; - u8 rx_tx_mcs9_max_nss; - u8 rx_tx_mcs11_max_nss; - u8 rx_tx_mcs13_max_nss; - }; - u8 rx_tx_max_nss[4]; - }; -}; - -/** - * struct ieee80211_eht_mcs_nss_supp_bw - EHT max supported NSS per MCS (except - * 20MHz only stations). - * - * For each field below, bits 0 - 3 indicate the maximal number of spatial - * streams for Rx, and bits 4 - 7 indicate the maximal number of spatial streams - * for Tx. - * - * @rx_tx_mcs9_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 0 - 9. - * @rx_tx_mcs11_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 10 - 11. 
- * @rx_tx_mcs13_max_nss: indicates the maximum number of spatial streams - * supported for reception and the maximum number of spatial streams - * supported for transmission for MCS 12 - 13. - * @rx_tx_max_nss: array of the previous fields for easier loop access - */ -struct ieee80211_eht_mcs_nss_supp_bw { - union { - struct { - u8 rx_tx_mcs9_max_nss; - u8 rx_tx_mcs11_max_nss; - u8 rx_tx_mcs13_max_nss; - }; - u8 rx_tx_max_nss[3]; - }; -}; - -/** - * struct ieee80211_eht_cap_elem_fixed - EHT capabilities fixed data - * - * This structure is the "EHT Capabilities element" fixed fields as - * described in P802.11be_D2.0 section 9.4.2.313. - * - * @mac_cap_info: MAC capabilities, see IEEE80211_EHT_MAC_CAP* - * @phy_cap_info: PHY capabilities, see IEEE80211_EHT_PHY_CAP* - */ -struct ieee80211_eht_cap_elem_fixed { - u8 mac_cap_info[2]; - u8 phy_cap_info[9]; -} __packed; - -/** - * struct ieee80211_eht_cap_elem - EHT capabilities element - * @fixed: fixed parts, see &ieee80211_eht_cap_elem_fixed - * @optional: optional parts - */ -struct ieee80211_eht_cap_elem { - struct ieee80211_eht_cap_elem_fixed fixed; - - /* - * Followed by: - * Supported EHT-MCS And NSS Set field: 4, 3, 6 or 9 octets. - * EHT PPE Thresholds field: variable length. - */ - u8 optional[]; -} __packed; - -#define IEEE80211_EHT_OPER_INFO_PRESENT 0x01 -#define IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT 0x02 -#define IEEE80211_EHT_OPER_EHT_DEF_PE_DURATION 0x04 -#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_LIMIT 0x08 -#define IEEE80211_EHT_OPER_GROUP_ADDRESSED_BU_IND_EXP_MASK 0x30 -#define IEEE80211_EHT_OPER_MCS15_DISABLE 0x40 - -/** - * struct ieee80211_eht_operation - eht operation element - * - * This structure is the "EHT Operation Element" fields as - * described in P802.11be_D2.0 section 9.4.2.311 - * - * @params: EHT operation element parameters. 
See &IEEE80211_EHT_OPER_* - * @basic_mcs_nss: indicates the EHT-MCSs for each number of spatial streams in - * EHT PPDUs that are supported by all EHT STAs in the BSS in transmit and - * receive. - * @optional: optional parts - */ -struct ieee80211_eht_operation { - u8 params; - struct ieee80211_eht_mcs_nss_supp_20mhz_only basic_mcs_nss; - u8 optional[]; -} __packed; - -/** - * struct ieee80211_eht_operation_info - eht operation information - * - * @control: EHT operation information control. - * @ccfs0: defines a channel center frequency for a 20, 40, 80, 160, or 320 MHz - * EHT BSS. - * @ccfs1: defines a channel center frequency for a 160 or 320 MHz EHT BSS. - * @optional: optional parts - */ -struct ieee80211_eht_operation_info { - u8 control; - u8 ccfs0; - u8 ccfs1; - u8 optional[]; -} __packed; - /* S1G Capabilities Information field */ #define IEEE80211_S1G_CAPABILITY_LEN 15 @@ -1815,258 +1653,6 @@ struct ieee80211_eht_operation_info { #define S1G_2M_PRIMARY_LOCATION_LOWER 0 #define S1G_2M_PRIMARY_LOCATION_UPPER 1 -/* EHT MAC capabilities as defined in P802.11be_D2.0 section 9.4.2.313.2 */ -#define IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01 -#define IEEE80211_EHT_MAC_CAP0_OM_CONTROL 0x02 -#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1 0x04 -#define IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE2 0x08 -#define IEEE80211_EHT_MAC_CAP0_RESTRICTED_TWT 0x10 -#define IEEE80211_EHT_MAC_CAP0_SCS_TRAFFIC_DESC 0x20 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK 0xc0 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_3895 0 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_7991 1 -#define IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454 2 - -#define IEEE80211_EHT_MAC_CAP1_MAX_AMPDU_LEN_MASK 0x01 -#define IEEE80211_EHT_MAC_CAP1_EHT_TRS 0x02 -#define IEEE80211_EHT_MAC_CAP1_TXOP_RET 0x04 -#define IEEE80211_EHT_MAC_CAP1_TWO_BQRS 0x08 -#define IEEE80211_EHT_MAC_CAP1_EHT_LINK_ADAPT_MASK 0x30 -#define IEEE80211_EHT_MAC_CAP1_UNSOL_EPCS_PRIO_ACCESS 0x40 - -/* EHT PHY capabilities as 
defined in P802.11be_D2.0 section 9.4.2.313.3 */ -#define IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ 0x02 -#define IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ 0x04 -#define IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI 0x08 -#define IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO 0x10 -#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER 0x20 -#define IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE 0x40 - -/* EHT beamformee number of spatial streams <= 80MHz is split */ -#define IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK 0x80 -#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK 0x03 - -#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK 0x1c -#define IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_320MHZ_MASK 0xe0 - -#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK 0x07 -#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK 0x38 - -/* EHT number of sounding dimensions for 320MHz is split */ -#define IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_320MHZ_MASK 0xc0 -#define IEEE80211_EHT_PHY_CAP3_SOUNDING_DIM_320MHZ_MASK 0x01 -#define IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK 0x02 -#define IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK 0x04 -#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK 0x08 -#define IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK 0x10 -#define IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK 0x20 -#define IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK 0x40 -#define IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK 0x80 - -#define IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO 0x01 -#define IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP 0x02 -#define IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP 0x04 -#define IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI 0x08 -#define IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK 0xf0 - -#define IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK 0x01 -#define IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP 0x02 -#define IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP 0x04 -#define IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT 0x08 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK 0x30 -#define 
IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_0US 0 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_8US 1 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_16US 2 -#define IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_20US 3 - -/* Maximum number of supported EHT LTF is split */ -#define IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK 0xc0 -#define IEEE80211_EHT_PHY_CAP5_SUPP_EXTRA_EHT_LTF 0x40 -#define IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK 0x07 - -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_80MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_160MHZ 0x30 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_320MHZ 0x40 -#define IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK 0x78 -#define IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP 0x80 - -#define IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW 0x01 -#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ 0x02 -#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ 0x04 -#define IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ 0x08 -#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ 0x10 -#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ 0x20 -#define IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ 0x40 -#define IEEE80211_EHT_PHY_CAP7_TB_SOUNDING_FDBK_RATE_LIMIT 0x80 - -#define IEEE80211_EHT_PHY_CAP8_RX_1024QAM_WIDER_BW_DL_OFDMA 0x01 -#define IEEE80211_EHT_PHY_CAP8_RX_4096QAM_WIDER_BW_DL_OFDMA 0x02 - -/* - * EHT operation channel width as defined in P802.11be_D2.0 section 9.4.2.311 - */ -#define IEEE80211_EHT_OPER_CHAN_WIDTH 0x7 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_20MHZ 0 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_40MHZ 1 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ 2 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_160MHZ 3 -#define IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ 4 - -/* need HE definitions for EHT functions */ -#include "ieee80211-he.h" - -/* Calculate 802.11be EHT capabilities IE Tx/Rx EHT MCS NSS Support Field size */ -static inline u8 -ieee80211_eht_mcs_nss_size(const struct ieee80211_he_cap_elem *he_cap, - 
const struct ieee80211_eht_cap_elem_fixed *eht_cap, - bool from_ap) -{ - u8 count = 0; - - /* on 2.4 GHz, if it supports 40 MHz, the result is 3 */ - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) - return 3; - - /* on 2.4 GHz, these three bits are reserved, so should be 0 */ - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) - count += 3; - - if (he_cap->phy_cap_info[0] & - IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G) - count += 3; - - if (eht_cap->phy_cap_info[0] & IEEE80211_EHT_PHY_CAP0_320MHZ_IN_6GHZ) - count += 3; - - if (count) - return count; - - return from_ap ? 3 : 4; -} - -/* 802.11be EHT PPE Thresholds */ -#define IEEE80211_EHT_PPE_THRES_NSS_POS 0 -#define IEEE80211_EHT_PPE_THRES_NSS_MASK 0xf -#define IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK 0x1f0 -#define IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE 3 -#define IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE 9 - -/* - * Calculate 802.11be EHT capabilities IE EHT field size - */ -static inline u8 -ieee80211_eht_ppe_size(u16 ppe_thres_hdr, const u8 *phy_cap_info) -{ - u32 n; - - if (!(phy_cap_info[5] & - IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT)) - return 0; - - n = hweight16(ppe_thres_hdr & - IEEE80211_EHT_PPE_THRES_RU_INDEX_BITMASK_MASK); - n *= 1 + u16_get_bits(ppe_thres_hdr, IEEE80211_EHT_PPE_THRES_NSS_MASK); - - /* - * Each pair is 6 bits, and we need to add the 9 "header" bits to the - * total size. 
- */ - n = n * IEEE80211_EHT_PPE_THRES_INFO_PPET_SIZE * 2 + - IEEE80211_EHT_PPE_THRES_INFO_HEADER_SIZE; - return DIV_ROUND_UP(n, 8); -} - -static inline bool -ieee80211_eht_capa_size_ok(const u8 *he_capa, const u8 *data, u8 len, - bool from_ap) -{ - const struct ieee80211_eht_cap_elem_fixed *elem = (const void *)data; - u8 needed = sizeof(struct ieee80211_eht_cap_elem_fixed); - - if (len < needed || !he_capa) - return false; - - needed += ieee80211_eht_mcs_nss_size((const void *)he_capa, - (const void *)data, - from_ap); - if (len < needed) - return false; - - if (elem->phy_cap_info[5] & - IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT) { - u16 ppe_thres_hdr; - - if (len < needed + sizeof(ppe_thres_hdr)) - return false; - - ppe_thres_hdr = get_unaligned_le16(data + needed); - needed += ieee80211_eht_ppe_size(ppe_thres_hdr, - elem->phy_cap_info); - } - - return len >= needed; -} - -static inline bool -ieee80211_eht_oper_size_ok(const u8 *data, u8 len) -{ - const struct ieee80211_eht_operation *elem = (const void *)data; - u8 needed = sizeof(*elem); - - if (len < needed) - return false; - - if (elem->params & IEEE80211_EHT_OPER_INFO_PRESENT) { - needed += 3; - - if (elem->params & - IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT) - needed += 2; - } - - return len >= needed; -} - -/* must validate ieee80211_eht_oper_size_ok() first */ -static inline u16 -ieee80211_eht_oper_dis_subchan_bitmap(const struct ieee80211_eht_operation *eht_oper) -{ - const struct ieee80211_eht_operation_info *info = - (const void *)eht_oper->optional; - - if (!(eht_oper->params & IEEE80211_EHT_OPER_INFO_PRESENT)) - return 0; - - if (!(eht_oper->params & IEEE80211_EHT_OPER_DISABLED_SUBCHANNEL_BITMAP_PRESENT)) - return 0; - - return get_unaligned_le16(info->optional); -} - -#define IEEE80211_BW_IND_DIS_SUBCH_PRESENT BIT(1) - -struct ieee80211_bandwidth_indication { - u8 params; - struct ieee80211_eht_operation_info info; -} __packed; - -static inline bool 
-ieee80211_bandwidth_indication_size_ok(const u8 *data, u8 len) -{ - const struct ieee80211_bandwidth_indication *bwi = (const void *)data; - - if (len < sizeof(*bwi)) - return false; - - if (bwi->params & IEEE80211_BW_IND_DIS_SUBCH_PRESENT && - len < sizeof(*bwi) + 2) - return false; - - return true; -} - #define LISTEN_INT_USF GENMASK(15, 14) #define LISTEN_INT_UI GENMASK(13, 0) @@ -2587,23 +2173,6 @@ enum ieee80211_unprotected_wnm_actioncode { WLAN_UNPROTECTED_WNM_ACTION_TIMING_MEASUREMENT_RESPONSE = 1, }; -/* Protected EHT action codes */ -enum ieee80211_protected_eht_actioncode { - WLAN_PROTECTED_EHT_ACTION_TTLM_REQ = 0, - WLAN_PROTECTED_EHT_ACTION_TTLM_RES = 1, - WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN = 2, - WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ = 3, - WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP = 4, - WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN = 5, - WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF = 6, - WLAN_PROTECTED_EHT_ACTION_LINK_RECOMMEND = 7, - WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_REQ = 8, - WLAN_PROTECTED_EHT_ACTION_ML_OP_UPDATE_RESP = 9, - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_NOTIF = 10, - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ = 11, - WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP = 12, -}; - /* Security key length */ enum ieee80211_key_len { WLAN_KEY_LEN_WEP40 = 5, @@ -3855,737 +3424,6 @@ struct ieee80211_tbtt_info_ge_11 { struct ieee80211_rnr_mld_params mld_params; } __packed; -/* multi-link device */ -#define IEEE80211_MLD_MAX_NUM_LINKS 15 - -#define IEEE80211_ML_CONTROL_TYPE 0x0007 -#define IEEE80211_ML_CONTROL_TYPE_BASIC 0 -#define IEEE80211_ML_CONTROL_TYPE_PREQ 1 -#define IEEE80211_ML_CONTROL_TYPE_RECONF 2 -#define IEEE80211_ML_CONTROL_TYPE_TDLS 3 -#define IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS 4 -#define IEEE80211_ML_CONTROL_PRESENCE_MASK 0xfff0 - -struct ieee80211_multi_link_elem { - __le16 control; - u8 variable[]; -} __packed; - -#define IEEE80211_MLC_BASIC_PRES_LINK_ID 0x0010 -#define IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT 
0x0020 -#define IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY 0x0040 -#define IEEE80211_MLC_BASIC_PRES_EML_CAPA 0x0080 -#define IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP 0x0100 -#define IEEE80211_MLC_BASIC_PRES_MLD_ID 0x0200 -#define IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP 0x0400 - -#define IEEE80211_MED_SYNC_DELAY_DURATION 0x00ff -#define IEEE80211_MED_SYNC_DELAY_SYNC_OFDM_ED_THRESH 0x0f00 -#define IEEE80211_MED_SYNC_DELAY_SYNC_MAX_NUM_TXOPS 0xf000 - -/* - * Described in P802.11be_D3.0 - * dot11MSDTimerDuration should default to 5484 (i.e. 171.375) - * dot11MSDOFDMEDthreshold defaults to -72 (i.e. 0) - * dot11MSDTXOPMAX defaults to 1 - */ -#define IEEE80211_MED_SYNC_DELAY_DEFAULT 0x10ac - -#define IEEE80211_EML_CAP_EMLSR_SUPP 0x0001 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY 0x000e -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_0US 0 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_32US 1 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_64US 2 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_128US 3 -#define IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US 4 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY 0x0070 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_0US 0 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_16US 1 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_32US 2 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_64US 3 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_128US 4 -#define IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US 5 -#define IEEE80211_EML_CAP_EMLMR_SUPPORT 0x0080 -#define IEEE80211_EML_CAP_EMLMR_DELAY 0x0700 -#define IEEE80211_EML_CAP_EMLMR_DELAY_0US 0 -#define IEEE80211_EML_CAP_EMLMR_DELAY_32US 1 -#define IEEE80211_EML_CAP_EMLMR_DELAY_64US 2 -#define IEEE80211_EML_CAP_EMLMR_DELAY_128US 3 -#define IEEE80211_EML_CAP_EMLMR_DELAY_256US 4 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT 0x7800 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_0 0 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128US 1 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_256US 2 
-#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_512US 3 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_1TU 4 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_2TU 5 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_4TU 6 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_8TU 7 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_16TU 8 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_32TU 9 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_64TU 10 -#define IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU 11 - -#define IEEE80211_MLD_CAP_OP_MAX_SIMUL_LINKS 0x000f -#define IEEE80211_MLD_CAP_OP_SRS_SUPPORT 0x0010 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP 0x0060 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_NO_SUPP 0 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_SAME 1 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_RESERVED 2 -#define IEEE80211_MLD_CAP_OP_TID_TO_LINK_MAP_NEG_SUPP_DIFF 3 -#define IEEE80211_MLD_CAP_OP_FREQ_SEP_TYPE_IND 0x0f80 -#define IEEE80211_MLD_CAP_OP_AAR_SUPPORT 0x1000 -#define IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT 0x2000 -#define IEEE80211_MLD_CAP_OP_ALIGNED_TWT_SUPPORT 0x4000 - -struct ieee80211_mle_basic_common_info { - u8 len; - u8 mld_mac_addr[ETH_ALEN]; - u8 variable[]; -} __packed; - -#define IEEE80211_MLC_PREQ_PRES_MLD_ID 0x0010 - -struct ieee80211_mle_preq_common_info { - u8 len; - u8 variable[]; -} __packed; - -#define IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR 0x0010 -#define IEEE80211_MLC_RECONF_PRES_EML_CAPA 0x0020 -#define IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP 0x0040 -#define IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP 0x0080 - -/* no fixed fields in RECONF */ - -struct ieee80211_mle_tdls_common_info { - u8 len; - u8 ap_mld_mac_addr[ETH_ALEN]; -} __packed; - -#define IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR 0x0010 - -/* no fixed fields in PRIO_ACCESS */ - -/** - * ieee80211_mle_common_size - check multi-link element common size - * @data: multi-link element, must already be checked for size using - * ieee80211_mle_size_ok() - * Return: the 
size of the multi-link element's "common" subfield - */ -static inline u8 ieee80211_mle_common_size(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - - switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { - case IEEE80211_ML_CONTROL_TYPE_BASIC: - case IEEE80211_ML_CONTROL_TYPE_PREQ: - case IEEE80211_ML_CONTROL_TYPE_TDLS: - case IEEE80211_ML_CONTROL_TYPE_RECONF: - case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - /* - * The length is the first octet pointed by mle->variable so no - * need to add anything - */ - break; - default: - WARN_ON(1); - return 0; - } - - return sizeof(*mle) + mle->variable[0]; -} - -/** - * ieee80211_mle_get_link_id - returns the link ID - * @data: the basic multi link element - * Return: the link ID, or -1 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). - */ -static inline int ieee80211_mle_get_link_id(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_LINK_ID)) - return -1; - - return *common; -} - -/** - * ieee80211_mle_get_bss_param_ch_cnt - returns the BSS parameter change count - * @data: pointer to the basic multi link element - * Return: the BSS Parameter Change Count field value, or -1 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline int -ieee80211_mle_get_bss_param_ch_cnt(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT)) - return -1; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - - return *common; -} - -/** - * ieee80211_mle_get_eml_med_sync_delay - returns the medium sync delay - * @data: pointer to the multi-link element - * Return: the medium synchronization delay field value from the multi-link - * element, or the default value (%IEEE80211_MED_SYNC_DELAY_DEFAULT) - * if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). - */ -static inline u16 ieee80211_mle_get_eml_med_sync_delay(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY)) - return IEEE80211_MED_SYNC_DELAY_DEFAULT; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - - return get_unaligned_le16(common); -} - -/** - * ieee80211_mle_get_eml_cap - returns the EML capability - * @data: pointer to the multi-link element - * Return: the EML capability field value from the multi-link element, - * or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline u16 ieee80211_mle_get_eml_cap(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* common points now at the beginning of ieee80211_mle_basic_common_info */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_EML_CAPA)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - - return get_unaligned_le16(common); -} - -/** - * ieee80211_mle_get_mld_capa_op - returns the MLD capabilities and operations. - * @data: pointer to the multi-link element - * Return: the MLD capabilities and operations field value from the multi-link - * element, or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline u16 ieee80211_mle_get_mld_capa_op(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* - * common points now at the beginning of - * ieee80211_mle_basic_common_info - */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - - return get_unaligned_le16(common); -} - -/* Defined in Figure 9-1074t in P802.11be_D7.0 */ -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_PARAM_UPDATE 0x0001 -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_OP_RECO_MAX_LINKS_MASK 0x001e -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_NSTR_UPDATE 0x0020 -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_EMLSR_ENA_ON_ONE_LINK 0x0040 -#define IEEE80211_EHT_ML_EXT_MLD_CAPA_BTM_MLD_RECO_MULTI_AP 0x0080 - -/** - * ieee80211_mle_get_ext_mld_capa_op - returns the extended MLD capabilities - * and operations. - * @data: pointer to the multi-link element - * Return: the extended MLD capabilities and operations field value from - * the multi-link element, or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline u16 ieee80211_mle_get_ext_mld_capa_op(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* - * common points now at the beginning of - * ieee80211_mle_basic_common_info - */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) - common += 1; - - return get_unaligned_le16(common); -} - -/** - * ieee80211_mle_get_mld_id - returns the MLD ID - * @data: pointer to the multi-link element - * Return: The MLD ID in the given multi-link element, or 0 if not present - * - * The element is assumed to be of the correct type (BASIC) and big enough, - * this must be checked using ieee80211_mle_type_ok(). 
- */ -static inline u8 ieee80211_mle_get_mld_id(const u8 *data) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control = le16_to_cpu(mle->control); - const u8 *common = mle->variable; - - /* - * common points now at the beginning of - * ieee80211_mle_basic_common_info - */ - common += sizeof(struct ieee80211_mle_basic_common_info); - - if (!(control & IEEE80211_MLC_BASIC_PRES_MLD_ID)) - return 0; - - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) - common += 2; - - return *common; -} - -/** - * ieee80211_mle_size_ok - validate multi-link element size - * @data: pointer to the element data - * @len: length of the containing element - * Return: whether or not the multi-link element size is OK - */ -static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u8 fixed = sizeof(*mle); - u8 common = 0; - bool check_common_len = false; - u16 control; - - if (!data || len < fixed) - return false; - - control = le16_to_cpu(mle->control); - - switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { - case IEEE80211_ML_CONTROL_TYPE_BASIC: - common += sizeof(struct ieee80211_mle_basic_common_info); - check_common_len = true; - if (control & IEEE80211_MLC_BASIC_PRES_LINK_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_BSS_PARAM_CH_CNT) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_MED_SYNC_DELAY) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_CAPA_OP) - common += 2; - if (control & IEEE80211_MLC_BASIC_PRES_MLD_ID) - common += 1; - if (control & IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP) - 
common += 2; - break; - case IEEE80211_ML_CONTROL_TYPE_PREQ: - common += sizeof(struct ieee80211_mle_preq_common_info); - if (control & IEEE80211_MLC_PREQ_PRES_MLD_ID) - common += 1; - check_common_len = true; - break; - case IEEE80211_ML_CONTROL_TYPE_RECONF: - if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) - common += ETH_ALEN; - if (control & IEEE80211_MLC_RECONF_PRES_EML_CAPA) - common += 2; - if (control & IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP) - common += 2; - if (control & IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP) - common += 2; - break; - case IEEE80211_ML_CONTROL_TYPE_TDLS: - common += sizeof(struct ieee80211_mle_tdls_common_info); - check_common_len = true; - break; - case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: - common = ETH_ALEN + 1; - break; - default: - /* we don't know this type */ - return true; - } - - if (len < fixed + common) - return false; - - if (!check_common_len) - return true; - - /* if present, common length is the first octet there */ - return mle->variable[0] >= common; -} - -/** - * ieee80211_mle_type_ok - validate multi-link element type and size - * @data: pointer to the element data - * @type: expected type of the element - * @len: length of the containing element - * Return: whether or not the multi-link element type matches and size is OK - */ -static inline bool ieee80211_mle_type_ok(const u8 *data, u8 type, size_t len) -{ - const struct ieee80211_multi_link_elem *mle = (const void *)data; - u16 control; - - if (!ieee80211_mle_size_ok(data, len)) - return false; - - control = le16_to_cpu(mle->control); - - if (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE) == type) - return true; - - return false; -} - -enum ieee80211_mle_subelems { - IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, - IEEE80211_MLE_SUBELEM_FRAGMENT = 254, -}; - -#define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f -#define IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE 0x0010 -#define IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 -#define 
IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT 0x0040 -#define IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT 0x0080 -#define IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT 0x0100 -#define IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT 0x0200 -#define IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE 0x0400 -#define IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT 0x0800 - -struct ieee80211_mle_per_sta_profile { - __le16 control; - u8 sta_info_len; - u8 variable[]; -} __packed; - -/** - * ieee80211_mle_basic_sta_prof_size_ok - validate basic multi-link element sta - * profile size - * @data: pointer to the sub element data - * @len: length of the containing sub element - * Return: %true if the STA profile is large enough, %false otherwise - */ -static inline bool ieee80211_mle_basic_sta_prof_size_ok(const u8 *data, - size_t len) -{ - const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; - u16 control; - u8 fixed = sizeof(*prof); - u8 info_len = 1; - - if (len < fixed) - return false; - - control = le16_to_cpu(prof->control); - - if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) - info_len += 6; - if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) - info_len += 2; - if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) - info_len += 8; - if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) - info_len += 2; - if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && - control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { - if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) - info_len += 2; - else - info_len += 1; - } - if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) - info_len += 1; - - return prof->sta_info_len >= info_len && - fixed + prof->sta_info_len - 1 <= len; -} - -/** - * ieee80211_mle_basic_sta_prof_bss_param_ch_cnt - get per-STA profile BSS - * parameter change count - * @prof: the per-STA profile, having been checked with - * ieee80211_mle_basic_sta_prof_size_ok() for the 
correct length - * - * Return: The BSS parameter change count value if present, 0 otherwise. - */ -static inline u8 -ieee80211_mle_basic_sta_prof_bss_param_ch_cnt(const struct ieee80211_mle_per_sta_profile *prof) -{ - u16 control = le16_to_cpu(prof->control); - const u8 *pos = prof->variable; - - if (!(control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT)) - return 0; - - if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) - pos += 6; - if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) - pos += 2; - if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) - pos += 8; - if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) - pos += 2; - if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && - control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { - if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) - pos += 2; - else - pos += 1; - } - - return *pos; -} - -#define IEEE80211_MLE_STA_RECONF_CONTROL_LINK_ID 0x000f -#define IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE 0x0010 -#define IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT 0x0020 -#define IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT 0x0040 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE 0x0780 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_AP_REM 0 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_OP_PARAM_UPDATE 1 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK 2 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK 3 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_NSTR_STATUS 4 -#define IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT 0x0800 - -/** - * ieee80211_mle_reconf_sta_prof_size_ok - validate reconfiguration multi-link - * element sta profile size. 
- * @data: pointer to the sub element data - * @len: length of the containing sub element - * Return: %true if the STA profile is large enough, %false otherwise - */ -static inline bool ieee80211_mle_reconf_sta_prof_size_ok(const u8 *data, - size_t len) -{ - const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; - u16 control; - u8 fixed = sizeof(*prof); - u8 info_len = 1; - - if (len < fixed) - return false; - - control = le16_to_cpu(prof->control); - - if (control & IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT) - info_len += ETH_ALEN; - if (control & IEEE80211_MLE_STA_RECONF_CONTROL_AP_REM_TIMER_PRESENT) - info_len += 2; - if (control & IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_PARAMS_PRESENT) - info_len += 2; - - return prof->sta_info_len >= info_len && - fixed + prof->sta_info_len - 1 <= len; -} - -#define IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID 0x000f -#define IEEE80211_EPCS_ENA_RESP_BODY_LEN 3 - -static inline bool ieee80211_tid_to_link_map_size_ok(const u8 *data, size_t len) -{ - const struct ieee80211_ttlm_elem *t2l = (const void *)data; - u8 control, fixed = sizeof(*t2l), elem_len = 0; - - if (len < fixed) - return false; - - control = t2l->control; - - if (control & IEEE80211_TTLM_CONTROL_SWITCH_TIME_PRESENT) - elem_len += 2; - if (control & IEEE80211_TTLM_CONTROL_EXPECTED_DUR_PRESENT) - elem_len += 3; - - if (!(control & IEEE80211_TTLM_CONTROL_DEF_LINK_MAP)) { - u8 bm_size; - - elem_len += 1; - if (len < fixed + elem_len) - return false; - - if (control & IEEE80211_TTLM_CONTROL_LINK_MAP_SIZE) - bm_size = 1; - else - bm_size = 2; - - elem_len += hweight8(t2l->optional[0]) * bm_size; - } - - return len >= fixed + elem_len; -} - -/** - * ieee80211_emlsr_pad_delay_in_us - Fetch the EMLSR Padding delay - * in microseconds - * @eml_cap: EML capabilities field value from common info field of - * the Multi-link element - * Return: the EMLSR Padding delay (in microseconds) encoded in the - * EML Capabilities field - */ - -static inline 
u32 ieee80211_emlsr_pad_delay_in_us(u16 eml_cap) -{ - /* IEEE Std 802.11be-2024 Table 9-417i—Encoding of the EMLSR - * Padding Delay subfield. - */ - u32 pad_delay = u16_get_bits(eml_cap, - IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); - - if (!pad_delay || - pad_delay > IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) - return 0; - - return 32 * (1 << (pad_delay - 1)); -} - -/** - * ieee80211_emlsr_trans_delay_in_us - Fetch the EMLSR Transition - * delay in microseconds - * @eml_cap: EML capabilities field value from common info field of - * the Multi-link element - * Return: the EMLSR Transition delay (in microseconds) encoded in the - * EML Capabilities field - */ - -static inline u32 ieee80211_emlsr_trans_delay_in_us(u16 eml_cap) -{ - /* IEEE Std 802.11be-2024 Table 9-417j—Encoding of the EMLSR - * Transition Delay subfield. - */ - u32 trans_delay = - u16_get_bits(eml_cap, - IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); - - /* invalid values also just use 0 */ - if (!trans_delay || - trans_delay > IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) - return 0; - - return 16 * (1 << (trans_delay - 1)); -} - -/** - * ieee80211_eml_trans_timeout_in_us - Fetch the EMLSR Transition - * timeout value in microseconds - * @eml_cap: EML capabilities field value from common info field of - * the Multi-link element - * Return: the EMLSR Transition timeout (in microseconds) encoded in - * the EML Capabilities field - */ - -static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) -{ - /* IEEE Std 802.11be-2024 Table 9-417m—Encoding of the - * Transition Timeout subfield. 
- */ - u8 timeout = u16_get_bits(eml_cap, - IEEE80211_EML_CAP_TRANSITION_TIMEOUT); - - /* invalid values also just use 0 */ - if (!timeout || timeout > IEEE80211_EML_CAP_TRANSITION_TIMEOUT_128TU) - return 0; - - return 128 * (1 << (timeout - 1)); -} - -#define for_each_mle_subelement(_elem, _data, _len) \ - if (ieee80211_mle_size_ok(_data, _len)) \ - for_each_element(_elem, \ - _data + ieee80211_mle_common_size(_data),\ - _len - ieee80211_mle_common_size(_data)) - /* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ #define NAN_OP_MODE_PHY_MODE_VHT 0x01 #define NAN_OP_MODE_PHY_MODE_HE 0x10 @@ -4605,6 +3443,8 @@ static inline u32 ieee80211_eml_trans_timeout_in_us(u16 eml_cap) #include "ieee80211-ht.h" #include "ieee80211-vht.h" +#include "ieee80211-he.h" +#include "ieee80211-eht.h" #include "ieee80211-mesh.h" #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From 00105d7600bfb171037783da5f26e2565c7d2106 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:54 +0100 Subject: wifi: ieee80211: split S1G definitions out The ieee80211.h file has gotten very long, continue splitting it by putting S1G definitions into a separate file. 
Link: https://patch.msgid.link/20251105153843.82c0bddee6e3.Ic6646615286dad240b42e31e9d428c7e4ea40ce0@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-s1g.h | 575 +++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 585 ++---------------------------------------- 2 files changed, 591 insertions(+), 569 deletions(-) create mode 100644 include/linux/ieee80211-s1g.h (limited to 'include') diff --git a/include/linux/ieee80211-s1g.h b/include/linux/ieee80211-s1g.h new file mode 100644 index 000000000000..5b9ed2dcc00e --- /dev/null +++ b/include/linux/ieee80211-s1g.h @@ -0,0 +1,575 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * IEEE 802.11 S1G definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_S1G_H +#define LINUX_IEEE80211_S1G_H + +#include +#include + +/* bits unique to S1G beacon frame control */ +#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 +#define IEEE80211_S1G_BCN_CSSID 0x200 +#define IEEE80211_S1G_BCN_ANO 0x400 + +/* see 802.11ah-2016 9.9 NDP CMAC frames */ +#define IEEE80211_S1G_1MHZ_NDP_BITS 25 +#define IEEE80211_S1G_1MHZ_NDP_BYTES 4 +#define IEEE80211_S1G_2MHZ_NDP_BITS 37 +#define IEEE80211_S1G_2MHZ_NDP_BYTES 5 + +/** + * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && + * IEEE80211_STYPE_S1G_BEACON + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame is an S1G beacon + */ +static inline bool ieee80211_is_s1g_beacon(__le16 fc) +{ + return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | + IEEE80211_FCTL_STYPE)) == + cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); +} + +/** + * 
ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * next TBTT field + */ +static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); +} + +/** + * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * ANO field + */ +static inline bool ieee80211_s1g_has_ano(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO)); +} + +/** + * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID + * @fc: frame control bytes in little-endian byteorder + * Return: whether or not the frame contains the variable-length + * compressed SSID field + */ +static inline bool ieee80211_s1g_has_cssid(__le16 fc) +{ + return ieee80211_is_s1g_beacon(fc) && + (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); +} + +/** + * enum ieee80211_s1g_chanwidth - S1G channel widths + * These are defined in IEEE802.11-2016ah Table 10-20 + * as BSS Channel Width + * + * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel + * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel + */ +enum ieee80211_s1g_chanwidth { + IEEE80211_S1G_CHANWIDTH_1MHZ = 0, + IEEE80211_S1G_CHANWIDTH_2MHZ = 1, + IEEE80211_S1G_CHANWIDTH_4MHZ = 3, + IEEE80211_S1G_CHANWIDTH_8MHZ = 7, + IEEE80211_S1G_CHANWIDTH_16MHZ = 15, +}; + +/** + * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths + * described in IEEE80211-2024 Table 10-39. 
+ * + * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel + * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel + */ +enum ieee80211_s1g_pri_chanwidth { + IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, + IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, +}; + +/** + * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element + * @compat_info: Compatibility Information + * @beacon_int: Beacon Interval + * @tsf_completion: TSF Completion + * + * This structure represents the payload of the "S1G Beacon + * Compatibility element" as described in IEEE Std 802.11-2020 section + * 9.4.2.196. + */ +struct ieee80211_s1g_bcn_compat_ie { + __le16 compat_info; + __le16 beacon_int; + __le32 tsf_completion; +} __packed; + +/** + * struct ieee80211_s1g_oper_ie - S1G Operation element + * @ch_width: S1G Operation Information Channel Width + * @oper_class: S1G Operation Information Operating Class + * @primary_ch: S1G Operation Information Primary Channel Number + * @oper_ch: S1G Operation Information Channel Center Frequency + * @basic_mcs_nss: Basic S1G-MCS and NSS Set + * + * This structure represents the payload of the "S1G Operation + * element" as described in IEEE Std 802.11-2020 section 9.4.2.212. + */ +struct ieee80211_s1g_oper_ie { + u8 ch_width; + u8 oper_class; + u8 primary_ch; + u8 oper_ch; + __le16 basic_mcs_nss; +} __packed; + +/** + * struct ieee80211_aid_response_ie - AID Response element + * @aid: AID/Group AID + * @switch_count: AID Switch Count + * @response_int: AID Response Interval + * + * This structure represents the payload of the "AID Response element" + * as described in IEEE Std 802.11-2020 section 9.4.2.194. 
+ */ +struct ieee80211_aid_response_ie { + __le16 aid; + u8 switch_count; + __le16 response_int; +} __packed; + +struct ieee80211_s1g_cap { + u8 capab_info[10]; + u8 supp_mcs_nss[5]; +} __packed; + +/** + * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields + * @fc: frame control bytes in little-endian byteorder + * Return: total length in bytes of the optional fixed-length fields + * + * S1G beacons may contain up to three optional fixed-length fields that + * precede the variable-length elements. Whether these fields are present + * is indicated by flags in the frame control field. + * + * From IEEE 802.11-2024 section 9.3.4.3: + * - Next TBTT field may be 0 or 3 bytes + * - Short SSID field may be 0 or 4 bytes + * - Access Network Options (ANO) field may be 0 or 1 byte + */ +static inline size_t +ieee80211_s1g_optional_len(__le16 fc) +{ + size_t len = 0; + + if (ieee80211_s1g_has_next_tbtt(fc)) + len += 3; + + if (ieee80211_s1g_has_cssid(fc)) + len += 4; + + if (ieee80211_s1g_has_ano(fc)) + len += 1; + + return len; +} + +/* S1G Capabilities Information field */ +#define IEEE80211_S1G_CAPABILITY_LEN 15 + +#define S1G_CAP0_S1G_LONG BIT(0) +#define S1G_CAP0_SGI_1MHZ BIT(1) +#define S1G_CAP0_SGI_2MHZ BIT(2) +#define S1G_CAP0_SGI_4MHZ BIT(3) +#define S1G_CAP0_SGI_8MHZ BIT(4) +#define S1G_CAP0_SGI_16MHZ BIT(5) +#define S1G_CAP0_SUPP_CH_WIDTH GENMASK(7, 6) + +#define S1G_SUPP_CH_WIDTH_2 0 +#define S1G_SUPP_CH_WIDTH_4 1 +#define S1G_SUPP_CH_WIDTH_8 2 +#define S1G_SUPP_CH_WIDTH_16 3 +#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \ + cap[0])) << 1) + +#define S1G_CAP1_RX_LDPC BIT(0) +#define S1G_CAP1_TX_STBC BIT(1) +#define S1G_CAP1_RX_STBC BIT(2) +#define S1G_CAP1_SU_BFER BIT(3) +#define S1G_CAP1_SU_BFEE BIT(4) +#define S1G_CAP1_BFEE_STS GENMASK(7, 5) + +#define S1G_CAP2_SOUNDING_DIMENSIONS GENMASK(2, 0) +#define S1G_CAP2_MU_BFER BIT(3) +#define S1G_CAP2_MU_BFEE BIT(4) +#define S1G_CAP2_PLUS_HTC_VHT BIT(5) +#define 
S1G_CAP2_TRAVELING_PILOT GENMASK(7, 6) + +#define S1G_CAP3_RD_RESPONDER BIT(0) +#define S1G_CAP3_HT_DELAYED_BA BIT(1) +#define S1G_CAP3_MAX_MPDU_LEN BIT(2) +#define S1G_CAP3_MAX_AMPDU_LEN_EXP GENMASK(4, 3) +#define S1G_CAP3_MIN_MPDU_START GENMASK(7, 5) + +#define S1G_CAP4_UPLINK_SYNC BIT(0) +#define S1G_CAP4_DYNAMIC_AID BIT(1) +#define S1G_CAP4_BAT BIT(2) +#define S1G_CAP4_TIME_ADE BIT(3) +#define S1G_CAP4_NON_TIM BIT(4) +#define S1G_CAP4_GROUP_AID BIT(5) +#define S1G_CAP4_STA_TYPE GENMASK(7, 6) + +#define S1G_CAP5_CENT_AUTH_CONTROL BIT(0) +#define S1G_CAP5_DIST_AUTH_CONTROL BIT(1) +#define S1G_CAP5_AMSDU BIT(2) +#define S1G_CAP5_AMPDU BIT(3) +#define S1G_CAP5_ASYMMETRIC_BA BIT(4) +#define S1G_CAP5_FLOW_CONTROL BIT(5) +#define S1G_CAP5_SECTORIZED_BEAM GENMASK(7, 6) + +#define S1G_CAP6_OBSS_MITIGATION BIT(0) +#define S1G_CAP6_FRAGMENT_BA BIT(1) +#define S1G_CAP6_NDP_PS_POLL BIT(2) +#define S1G_CAP6_RAW_OPERATION BIT(3) +#define S1G_CAP6_PAGE_SLICING BIT(4) +#define S1G_CAP6_TXOP_SHARING_IMP_ACK BIT(5) +#define S1G_CAP6_VHT_LINK_ADAPT GENMASK(7, 6) + +#define S1G_CAP7_TACK_AS_PS_POLL BIT(0) +#define S1G_CAP7_DUP_1MHZ BIT(1) +#define S1G_CAP7_MCS_NEGOTIATION BIT(2) +#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) +#define S1G_CAP7_NDP_BFING_REPORT_POLL BIT(4) +#define S1G_CAP7_UNSOLICITED_DYN_AID BIT(5) +#define S1G_CAP7_SECTOR_TRAINING_OPERATION BIT(6) +#define S1G_CAP7_TEMP_PS_MODE_SWITCH BIT(7) + +#define S1G_CAP8_TWT_GROUPING BIT(0) +#define S1G_CAP8_BDT BIT(1) +#define S1G_CAP8_COLOR GENMASK(4, 2) +#define S1G_CAP8_TWT_REQUEST BIT(5) +#define S1G_CAP8_TWT_RESPOND BIT(6) +#define S1G_CAP8_PV1_FRAME BIT(7) + +#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) + +#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) +#define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) +#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) + +#define S1G_2M_PRIMARY_LOCATION_LOWER 0 +#define S1G_2M_PRIMARY_LOCATION_UPPER 1 + +#define LISTEN_INT_USF GENMASK(15, 14) +#define LISTEN_INT_UI GENMASK(13, 0) + 
+#define IEEE80211_MAX_USF FIELD_MAX(LISTEN_INT_USF) +#define IEEE80211_MAX_UI FIELD_MAX(LISTEN_INT_UI) + +/* S1G encoding types */ +#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 0 +#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 +#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 + +enum ieee80211_s1g_actioncode { + WLAN_S1G_AID_SWITCH_REQUEST, + WLAN_S1G_AID_SWITCH_RESPONSE, + WLAN_S1G_SYNC_CONTROL, + WLAN_S1G_STA_INFO_ANNOUNCE, + WLAN_S1G_EDCA_PARAM_SET, + WLAN_S1G_EL_OPERATION, + WLAN_S1G_TWT_SETUP, + WLAN_S1G_TWT_TEARDOWN, + WLAN_S1G_SECT_GROUP_ID_LIST, + WLAN_S1G_SECT_ID_FEEDBACK, + WLAN_S1G_TWT_INFORMATION = 11, +}; + +/** + * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon + * @fc: frame control bytes in little-endian byteorder + * @variable: pointer to the beacon frame elements + * @variable_len: length of the frame elements + * Return: whether or not the frame is an S1G short beacon. As per + * IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall + * always be present as the first element in beacon frames generated at a + * TBTT (Target Beacon Transmission Time), so any frame not containing + * this element must have been generated at a TSBTT (Target Short Beacon + * Transmission Time) that is not a TBTT. Additionally, short beacons are + * prohibited from containing the S1G beacon compatibility element as per + * IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with + * either no elements or the first element is not the beacon compatibility + * element, we have a short beacon. + */ +static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable, + size_t variable_len) +{ + if (!ieee80211_is_s1g_beacon(fc)) + return false; + + /* + * If the frame does not contain at least 1 element (this is perfectly + * valid in a short beacon) and is an S1G beacon, we have a short + * beacon. 
+ */ + if (variable_len < 2) + return true; + + return variable[0] != WLAN_EID_S1G_BCN_COMPAT; +} + +struct s1g_tim_aid { + u16 aid; + u8 target_blk; /* Target block index */ + u8 target_subblk; /* Target subblock index */ + u8 target_subblk_bit; /* Target subblock bit */ +}; + +struct s1g_tim_enc_block { + u8 enc_mode; + bool inverse; + const u8 *ptr; + u8 len; + + /* + * For an OLB encoded block that spans multiple blocks, this + * is the offset into the span described by that encoded block. + */ + u8 olb_blk_offset; +}; + +/* + * Helper routines to quickly extract the length of an encoded block. Validation + * is also performed to ensure the length extracted lies within the TIM. + */ + +static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) +{ + u8 blkmap; + u8 n_subblks; + + if (ptr >= end) + return -EINVAL; + + blkmap = *ptr; + n_subblks = hweight8(blkmap); + + if (ptr + 1 + n_subblks > end) + return -EINVAL; + + return 1 + n_subblks; +} + +static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) +{ + return (ptr + 1 > end) ? -EINVAL : 1; +} + +static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) +{ + if (ptr >= end) + return -EINVAL; + + return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; +} + +/* + * Enumerate all encoded blocks until we find the encoded block that describes + * our target AID. OLB is a special case as a single encoded block can describe + * multiple blocks as a single encoded block. 
+ */ +static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc, + const struct s1g_tim_aid *aid, + const u8 *ptr, const u8 *end) +{ + /* need at least block-control octet */ + while (ptr + 1 <= end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool contains, inverse = ctrl & BIT(2); + u8 span, blk_off = ctrl >> 3; + int len; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + len = ieee80211_s1g_len_bitmap(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + len = ieee80211_s1g_len_single(ptr, end); + contains = blk_off == aid->target_blk; + break; + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + len = ieee80211_s1g_len_olb(ptr, end); + /* + * An OLB encoded block can describe more then one + * block, meaning an encoded OLB block can span more + * then a single block. + */ + if (len > 0) { + /* Minus one for the length octet */ + span = DIV_ROUND_UP(len - 1, 8); + /* + * Check if our target block lies within the + * block span described by this encoded block. + */ + contains = (aid->target_blk >= blk_off) && + (aid->target_blk < blk_off + span); + } + break; + default: + return -EOPNOTSUPP; + } + + if (len < 0) + return len; + + if (contains) { + enc->enc_mode = mode; + enc->inverse = inverse; + enc->ptr = ptr; + enc->len = (u8)len; + enc->olb_blk_offset = blk_off; + return 0; + } + + ptr += len; + } + + return -ENOENT; +} + +static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blkmap = *ptr++; + + /* + * If our block bitmap does not contain a set bit that corresponds + * to our AID, it could mean a variety of things depending on if + * the encoding mode is inverted or not. + * + * 1. If inverted, it means the entire subblock is present and hence + * our AID has been set. + * 2. If not inverted, it means our subblock is not present and hence + * it is all zero meaning our AID is not set. 
+ */ + if (!(blkmap & BIT(aid->target_subblk))) + return enc->inverse; + + /* + * Increment ptr by the number of set subblocks that appear before our + * target subblock. If our target subblock is 0, do nothing as ptr + * already points to our target subblock. + */ + if (aid->target_subblk) + ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0)); + + return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + /* + * Single AID mode describes, as the name suggests, a single AID + * within the block described by the encoded block. The octet + * contains the 6 LSBs of the AID described in the block. The other + * 2 bits are reserved. When inversed, every single AID described + * by the current block have buffered traffic except for the AID + * described in the single AID octet. + */ + return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse; +} + +static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc, + struct s1g_tim_aid *aid) +{ + const u8 *ptr = enc->ptr; + u8 blk_len = *ptr++; + /* + * Given an OLB encoded block that describes multiple blocks, + * calculate the offset into the span. Then calculate the + * subblock location normally. + */ + u16 span_offset = aid->target_blk - enc->olb_blk_offset; + u16 subblk_idx = span_offset * 8 + aid->target_subblk; + + if (subblk_idx >= blk_len) + return enc->inverse; + + return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; +} + +/* + * An S1G PVB has 3 non optional encoding types, each that can be inverted. + * An S1G PVB is constructed with zero or more encoded block subfields. Each + * encoded block represents a single "block" of AIDs (64), and each encoded + * block can contain one of the 3 encoding types alongside a single bit for + * whether the bits should be inverted. 
+ * + * As the standard makes no guarantee about the ordering of encoded blocks, + * we must parse every encoded block in the worst case scenario given an + * AID that lies within the last block. + */ +static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid) +{ + int err; + struct s1g_tim_aid target_aid; + struct s1g_tim_enc_block enc_blk; + + if (tim_len < 3) + return false; + + target_aid.aid = aid; + target_aid.target_blk = (aid >> 6) & 0x1f; + target_aid.target_subblk = (aid >> 3) & 0x7; + target_aid.target_subblk_bit = aid & 0x7; + + /* + * Find our AIDs target encoded block and fill &enc_blk with the + * encoded blocks information. If no entry is found or an error + * occurs return false. + */ + err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, + tim->virtual_map, + (const u8 *)tim + tim_len + 2); + if (err) + return false; + + switch (enc_blk.enc_mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: + return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: + return ieee80211_s1g_parse_single(&enc_blk, &target_aid); + case IEEE80211_S1G_TIM_ENC_MODE_OLB: + return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); + default: + return false; + } +} + +#endif /* LINUX_IEEE80211_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 63a9775b059d..1b27bbac145b 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -109,17 +109,6 @@ #define IEEE80211_STYPE_DMG_BEACON 0x0000 #define IEEE80211_STYPE_S1G_BEACON 0x0010 -/* bits unique to S1G beacon */ -#define IEEE80211_S1G_BCN_NEXT_TBTT 0x100 -#define IEEE80211_S1G_BCN_CSSID 0x200 -#define IEEE80211_S1G_BCN_ANO 0x400 - -/* see 802.11ah-2016 9.9 NDP CMAC frames */ -#define IEEE80211_S1G_1MHZ_NDP_BITS 25 -#define IEEE80211_S1G_1MHZ_NDP_BYTES 4 -#define IEEE80211_S1G_2MHZ_NDP_BITS 37 -#define IEEE80211_S1G_2MHZ_NDP_BYTES 5 - #define IEEE80211_NDP_FTYPE_CTS 0 #define 
IEEE80211_NDP_FTYPE_CF_END 0 #define IEEE80211_NDP_FTYPE_PS_POLL 1 @@ -221,11 +210,6 @@ static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) #define IEEE80211_MAX_TIM_LEN 251 #define IEEE80211_MAX_MESH_PEERINGS 63 -/* S1G encoding types */ -#define IEEE80211_S1G_TIM_ENC_MODE_BLOCK 0 -#define IEEE80211_S1G_TIM_ENC_MODE_SINGLE 1 -#define IEEE80211_S1G_TIM_ENC_MODE_OLB 2 - /* Maximum size for the MA-UNITDATA primitive, 802.11 standard section 6.2.1.1.2. @@ -604,55 +588,6 @@ static inline bool ieee80211_is_beacon(__le16 fc) cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); } -/** - * ieee80211_is_s1g_beacon - check if IEEE80211_FTYPE_EXT && - * IEEE80211_STYPE_S1G_BEACON - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame is an S1G beacon - */ -static inline bool ieee80211_is_s1g_beacon(__le16 fc) -{ - return (fc & cpu_to_le16(IEEE80211_FCTL_FTYPE | - IEEE80211_FCTL_STYPE)) == - cpu_to_le16(IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON); -} - -/** - * ieee80211_s1g_has_next_tbtt - check if IEEE80211_S1G_BCN_NEXT_TBTT - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame contains the variable-length - * next TBTT field - */ -static inline bool ieee80211_s1g_has_next_tbtt(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_NEXT_TBTT)); -} - -/** - * ieee80211_s1g_has_ano - check if IEEE80211_S1G_BCN_ANO - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame contains the variable-length - * ANO field - */ -static inline bool ieee80211_s1g_has_ano(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_ANO)); -} - -/** - * ieee80211_s1g_has_cssid - check if IEEE80211_S1G_BCN_CSSID - * @fc: frame control bytes in little-endian byteorder - * Return: whether or not the frame contains the variable-length - * compressed SSID field - */ -static inline bool 
ieee80211_s1g_has_cssid(__le16 fc) -{ - return ieee80211_is_s1g_beacon(fc) && - (fc & cpu_to_le16(IEEE80211_S1G_BCN_CSSID)); -} - /** * ieee80211_is_atim - check if IEEE80211_FTYPE_MGMT && IEEE80211_STYPE_ATIM * @fc: frame control bytes in little-endian byteorder @@ -984,37 +919,6 @@ struct ieee80211_tim_ie { }; } __packed; -/** - * enum ieee80211_s1g_chanwidth - S1G channel widths - * These are defined in IEEE802.11-2016ah Table 10-20 - * as BSS Channel Width - * - * @IEEE80211_S1G_CHANWIDTH_1MHZ: 1MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_2MHZ: 2MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_4MHZ: 4MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_8MHZ: 8MHz operating channel - * @IEEE80211_S1G_CHANWIDTH_16MHZ: 16MHz operating channel - */ -enum ieee80211_s1g_chanwidth { - IEEE80211_S1G_CHANWIDTH_1MHZ = 0, - IEEE80211_S1G_CHANWIDTH_2MHZ = 1, - IEEE80211_S1G_CHANWIDTH_4MHZ = 3, - IEEE80211_S1G_CHANWIDTH_8MHZ = 7, - IEEE80211_S1G_CHANWIDTH_16MHZ = 15, -}; - -/** - * enum ieee80211_s1g_pri_chanwidth - S1G primary channel widths - * described in IEEE80211-2024 Table 10-39. - * - * @IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: 2MHz primary channel - * @IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: 1MHz primary channel - */ -enum ieee80211_s1g_pri_chanwidth { - IEEE80211_S1G_PRI_CHANWIDTH_2MHZ = 0, - IEEE80211_S1G_PRI_CHANWIDTH_1MHZ = 1, -}; - #define WLAN_SA_QUERY_TR_ID_LEN 2 #define WLAN_MEMBERSHIP_LEN 8 #define WLAN_USER_POSITION_LEN 16 @@ -1042,61 +946,6 @@ struct ieee80211_addba_ext_ie { u8 data; } __packed; -/** - * struct ieee80211_s1g_bcn_compat_ie - S1G Beacon Compatibility element - * @compat_info: Compatibility Information - * @beacon_int: Beacon Interval - * @tsf_completion: TSF Completion - * - * This structure represents the payload of the "S1G Beacon - * Compatibility element" as described in IEEE Std 802.11-2020 section - * 9.4.2.196. 
- */ -struct ieee80211_s1g_bcn_compat_ie { - __le16 compat_info; - __le16 beacon_int; - __le32 tsf_completion; -} __packed; - -/** - * struct ieee80211_s1g_oper_ie - S1G Operation element - * @ch_width: S1G Operation Information Channel Width - * @oper_class: S1G Operation Information Operating Class - * @primary_ch: S1G Operation Information Primary Channel Number - * @oper_ch: S1G Operation Information Channel Center Frequency - * @basic_mcs_nss: Basic S1G-MCS and NSS Set - * - * This structure represents the payload of the "S1G Operation - * element" as described in IEEE Std 802.11-2020 section 9.4.2.212. - */ -struct ieee80211_s1g_oper_ie { - u8 ch_width; - u8 oper_class; - u8 primary_ch; - u8 oper_ch; - __le16 basic_mcs_nss; -} __packed; - -/** - * struct ieee80211_aid_response_ie - AID Response element - * @aid: AID/Group AID - * @switch_count: AID Switch Count - * @response_int: AID Response Interval - * - * This structure represents the payload of the "AID Response element" - * as described in IEEE Std 802.11-2020 section 9.4.2.194. - */ -struct ieee80211_aid_response_ie { - __le16 aid; - u8 switch_count; - __le16 response_int; -} __packed; - -struct ieee80211_s1g_cap { - u8 capab_info[10]; - u8 supp_mcs_nss[5]; -} __packed; - struct ieee80211_ext { __le16 frame_control; __le16 duration; @@ -1110,37 +959,6 @@ struct ieee80211_ext { } u; } __packed __aligned(2); -/** - * ieee80211_s1g_optional_len - determine length of optional S1G beacon fields - * @fc: frame control bytes in little-endian byteorder - * Return: total length in bytes of the optional fixed-length fields - * - * S1G beacons may contain up to three optional fixed-length fields that - * precede the variable-length elements. Whether these fields are present - * is indicated by flags in the frame control field. 
- * - * From IEEE 802.11-2024 section 9.3.4.3: - * - Next TBTT field may be 0 or 3 bytes - * - Short SSID field may be 0 or 4 bytes - * - Access Network Options (ANO) field may be 0 or 1 byte - */ -static inline size_t -ieee80211_s1g_optional_len(__le16 fc) -{ - size_t len = 0; - - if (ieee80211_s1g_has_next_tbtt(fc)) - len += 3; - - if (ieee80211_s1g_has_cssid(fc)) - len += 4; - - if (ieee80211_s1g_has_ano(fc)) - len += 1; - - return len; -} - /** * struct ieee80211_bss_load_elem - BSS Load elemen * @@ -1567,98 +1385,6 @@ struct ieee80211_p2p_noa_attr { #define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) #define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F -/* S1G Capabilities Information field */ -#define IEEE80211_S1G_CAPABILITY_LEN 15 - -#define S1G_CAP0_S1G_LONG BIT(0) -#define S1G_CAP0_SGI_1MHZ BIT(1) -#define S1G_CAP0_SGI_2MHZ BIT(2) -#define S1G_CAP0_SGI_4MHZ BIT(3) -#define S1G_CAP0_SGI_8MHZ BIT(4) -#define S1G_CAP0_SGI_16MHZ BIT(5) -#define S1G_CAP0_SUPP_CH_WIDTH GENMASK(7, 6) - -#define S1G_SUPP_CH_WIDTH_2 0 -#define S1G_SUPP_CH_WIDTH_4 1 -#define S1G_SUPP_CH_WIDTH_8 2 -#define S1G_SUPP_CH_WIDTH_16 3 -#define S1G_SUPP_CH_WIDTH_MAX(cap) ((1 << FIELD_GET(S1G_CAP0_SUPP_CH_WIDTH, \ - cap[0])) << 1) - -#define S1G_CAP1_RX_LDPC BIT(0) -#define S1G_CAP1_TX_STBC BIT(1) -#define S1G_CAP1_RX_STBC BIT(2) -#define S1G_CAP1_SU_BFER BIT(3) -#define S1G_CAP1_SU_BFEE BIT(4) -#define S1G_CAP1_BFEE_STS GENMASK(7, 5) - -#define S1G_CAP2_SOUNDING_DIMENSIONS GENMASK(2, 0) -#define S1G_CAP2_MU_BFER BIT(3) -#define S1G_CAP2_MU_BFEE BIT(4) -#define S1G_CAP2_PLUS_HTC_VHT BIT(5) -#define S1G_CAP2_TRAVELING_PILOT GENMASK(7, 6) - -#define S1G_CAP3_RD_RESPONDER BIT(0) -#define S1G_CAP3_HT_DELAYED_BA BIT(1) -#define S1G_CAP3_MAX_MPDU_LEN BIT(2) -#define S1G_CAP3_MAX_AMPDU_LEN_EXP GENMASK(4, 3) -#define S1G_CAP3_MIN_MPDU_START GENMASK(7, 5) - -#define S1G_CAP4_UPLINK_SYNC BIT(0) -#define S1G_CAP4_DYNAMIC_AID BIT(1) -#define S1G_CAP4_BAT BIT(2) -#define S1G_CAP4_TIME_ADE BIT(3) -#define 
S1G_CAP4_NON_TIM BIT(4) -#define S1G_CAP4_GROUP_AID BIT(5) -#define S1G_CAP4_STA_TYPE GENMASK(7, 6) - -#define S1G_CAP5_CENT_AUTH_CONTROL BIT(0) -#define S1G_CAP5_DIST_AUTH_CONTROL BIT(1) -#define S1G_CAP5_AMSDU BIT(2) -#define S1G_CAP5_AMPDU BIT(3) -#define S1G_CAP5_ASYMMETRIC_BA BIT(4) -#define S1G_CAP5_FLOW_CONTROL BIT(5) -#define S1G_CAP5_SECTORIZED_BEAM GENMASK(7, 6) - -#define S1G_CAP6_OBSS_MITIGATION BIT(0) -#define S1G_CAP6_FRAGMENT_BA BIT(1) -#define S1G_CAP6_NDP_PS_POLL BIT(2) -#define S1G_CAP6_RAW_OPERATION BIT(3) -#define S1G_CAP6_PAGE_SLICING BIT(4) -#define S1G_CAP6_TXOP_SHARING_IMP_ACK BIT(5) -#define S1G_CAP6_VHT_LINK_ADAPT GENMASK(7, 6) - -#define S1G_CAP7_TACK_AS_PS_POLL BIT(0) -#define S1G_CAP7_DUP_1MHZ BIT(1) -#define S1G_CAP7_MCS_NEGOTIATION BIT(2) -#define S1G_CAP7_1MHZ_CTL_RESPONSE_PREAMBLE BIT(3) -#define S1G_CAP7_NDP_BFING_REPORT_POLL BIT(4) -#define S1G_CAP7_UNSOLICITED_DYN_AID BIT(5) -#define S1G_CAP7_SECTOR_TRAINING_OPERATION BIT(6) -#define S1G_CAP7_TEMP_PS_MODE_SWITCH BIT(7) - -#define S1G_CAP8_TWT_GROUPING BIT(0) -#define S1G_CAP8_BDT BIT(1) -#define S1G_CAP8_COLOR GENMASK(4, 2) -#define S1G_CAP8_TWT_REQUEST BIT(5) -#define S1G_CAP8_TWT_RESPOND BIT(6) -#define S1G_CAP8_PV1_FRAME BIT(7) - -#define S1G_CAP9_LINK_ADAPT_PER_CONTROL_RESPONSE BIT(0) - -#define S1G_OPER_CH_WIDTH_PRIMARY BIT(0) -#define S1G_OPER_CH_WIDTH_OPER GENMASK(4, 1) -#define S1G_OPER_CH_PRIMARY_LOCATION BIT(5) - -#define S1G_2M_PRIMARY_LOCATION_LOWER 0 -#define S1G_2M_PRIMARY_LOCATION_UPPER 1 - -#define LISTEN_INT_USF GENMASK(15, 14) -#define LISTEN_INT_UI GENMASK(13, 0) - -#define IEEE80211_MAX_USF FIELD_MAX(LISTEN_INT_USF) -#define IEEE80211_MAX_UI FIELD_MAX(LISTEN_INT_UI) - /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2189,20 +1915,6 @@ enum ieee80211_key_len { WLAN_KEY_LEN_BIP_GMAC_256 = 32, }; -enum ieee80211_s1g_actioncode { - WLAN_S1G_AID_SWITCH_REQUEST, - WLAN_S1G_AID_SWITCH_RESPONSE, - WLAN_S1G_SYNC_CONTROL, - 
WLAN_S1G_STA_INFO_ANNOUNCE, - WLAN_S1G_EDCA_PARAM_SET, - WLAN_S1G_EL_OPERATION, - WLAN_S1G_TWT_SETUP, - WLAN_S1G_TWT_TEARDOWN, - WLAN_S1G_SECT_GROUP_ID_LIST, - WLAN_S1G_SECT_ID_FEEDBACK, - WLAN_S1G_TWT_INFORMATION = 11, -}; - /* Radio measurement action codes as defined in IEEE 802.11-2024 - Table 9-470 */ enum ieee80211_radio_measurement_actioncode { WLAN_RM_ACTION_RADIO_MEASUREMENT_REQUEST = 0, @@ -2877,254 +2589,6 @@ static inline bool __ieee80211_check_tim(const struct ieee80211_tim_ie *tim, return !!(tim->virtual_map[index] & mask); } -struct s1g_tim_aid { - u16 aid; - u8 target_blk; /* Target block index */ - u8 target_subblk; /* Target subblock index */ - u8 target_subblk_bit; /* Target subblock bit */ -}; - -struct s1g_tim_enc_block { - u8 enc_mode; - bool inverse; - const u8 *ptr; - u8 len; - - /* - * For an OLB encoded block that spans multiple blocks, this - * is the offset into the span described by that encoded block. - */ - u8 olb_blk_offset; -}; - -/* - * Helper routines to quickly extract the length of an encoded block. Validation - * is also performed to ensure the length extracted lies within the TIM. - */ - -static inline int ieee80211_s1g_len_bitmap(const u8 *ptr, const u8 *end) -{ - u8 blkmap; - u8 n_subblks; - - if (ptr >= end) - return -EINVAL; - - blkmap = *ptr; - n_subblks = hweight8(blkmap); - - if (ptr + 1 + n_subblks > end) - return -EINVAL; - - return 1 + n_subblks; -} - -static inline int ieee80211_s1g_len_single(const u8 *ptr, const u8 *end) -{ - return (ptr + 1 > end) ? -EINVAL : 1; -} - -static inline int ieee80211_s1g_len_olb(const u8 *ptr, const u8 *end) -{ - if (ptr >= end) - return -EINVAL; - - return (ptr + 1 + *ptr > end) ? -EINVAL : 1 + *ptr; -} - -/* - * Enumerate all encoded blocks until we find the encoded block that describes - * our target AID. OLB is a special case as a single encoded block can describe - * multiple blocks as a single encoded block. 
- */ -static inline int ieee80211_s1g_find_target_block(struct s1g_tim_enc_block *enc, - const struct s1g_tim_aid *aid, - const u8 *ptr, const u8 *end) -{ - /* need at least block-control octet */ - while (ptr + 1 <= end) { - u8 ctrl = *ptr++; - u8 mode = ctrl & 0x03; - bool contains, inverse = ctrl & BIT(2); - u8 span, blk_off = ctrl >> 3; - int len; - - switch (mode) { - case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: - len = ieee80211_s1g_len_bitmap(ptr, end); - contains = blk_off == aid->target_blk; - break; - case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: - len = ieee80211_s1g_len_single(ptr, end); - contains = blk_off == aid->target_blk; - break; - case IEEE80211_S1G_TIM_ENC_MODE_OLB: - len = ieee80211_s1g_len_olb(ptr, end); - /* - * An OLB encoded block can describe more then one - * block, meaning an encoded OLB block can span more - * then a single block. - */ - if (len > 0) { - /* Minus one for the length octet */ - span = DIV_ROUND_UP(len - 1, 8); - /* - * Check if our target block lies within the - * block span described by this encoded block. - */ - contains = (aid->target_blk >= blk_off) && - (aid->target_blk < blk_off + span); - } - break; - default: - return -EOPNOTSUPP; - } - - if (len < 0) - return len; - - if (contains) { - enc->enc_mode = mode; - enc->inverse = inverse; - enc->ptr = ptr; - enc->len = (u8)len; - enc->olb_blk_offset = blk_off; - return 0; - } - - ptr += len; - } - - return -ENOENT; -} - -static inline bool ieee80211_s1g_parse_bitmap(struct s1g_tim_enc_block *enc, - struct s1g_tim_aid *aid) -{ - const u8 *ptr = enc->ptr; - u8 blkmap = *ptr++; - - /* - * If our block bitmap does not contain a set bit that corresponds - * to our AID, it could mean a variety of things depending on if - * the encoding mode is inverted or not. - * - * 1. If inverted, it means the entire subblock is present and hence - * our AID has been set. - * 2. If not inverted, it means our subblock is not present and hence - * it is all zero meaning our AID is not set. 
- */ - if (!(blkmap & BIT(aid->target_subblk))) - return enc->inverse; - - /* - * Increment ptr by the number of set subblocks that appear before our - * target subblock. If our target subblock is 0, do nothing as ptr - * already points to our target subblock. - */ - if (aid->target_subblk) - ptr += hweight8(blkmap & GENMASK(aid->target_subblk - 1, 0)); - - return !!(*ptr & BIT(aid->target_subblk_bit)) ^ enc->inverse; -} - -static inline bool ieee80211_s1g_parse_single(struct s1g_tim_enc_block *enc, - struct s1g_tim_aid *aid) -{ - /* - * Single AID mode describes, as the name suggests, a single AID - * within the block described by the encoded block. The octet - * contains the 6 LSBs of the AID described in the block. The other - * 2 bits are reserved. When inversed, every single AID described - * by the current block have buffered traffic except for the AID - * described in the single AID octet. - */ - return ((*enc->ptr & 0x3f) == (aid->aid & 0x3f)) ^ enc->inverse; -} - -static inline bool ieee80211_s1g_parse_olb(struct s1g_tim_enc_block *enc, - struct s1g_tim_aid *aid) -{ - const u8 *ptr = enc->ptr; - u8 blk_len = *ptr++; - /* - * Given an OLB encoded block that describes multiple blocks, - * calculate the offset into the span. Then calculate the - * subblock location normally. - */ - u16 span_offset = aid->target_blk - enc->olb_blk_offset; - u16 subblk_idx = span_offset * 8 + aid->target_subblk; - - if (subblk_idx >= blk_len) - return enc->inverse; - - return !!(ptr[subblk_idx] & BIT(aid->target_subblk_bit)) ^ enc->inverse; -} - -/* - * An S1G PVB has 3 non optional encoding types, each that can be inverted. - * An S1G PVB is constructed with zero or more encoded block subfields. Each - * encoded block represents a single "block" of AIDs (64), and each encoded - * block can contain one of the 3 encoding types alongside a single bit for - * whether the bits should be inverted. 
- * - * As the standard makes no guarantee about the ordering of encoded blocks, - * we must parse every encoded block in the worst case scenario given an - * AID that lies within the last block. - */ -static inline bool ieee80211_s1g_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid) -{ - int err; - struct s1g_tim_aid target_aid; - struct s1g_tim_enc_block enc_blk; - - if (tim_len < 3) - return false; - - target_aid.aid = aid; - target_aid.target_blk = (aid >> 6) & 0x1f; - target_aid.target_subblk = (aid >> 3) & 0x7; - target_aid.target_subblk_bit = aid & 0x7; - - /* - * Find our AIDs target encoded block and fill &enc_blk with the - * encoded blocks information. If no entry is found or an error - * occurs return false. - */ - err = ieee80211_s1g_find_target_block(&enc_blk, &target_aid, - tim->virtual_map, - (const u8 *)tim + tim_len + 2); - if (err) - return false; - - switch (enc_blk.enc_mode) { - case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: - return ieee80211_s1g_parse_bitmap(&enc_blk, &target_aid); - case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: - return ieee80211_s1g_parse_single(&enc_blk, &target_aid); - case IEEE80211_S1G_TIM_ENC_MODE_OLB: - return ieee80211_s1g_parse_olb(&enc_blk, &target_aid); - default: - return false; - } -} - -/** - * ieee80211_check_tim - check if AID bit is set in TIM - * @tim: the TIM IE - * @tim_len: length of the TIM IE - * @aid: the AID to look for - * @s1g: whether the TIM is from an S1G PPDU - * Return: whether or not traffic is indicated in the TIM for the given AID - */ -static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, - u8 tim_len, u16 aid, bool s1g) -{ - return s1g ? 
ieee80211_s1g_check_tim(tim, tim_len, aid) : - __ieee80211_check_tim(tim, tim_len, aid); -} - /** * ieee80211_get_tdls_action - get TDLS action code * @skb: the skb containing the frame, length will not be checked @@ -3258,39 +2722,6 @@ static inline bool ieee80211_is_ftm(struct sk_buff *skb) return false; } -/** - * ieee80211_is_s1g_short_beacon - check if frame is an S1G short beacon - * @fc: frame control bytes in little-endian byteorder - * @variable: pointer to the beacon frame elements - * @variable_len: length of the frame elements - * Return: whether or not the frame is an S1G short beacon. As per - * IEEE80211-2024 11.1.3.10.1, The S1G beacon compatibility element shall - * always be present as the first element in beacon frames generated at a - * TBTT (Target Beacon Transmission Time), so any frame not containing - * this element must have been generated at a TSBTT (Target Short Beacon - * Transmission Time) that is not a TBTT. Additionally, short beacons are - * prohibited from containing the S1G beacon compatibility element as per - * IEEE80211-2024 9.3.4.3 Table 9-76, so if we have an S1G beacon with - * either no elements or the first element is not the beacon compatibility - * element, we have a short beacon. - */ -static inline bool ieee80211_is_s1g_short_beacon(__le16 fc, const u8 *variable, - size_t variable_len) -{ - if (!ieee80211_is_s1g_beacon(fc)) - return false; - - /* - * If the frame does not contain at least 1 element (this is perfectly - * valid in a short beacon) and is an S1G beacon, we have a short - * beacon. 
- */ - if (variable_len < 2) - return true; - - return variable[0] != WLAN_EID_S1G_BCN_COMPAT; -} - struct element { u8 id; u8 datalen; @@ -3446,5 +2877,21 @@ struct ieee80211_tbtt_info_ge_11 { #include "ieee80211-he.h" #include "ieee80211-eht.h" #include "ieee80211-mesh.h" +#include "ieee80211-s1g.h" + +/** + * ieee80211_check_tim - check if AID bit is set in TIM + * @tim: the TIM IE + * @tim_len: length of the TIM IE + * @aid: the AID to look for + * @s1g: whether the TIM is from an S1G PPDU + * Return: whether or not traffic is indicated in the TIM for the given AID + */ +static inline bool ieee80211_check_tim(const struct ieee80211_tim_ie *tim, + u8 tim_len, u16 aid, bool s1g) +{ + return s1g ? ieee80211_s1g_check_tim(tim, tim_len, aid) : + __ieee80211_check_tim(tim, tim_len, aid); +} #endif /* LINUX_IEEE80211_H */ -- cgit v1.2.3 From fcd42b909ba06737dfcda47f3a0a9718bd3ebf03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:55 +0100 Subject: wifi: ieee80211: split P2P definitions out The ieee80211.h file has gotten very long, continue splitting it by putting P2P definitions into a separate file. Note that P2P isn't really even IEEE 802.11 but WFA. 
Link: https://patch.msgid.link/20251105153843.e47b2614e9d2.Id242f61da720e365f6b5d7a4a545fbbc2f1e92b4@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-p2p.h | 71 +++++++++++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 53 +------------------------------- 2 files changed, 72 insertions(+), 52 deletions(-) create mode 100644 include/linux/ieee80211-p2p.h (limited to 'include') diff --git a/include/linux/ieee80211-p2p.h b/include/linux/ieee80211-p2p.h new file mode 100644 index 000000000000..180891c11f08 --- /dev/null +++ b/include/linux/ieee80211-p2p.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * WFA P2P definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_P2P_H +#define LINUX_IEEE80211_P2P_H + +#include <linux/types.h> +/* + * Peer-to-Peer IE attribute related definitions. + */ +/* + * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute. 
+ */ +enum ieee80211_p2p_attr_id { + IEEE80211_P2P_ATTR_STATUS = 0, + IEEE80211_P2P_ATTR_MINOR_REASON, + IEEE80211_P2P_ATTR_CAPABILITY, + IEEE80211_P2P_ATTR_DEVICE_ID, + IEEE80211_P2P_ATTR_GO_INTENT, + IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT, + IEEE80211_P2P_ATTR_LISTEN_CHANNEL, + IEEE80211_P2P_ATTR_GROUP_BSSID, + IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING, + IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR, + IEEE80211_P2P_ATTR_MANAGABILITY, + IEEE80211_P2P_ATTR_CHANNEL_LIST, + IEEE80211_P2P_ATTR_ABSENCE_NOTICE, + IEEE80211_P2P_ATTR_DEVICE_INFO, + IEEE80211_P2P_ATTR_GROUP_INFO, + IEEE80211_P2P_ATTR_GROUP_ID, + IEEE80211_P2P_ATTR_INTERFACE, + IEEE80211_P2P_ATTR_OPER_CHANNEL, + IEEE80211_P2P_ATTR_INVITE_FLAGS, + /* 19 - 220: Reserved */ + IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221, + + IEEE80211_P2P_ATTR_MAX +}; + +/* Notice of Absence attribute - described in P2P spec 4.1.14 */ +/* Typical max value used here */ +#define IEEE80211_P2P_NOA_DESC_MAX 4 + +struct ieee80211_p2p_noa_desc { + u8 count; + __le32 duration; + __le32 interval; + __le32 start_time; +} __packed; + +struct ieee80211_p2p_noa_attr { + u8 index; + u8 oppps_ctwindow; + struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX]; +} __packed; + +#define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) +#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F + +#endif /* LINUX_IEEE80211_P2P_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 1b27bbac145b..fa0f7f917ce7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1333,58 +1333,6 @@ struct ieee80211_tdls_data { } u; } __packed; -/* - * Peer-to-Peer IE attribute related definitions. - */ -/* - * enum ieee80211_p2p_attr_id - identifies type of peer-to-peer attribute. 
- */ -enum ieee80211_p2p_attr_id { - IEEE80211_P2P_ATTR_STATUS = 0, - IEEE80211_P2P_ATTR_MINOR_REASON, - IEEE80211_P2P_ATTR_CAPABILITY, - IEEE80211_P2P_ATTR_DEVICE_ID, - IEEE80211_P2P_ATTR_GO_INTENT, - IEEE80211_P2P_ATTR_GO_CONFIG_TIMEOUT, - IEEE80211_P2P_ATTR_LISTEN_CHANNEL, - IEEE80211_P2P_ATTR_GROUP_BSSID, - IEEE80211_P2P_ATTR_EXT_LISTEN_TIMING, - IEEE80211_P2P_ATTR_INTENDED_IFACE_ADDR, - IEEE80211_P2P_ATTR_MANAGABILITY, - IEEE80211_P2P_ATTR_CHANNEL_LIST, - IEEE80211_P2P_ATTR_ABSENCE_NOTICE, - IEEE80211_P2P_ATTR_DEVICE_INFO, - IEEE80211_P2P_ATTR_GROUP_INFO, - IEEE80211_P2P_ATTR_GROUP_ID, - IEEE80211_P2P_ATTR_INTERFACE, - IEEE80211_P2P_ATTR_OPER_CHANNEL, - IEEE80211_P2P_ATTR_INVITE_FLAGS, - /* 19 - 220: Reserved */ - IEEE80211_P2P_ATTR_VENDOR_SPECIFIC = 221, - - IEEE80211_P2P_ATTR_MAX -}; - -/* Notice of Absence attribute - described in P2P spec 4.1.14 */ -/* Typical max value used here */ -#define IEEE80211_P2P_NOA_DESC_MAX 4 - -struct ieee80211_p2p_noa_desc { - u8 count; - __le32 duration; - __le32 interval; - __le32 start_time; -} __packed; - -struct ieee80211_p2p_noa_attr { - u8 index; - u8 oppps_ctwindow; - struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX]; -} __packed; - -#define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) -#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F - /* Authentication algorithms */ #define WLAN_AUTH_OPEN 0 #define WLAN_AUTH_SHARED_KEY 1 @@ -2878,6 +2826,7 @@ struct ieee80211_tbtt_info_ge_11 { #include "ieee80211-eht.h" #include "ieee80211-mesh.h" #include "ieee80211-s1g.h" +#include "ieee80211-p2p.h" /** * ieee80211_check_tim - check if AID bit is set in TIM -- cgit v1.2.3 From 60a3734192fa6909c48e33b0d212990ebaff54c4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:36:56 +0100 Subject: wifi: ieee80211: split NAN definitions out The ieee80211.h file has gotten very long, continue splitting it by putting NAN definitions into a separate file. Note that NAN isn't really even IEEE 802.11 but WFA. 
Link: https://patch.msgid.link/20251105153843.8da0e796dda2.I7b2ce11220b70e8794019501eabbf8afbaf431a6@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-nan.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/ieee80211.h | 18 +----------------- 2 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 include/linux/ieee80211-nan.h (limited to 'include') diff --git a/include/linux/ieee80211-nan.h b/include/linux/ieee80211-nan.h new file mode 100644 index 000000000000..d07959bf8a90 --- /dev/null +++ b/include/linux/ieee80211-nan.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * WFA NAN definitions + * + * Copyright (c) 2001-2002, SSH Communications Security Corp and Jouni Malinen + * + * Copyright (c) 2002-2003, Jouni Malinen + * Copyright (c) 2005, Devicescape Software, Inc. + * Copyright (c) 2006, Michael Wu + * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH + * Copyright (c) 2018 - 2025 Intel Corporation + */ + +#ifndef LINUX_IEEE80211_NAN_H +#define LINUX_IEEE80211_NAN_H + +/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ +#define NAN_OP_MODE_PHY_MODE_VHT 0x01 +#define NAN_OP_MODE_PHY_MODE_HE 0x10 +#define NAN_OP_MODE_PHY_MODE_MASK 0x11 +#define NAN_OP_MODE_80P80MHZ 0x02 +#define NAN_OP_MODE_160MHZ 0x04 +#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 + +/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification + * Table 79 + */ +#define NAN_DEV_CAPA_DFS_OWNER 0x01 +#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 +#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 +#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 +#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 + +#endif /* LINUX_IEEE80211_NAN_H */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index fa0f7f917ce7..48ce05e1d203 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -2803,23 +2803,6 @@ struct ieee80211_tbtt_info_ge_11 { struct 
ieee80211_rnr_mld_params mld_params; } __packed; -/* NAN operation mode, as defined in Wi-Fi Aware (TM) specification Table 81 */ -#define NAN_OP_MODE_PHY_MODE_VHT 0x01 -#define NAN_OP_MODE_PHY_MODE_HE 0x10 -#define NAN_OP_MODE_PHY_MODE_MASK 0x11 -#define NAN_OP_MODE_80P80MHZ 0x02 -#define NAN_OP_MODE_160MHZ 0x04 -#define NAN_OP_MODE_PNDL_SUPPRTED 0x08 - -/* NAN Device capabilities, as defined in Wi-Fi Aware (TM) specification - * Table 79 - */ -#define NAN_DEV_CAPA_DFS_OWNER 0x01 -#define NAN_DEV_CAPA_EXT_KEY_ID_SUPPORTED 0x02 -#define NAN_DEV_CAPA_SIM_NDP_RX_SUPPORTED 0x04 -#define NAN_DEV_CAPA_NDPE_SUPPORTED 0x08 -#define NAN_DEV_CAPA_S3_SUPPORTED 0x10 - #include "ieee80211-ht.h" #include "ieee80211-vht.h" #include "ieee80211-he.h" @@ -2827,6 +2810,7 @@ struct ieee80211_tbtt_info_ge_11 { #include "ieee80211-mesh.h" #include "ieee80211-s1g.h" #include "ieee80211-p2p.h" +#include "ieee80211-nan.h" /** * ieee80211_check_tim - check if AID bit is set in TIM -- cgit v1.2.3 From 30b6089aad35500e683025dddc029ac28705385d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:39:57 +0100 Subject: wifi: cfg80211: fix EHT typo This is clearly EHT, not ETH, fix the typo. 
Link: https://patch.msgid.link/20251105153958.e9d4af3b768e.I5f3378326837e3f62928a2f1fd3403f29cea069b@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f2e8963cfaac..84be0cdd1da0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -685,7 +685,7 @@ ieee80211_get_he_6ghz_capa(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_eht_iftype_cap - return ETH capabilities for an sband's iftype + * ieee80211_get_eht_iftype_cap - return EHT capabilities for an sband's iftype * @sband: the sband to search for the iftype on * @iftype: enum nl80211_iftype * -- cgit v1.2.3 From 1a1cad924e8a60252132446fbba1284035010b4f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 15:39:58 +0100 Subject: wifi: mac80211: fix EHT typo This is clearly EHT, not ETH, fix the typo. Link: https://patch.msgid.link/20251105153958.12a04517f7ec.Idcf800817fa30605b1002c3d2287cad016e7aea7@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index c326243e1f01..c2e49542626c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7223,7 +7223,7 @@ ieee80211_get_he_6ghz_capa_vif(const struct ieee80211_supported_band *sband, } /** - * ieee80211_get_eht_iftype_cap_vif - return ETH capabilities for sband/vif + * ieee80211_get_eht_iftype_cap_vif - return EHT capabilities for sband/vif * @sband: the sband to search for the iftype on * @vif: the vif to get the iftype from * -- cgit v1.2.3 From 68eb1b791ac8da7c3d03967143f1417e2978bf5e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Nov 2025 16:08:10 +0100 Subject: wifi: mac80211: pass frame type to element parsing This will be needed for UHR operation parsing, and we already pass whether or 
not the frame is an action frame, replace that by the full type. Note this fixes a few cases where 'false' was erroneously passed (mesh and TDLS) and removes ieee802_11_parse_elems_crc() as it's unused. Link: https://patch.msgid.link/20251105160810.a476d20a6e01.Ie659535f9357f2f9a3c73f8c059ccfc96bf93b54@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 48ce05e1d203..6d4bc80caf96 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -43,6 +43,7 @@ #define IEEE80211_FCTL_VERS 0x0003 #define IEEE80211_FCTL_FTYPE 0x000c #define IEEE80211_FCTL_STYPE 0x00f0 +#define IEEE80211_FCTL_TYPE (IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE) #define IEEE80211_FCTL_TODS 0x0100 #define IEEE80211_FCTL_FROMDS 0x0200 #define IEEE80211_FCTL_MOREFRAGS 0x0400 -- cgit v1.2.3 From 473235677af46ecb167917887586646e9d70d9ff Mon Sep 17 00:00:00 2001 From: Chien Wong Date: Fri, 7 Nov 2025 22:23:32 +0800 Subject: wifi: cfg80211: fix doc of struct key_params The seq in struct key_params is for many ciphers, including CCMP, GCMP, CMAC, GMAC. In addition to get_key(), it is also used when setting keys. Signed-off-by: Chien Wong Link: https://patch.msgid.link/20251107142332.181308-1-m@xv97.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 84be0cdd1da0..d87c18e1b133 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -786,8 +786,7 @@ struct vif_params { * @key: key material * @key_len: length of key material * @cipher: cipher suite selector - * @seq: sequence counter (IV/PN) for TKIP and CCMP keys, only used - * with the get_key() callback, must be in little endian, + * @seq: sequence counter (IV/PN), must be in little endian, * length given by @seq_len. 
* @seq_len: length of @seq. * @vlan_id: vlan_id for VLAN group key (if nonzero) -- cgit v1.2.3 From 1de3d9e2cd3a3c6f503cd31ec1f552c9dd8cf8ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Thu, 6 Nov 2025 22:16:01 +0100 Subject: dt-bindings: clock: r8a779a0: Add ZG core clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the core clock used by the GPU on the Renesas R-Car V3U (R8A779A0) SoC. Signed-off-by: Niklas Söderlund Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251106211604.2766465-2-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/r8a779a0-cpg-mssr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/r8a779a0-cpg-mssr.h b/include/dt-bindings/clock/r8a779a0-cpg-mssr.h index f1d737ca7ca1..124a6b8856df 100644 --- a/include/dt-bindings/clock/r8a779a0-cpg-mssr.h +++ b/include/dt-bindings/clock/r8a779a0-cpg-mssr.h @@ -51,5 +51,6 @@ #define R8A779A0_CLK_CBFUSA 40 #define R8A779A0_CLK_R 41 #define R8A779A0_CLK_OSC 42 +#define R8A779A0_CLK_ZG 43 #endif /* __DT_BINDINGS_CLOCK_R8A779A0_CPG_MSSR_H__ */ -- cgit v1.2.3 From aaa5abcc9d44d2c8484f779ab46d242d774cabcb Mon Sep 17 00:00:00 2001 From: Carl Worth Date: Thu, 25 Sep 2025 18:42:31 +0800 Subject: coresight: tmc: add the handle of the event to the path The handle is essential for retrieving the AUX_EVENT of each CPU and is required in perf mode. It has been added to the coresight_path so that dependent devices can access it from the path when needed. 
The existing bug can be reproduced with: perf record -e cs_etm//k -C 0-9 dd if=/dev/zero of=/dev/null Showing an oops as follows: Unable to handle kernel paging request at virtual address 000f6e84934ed19e Call trace: tmc_etr_get_buffer+0x30/0x80 [coresight_tmc] (P) catu_enable_hw+0xbc/0x3d0 [coresight_catu] catu_enable+0x70/0xe0 [coresight_catu] coresight_enable_path+0xb0/0x258 [coresight] Fixes: 080ee83cc361 ("Coresight: Change functions to accept the coresight_path") Signed-off-by: Carl Worth Reviewed-by: Leo Yan Co-developed-by: Jie Gan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-1-edd8a07c1646@oss.qualcomm.com --- include/linux/coresight.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 6de59ce8ef8c..2626105e3719 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -332,12 +332,14 @@ static struct coresight_dev_list (var) = { \ /** * struct coresight_path - data needed by enable/disable path - * @path_list: path from source to sink. - * @trace_id: trace_id of the whole path. + * @path_list: path from source to sink. + * @trace_id: trace_id of the whole path. + * @handle: handle of the aux_event. */ struct coresight_path { - struct list_head path_list; - u8 trace_id; + struct list_head path_list; + u8 trace_id; + struct perf_output_handle *handle; }; enum cs_mode { -- cgit v1.2.3 From 94baedb51dea4b0c97e3c9acd90953bec98d03e7 Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 25 Sep 2025 18:42:32 +0800 Subject: coresight: change helper_ops to accept coresight_path Update the helper_enable and helper_disable functions to accept coresight_path instead of a generic void *data, as coresight_path encapsulates all the necessary data required by devices along the path. 
Tested-by: Carl Worth Reviewed-by: Carl Worth Reviewed-by: Leo Yan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-2-edd8a07c1646@oss.qualcomm.com --- include/linux/coresight.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2626105e3719..2bee2e3bb1c6 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -424,8 +424,9 @@ struct coresight_ops_source { */ struct coresight_ops_helper { int (*enable)(struct coresight_device *csdev, enum cs_mode mode, - void *data); - int (*disable)(struct coresight_device *csdev, void *data); + struct coresight_path *path); + int (*disable)(struct coresight_device *csdev, + struct coresight_path *path); }; -- cgit v1.2.3 From b139702a889692ec30702534ebb1ae2b11ed1cbf Mon Sep 17 00:00:00 2001 From: Jie Gan Date: Thu, 25 Sep 2025 18:42:33 +0800 Subject: coresight: change the sink_ops to accept coresight_path Update the sink_enable functions to accept coresight_path instead of a generic void *data, as coresight_path encapsulates all the necessary data required by devices along the path. 
Tested-by: Carl Worth Reviewed-by: Carl Worth Reviewed-by: Leo Yan Signed-off-by: Jie Gan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20250925-fix_helper_data-v2-3-edd8a07c1646@oss.qualcomm.com --- include/linux/coresight.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2bee2e3bb1c6..56d0108658db 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -367,7 +367,7 @@ enum cs_mode { */ struct coresight_ops_sink { int (*enable)(struct coresight_device *csdev, enum cs_mode mode, - void *data); + struct coresight_path *path); int (*disable)(struct coresight_device *csdev); void *(*alloc_buffer)(struct coresight_device *csdev, struct perf_event *event, void **pages, -- cgit v1.2.3 From 772ada50282b0c80343c8989147db816961f571d Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 10 Nov 2025 11:43:27 +0000 Subject: ASoC: cs35l56: Alter error codes for calibration routine Adjust the error codes returned by the calibration debugfs files to provide a wider range of errors and make them more consistent. There is a limited number of standard errors and it's not always easy to find an error code that accurately describes what happened. Additionally, user code often uses strerror() or something similar to report a generic error description. The original calibration code used a limited set of errors to attempt to avoid user error strings that would be confusing or unclear on a file read/write. However, this restricts the ability to provide informative errors. This limited error range didn't help very much with debugging so it has been expanded, rather than worrying about what strerror() would return. The errors are now more consistent: ENXIO Calibration is not supported by the driver. EOPNOTSUPP The given calibration command is not supported. EBUSY Cannot calibrate because the amp is playing audio. 
ERANGE Calibration result was out-of-range. ETIMEDOUT Calibration did not complete. EMSGSIZE Blob written to cal_data is the wrong size. ENODATA No calibration data available to read from cal_data, or Blob written to cal_data does not contain calibration, or No calibration data available to save to UEFI. EIO General failure to communicate with the firmware, mainly indicating that firmware controls are missing. EINVAL Has its normal meaning that an invalid argument was passed. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251110114327.84370-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 82559be0f249..0a740a99ad31 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -265,6 +265,9 @@ #define CS35L56_PS3_POLL_US 500 #define CS35L56_PS3_TIMEOUT_US 300000 +#define CS35L56_CAL_STATUS_SUCCESS 1 +#define CS35L56_CAL_STATUS_OUT_OF_RANGE 3 + #define CS35L56_CONTROL_PORT_READY_US 2200 #define CS35L56_HALO_STATE_POLL_US 1000 #define CS35L56_HALO_STATE_TIMEOUT_US 250000 -- cgit v1.2.3 From 22ea7b9d96e26147b7a3ea1be7aa106cc700907c Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:11 +0100 Subject: platform/x86: asus-wmi: export symbols used for read/write WMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export symbols for reading/writing WMI symbols using a namespace. Existing functions: - asus_wmi_evaluate_method - asus_wmi_set_devstate New function: - asus_wmi_get_devstate_dsts The new function is intended for use with DSTS WMI method only and avoids requiring the asus_wmi driver data to select the WMI method. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. 
Jones Reviewed-by: Mario Limonciello Link: https://patch.msgid.link/20251102215319.3126879-2-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 8a515179113d..dbd44d9fbb6f 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -166,6 +166,7 @@ enum asus_ally_mcu_hack { #if IS_REACHABLE(CONFIG_ASUS_WMI) void set_ally_mcu_hack(enum asus_ally_mcu_hack status); void set_ally_mcu_powersave(bool enabled); +int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval); int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval); int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval); #else @@ -179,6 +180,10 @@ static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) { return -ENODEV; } +static inline int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) +{ + return -ENODEV; +} static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) { -- cgit v1.2.3 From 0d9a7085ec24983b5b8a4971c0eb6c756c75f1bf Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Sat, 8 Nov 2025 00:05:16 +0100 Subject: scsi: ufs: core: Convert string descriptor format macros to enum Convert SD_ASCII_STD and SD_RAW from boolean macros to enum values for improved code readability. This makes ufshcd_read_string_desc() calls self-documenting by using explicit enum values instead of true/false. Move the ufshcd_read_string_desc() declaration from include/ufs/ufshcd.h to drivers/ufs/core/ufshcd-priv.h since this function is not exported. 
Co-developed-by: Bart Van Assche Signed-off-by: Bart Van Assche Suggested-by: Avri Altman Signed-off-by: Bean Huo Link: https://patch.msgid.link/20251107230518.4060231-2-beanhuo@iokpp.de Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 9425cfd9d00e..b4eb2fa58552 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -1431,10 +1431,6 @@ static inline int ufshcd_disable_host_tx_lcc(struct ufs_hba *hba) void ufshcd_auto_hibern8_update(struct ufs_hba *hba, u32 ahit); void ufshcd_fixup_dev_quirks(struct ufs_hba *hba, const struct ufs_dev_quirk *fixups); -#define SD_ASCII_STD true -#define SD_RAW false -int ufshcd_read_string_desc(struct ufs_hba *hba, u8 desc_index, - u8 **buf, bool ascii); void ufshcd_hold(struct ufs_hba *hba); void ufshcd_release(struct ufs_hba *hba); -- cgit v1.2.3 From b06b8c421485e0e96d7fd6aa614fb0b6f2778a03 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Sat, 8 Nov 2025 00:05:18 +0100 Subject: scsi: ufs: core: Add OP-TEE based RPMB driver for UFS devices Add OP-TEE based RPMB support for UFS devices. This enables secure RPMB operations on UFS devices through OP-TEE, providing the same functionality available for eMMC devices and extending kernel-based secure storage support to UFS-based systems. 
Benefits of OP-TEE based RPMB implementation: - Eliminates dependency on userspace supplicant for RPMB access - Enables early boot secure storage access (e.g., fTPM, secure UEFI variables) - Provides kernel-level RPMB access as soon as UFS driver is initialized - Removes complex initramfs dependencies and boot ordering requirements - Ensures reliable and deterministic secure storage operations - Supports both built-in and modular fTPM configurations [mkp: make this build as a module] Co-developed-by: Can Guo Signed-off-by: Can Guo Reviewed-by: Avri Altman Reviewed-by: Bart Van Assche Signed-off-by: Bean Huo Link: https://patch.msgid.link/20251107230518.4060231-4-beanhuo@iokpp.de Signed-off-by: Martin K. Petersen --- include/ufs/ufs.h | 5 +++++ include/ufs/ufshcd.h | 7 +++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index 245a6a829ce9..ab8f6c07b5a2 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -651,6 +651,11 @@ struct ufs_dev_info { u8 rtt_cap; /* bDeviceRTTCap */ bool hid_sup; + + /* Unique device ID string (manufacturer+model+serial+version+date) */ + char *device_id; + u8 rpmb_io_size; + u8 rpmb_region_size[4]; }; #endif /* End of Header */ diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index b4eb2fa58552..7efef792c269 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -826,6 +826,7 @@ enum ufshcd_mcq_opr { * @host: Scsi_Host instance of the driver * @dev: device handle * @ufs_device_wlun: WLUN that controls the entire UFS device. + * @ufs_rpmb_wlun: RPMB WLUN SCSI device * @hwmon_device: device instance registered with the hwmon core. * @curr_dev_pwr_mode: active UFS device power mode. * @uic_link_state: active state of the link to the UFS device. 
@@ -941,8 +942,8 @@ enum ufshcd_mcq_opr { * @pm_qos_mutex: synchronizes PM QoS request and status updates * @critical_health_count: count of critical health exceptions * @dev_lvl_exception_count: count of device level exceptions since last reset - * @dev_lvl_exception_id: vendor specific information about the - * device level exception event. + * @dev_lvl_exception_id: vendor specific information about the device level exception event. + * @rpmbs: list of OP-TEE RPMB devices (one per RPMB region) */ struct ufs_hba { void __iomem *mmio_base; @@ -960,6 +961,7 @@ struct ufs_hba { struct Scsi_Host *host; struct device *dev; struct scsi_device *ufs_device_wlun; + struct scsi_device *ufs_rpmb_wlun; #ifdef CONFIG_SCSI_UFS_HWMON struct device *hwmon_device; @@ -1117,6 +1119,7 @@ struct ufs_hba { int critical_health_count; atomic_t dev_lvl_exception_count; u64 dev_lvl_exception_id; + struct list_head rpmbs; }; /** -- cgit v1.2.3 From 8989d328dfe7c7a3b9f4b9f0ef60006d277f81cc Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:38 +0100 Subject: net: Helper to move packet data and metadata after skb_push/pull Lay groundwork for fixing BPF helpers available to TC(X) programs. When skb_push() or skb_pull() is called in a TC(X) ingress BPF program, the skb metadata must be kept in front of the MAC header. Otherwise, BPF programs using the __sk_buff->data_meta pseudo-pointer lose access to it. Introduce a helper that moves both metadata and a specified number of packet data bytes together, suitable as a drop-in replacement for memmove(). 
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-1-5ceb08a9b37b@cloudflare.com --- include/linux/skbuff.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a7cc3d1f4fd1..ff90281ddf90 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4564,6 +4564,81 @@ static inline void skb_metadata_clear(struct sk_buff *skb) skb_metadata_set(skb, 0); } +/** + * skb_data_move - Move packet data and metadata after skb_push() or skb_pull(). + * @skb: packet to operate on + * @len: number of bytes pushed or pulled from &sk_buff->data + * @n: number of bytes to memmove() from pre-push/pull &sk_buff->data + * + * Moves @n bytes of packet data, can be zero, and all bytes of skb metadata. + * + * Assumes metadata is located immediately before &sk_buff->data prior to the + * push/pull, and that sufficient headroom exists to hold it after an + * skb_push(). Otherwise, metadata is cleared and a one-time warning is issued. + * + * Prefer skb_postpull_data_move() or skb_postpush_data_move() to calling this + * helper directly. + */ +static inline void skb_data_move(struct sk_buff *skb, const int len, + const unsigned int n) +{ + const u8 meta_len = skb_metadata_len(skb); + u8 *meta, *meta_end; + + if (!len || (!n && !meta_len)) + return; + + if (!meta_len) + goto no_metadata; + + meta_end = skb_metadata_end(skb); + meta = meta_end - meta_len; + + if (WARN_ON_ONCE(meta_end + len != skb->data || + meta_len > skb_headroom(skb))) { + skb_metadata_clear(skb); + goto no_metadata; + } + + memmove(meta + len, meta, meta_len + n); + return; + +no_metadata: + memmove(skb->data, skb->data - len, n); +} + +/** + * skb_postpull_data_move - Move packet data and metadata after skb_pull(). 
+ * @skb: packet to operate on + * @len: number of bytes pulled from &sk_buff->data + * @n: number of bytes to memmove() from pre-pull &sk_buff->data + * + * See skb_data_move() for details. + */ +static inline void skb_postpull_data_move(struct sk_buff *skb, + const unsigned int len, + const unsigned int n) +{ + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_data_move(skb, len, n); +} + +/** + * skb_postpush_data_move - Move packet data and metadata after skb_push(). + * @skb: packet to operate on + * @len: number of bytes pushed onto &sk_buff->data + * @n: number of bytes to memmove() from pre-push &sk_buff->data + * + * See skb_data_move() for details. + */ +static inline void skb_postpush_data_move(struct sk_buff *skb, + const unsigned int len, + const unsigned int n) +{ + DEBUG_NET_WARN_ON_ONCE(len > INT_MAX); + skb_data_move(skb, -len, n); +} + struct sk_buff *skb_clone_sk(struct sk_buff *skb); #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING -- cgit v1.2.3 From f38499ff45f567c932d0911e6a30b8ca022b9b52 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:40 +0100 Subject: bpf: Unclone skb head on bpf_dynptr_write to skb metadata Currently bpf_dynptr_from_skb_meta() marks the dynptr as read-only when the skb is cloned, preventing writes to metadata. Remove this restriction and unclone the skb head on bpf_dynptr_write() to metadata, now that the metadata is preserved during uncloning. This makes metadata dynptr consistent with skb dynptr, allowing writes regardless of whether the skb is cloned. 
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-3-5ceb08a9b37b@cloudflare.com --- include/linux/filter.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index e116de7edc58..a104b3994230 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1781,6 +1781,8 @@ int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags); void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, @@ -1817,6 +1819,13 @@ static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, voi { } +static inline int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, + u64 flags) +{ + return -EOPNOTSUPP; +} + static inline void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) { return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From b85be58e2f7cff47f7477ae61022644a198ee592 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:41 +0100 Subject: vlan: Make vlan_remove_tag return nothing All callers ignore the return value. Prepare to reorder memmove() after skb_pull() which is a common pattern. 
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-4-5ceb08a9b37b@cloudflare.com --- include/linux/if_vlan.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 15e01935d3fa..afa5cc61a0fa 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -731,10 +731,8 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, * * Expects the skb to contain a VLAN tag in the payload, and to have skb->data * pointing at the MAC header. - * - * Returns: a new pointer to skb->data, or NULL on failure to pull. */ -static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) +static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) { struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); @@ -742,7 +740,7 @@ static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); - return __skb_pull(skb, VLAN_HLEN); + __skb_pull(skb, VLAN_HLEN); } /** -- cgit v1.2.3 From efd35c26239bed39193201e958d65e695231ccda Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:42 +0100 Subject: bpf: Make bpf_skb_vlan_pop helper metadata-safe Use the metadata-aware helper to move packet bytes after skb_pull(), ensuring metadata remains valid after calling the BPF helper. 
Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-5-5ceb08a9b37b@cloudflare.com --- include/linux/if_vlan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index afa5cc61a0fa..4ecc2509b0d4 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -738,9 +738,9 @@ static inline void vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci) *vlan_tci = ntohs(vhdr->h_vlan_TCI); - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); vlan_set_encap_proto(skb, vhdr); __skb_pull(skb, VLAN_HLEN); + skb_postpull_data_move(skb, VLAN_HLEN, 2 * ETH_ALEN); } /** -- cgit v1.2.3 From 55ffc98b44d28e13a218306666d16f2c7236d0ae Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 5 Nov 2025 21:19:43 +0100 Subject: bpf: Make bpf_skb_vlan_push helper metadata-safe Use the metadata-aware helper to move packet bytes after skb_push(), ensuring metadata remains valid after calling the BPF helper. Also, take care to reserve sufficient headroom for metadata to fit. Signed-off-by: Jakub Sitnicki Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251105-skb-meta-rx-path-v4-6-5ceb08a9b37b@cloudflare.com --- include/linux/if_vlan.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 4ecc2509b0d4..f7f34eb15e06 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -355,16 +355,17 @@ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { + const u8 meta_len = mac_len > ETH_TLEN ? skb_metadata_len(skb) : 0; struct vlan_ethhdr *veth; - if (skb_cow_head(skb, VLAN_HLEN) < 0) + if (skb_cow_head(skb, meta_len + VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. 
*/ if (likely(mac_len > ETH_TLEN)) - memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); + skb_postpush_data_move(skb, VLAN_HLEN, mac_len - ETH_TLEN); if (skb_mac_header_was_set(skb)) skb->mac_header -= VLAN_HLEN; -- cgit v1.2.3 From 15f295f55656658e65bdbc9b901d6b2e49d68d72 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Fri, 7 Nov 2025 11:56:31 +0800 Subject: net/smc: bpf: Introduce generic hook for handshake flow The introduction of IPPROTO_SMC enables eBPF programs to determine whether to use SMC based on the context of socket creation, such as network namespaces, PID and comm name, etc. As a subsequent enhancement, introduce a new generic hook that allows decisions on whether to use SMC or not at runtime, including but not limited to local/remote IP address or ports. Users can write their own implementation via bpf_struct_ops now to choose whether to use SMC or not before the TCP 3rd handshake is completed. Signed-off-by: D. Wythe Signed-off-by: Martin KaFai Lau Reviewed-by: Dust Li Link: https://patch.msgid.link/20251107035632.115950-3-alibuda@linux.alibaba.com --- include/net/netns/smc.h | 3 +++ include/net/smc.h | 53 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'include') diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 6ceb12baec24..ed24c9f638ee 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,6 +17,9 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + struct smc_hs_ctrl __rcu *hs_ctrl; +#endif /* CONFIG_SMC_HS_CTRL_BPF */ unsigned int sysctl_autocorking_size; unsigned int sysctl_smcr_buf_type; int sysctl_smcr_testlink_time; diff --git a/include/net/smc.h b/include/net/smc.h index 08bee529ed8d..bfdc4c41f019 100644 --- a/include/net/smc.h +++ b/include/net/smc.h @@ -17,6 +17,8 @@ #include #include +struct tcp_sock; +struct inet_request_sock; struct sock; #define SMC_MAX_PNETID_LEN
16 /* Max. length of PNET id */ @@ -50,4 +52,55 @@ struct smcd_dev { u8 going_away : 1; }; +#define SMC_HS_CTRL_NAME_MAX 16 + +enum { + /* ops can be inherit from init_net */ + SMC_HS_CTRL_FLAG_INHERITABLE = 0x1, + + SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE, +}; + +struct smc_hs_ctrl { + /* private */ + + struct list_head list; + struct module *owner; + + /* public */ + + /* unique name */ + char name[SMC_HS_CTRL_NAME_MAX]; + int flags; + + /* Invoked before computing SMC option for SYN packets. + * We can control whether to set SMC options by returning various value. + * Return 0 to disable SMC, or return any other value to enable it. + */ + int (*syn_option)(struct tcp_sock *tp); + + /* Invoked before Set up SMC options for SYN-ACK packets + * We can control whether to respond SMC options by returning various + * value. Return 0 to disable SMC, or return any other value to enable + * it. + */ + int (*synack_option)(const struct tcp_sock *tp, + struct inet_request_sock *ireq); +}; + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +#define smc_call_hsbpf(init_val, tp, func, ...) ({ \ + typeof(init_val) __ret = (init_val); \ + struct smc_hs_ctrl *ctrl; \ + rcu_read_lock(); \ + ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl); \ + if (ctrl && ctrl->func) \ + __ret = ctrl->func(tp, ##__VA_ARGS__); \ + rcu_read_unlock(); \ + __ret; \ +}) +#else +#define smc_call_hsbpf(init_val, tp, ...) ({ (void)(tp); (init_val); }) +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + #endif /* _SMC_H */ -- cgit v1.2.3 From 73edb26b06ea0eb5bd8c6cae5b2df212ae3c7ab5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 6 Nov 2025 22:34:06 +0000 Subject: sctp: Don't inherit do_auto_asconf in sctp_clone_sock(). syzbot reported list_del(&sp->auto_asconf_list) corruption in sctp_destroy_sock(). The repro calls setsockopt(SCTP_AUTO_ASCONF, 1) to a SCTP listener, calls accept(), and close()s the child socket. 
setsockopt(SCTP_AUTO_ASCONF, 1) sets sp->do_auto_asconf to 1 and links sp->auto_asconf_list to a per-netns list. Both fields are placed after sp->pd_lobby in struct sctp_sock, and sctp_copy_descendant() did not copy the fields before the cited commit. Also, sctp_clone_sock() did not set them explicitly. In addition, sctp_auto_asconf_init() is called from sctp_sock_migrate(), but it initialises the fields only conditionally. The two fields relied on __GFP_ZERO added in sk_alloc(), but sk_clone() does not use it. Let's clear newsp->do_auto_asconf in sctp_clone_sock(). [0]: list_del corruption. prev->next should be ffff8880799e9148, but was ffff8880799e8808. (prev=ffff88803347d9f8) kernel BUG at lib/list_debug.c:64! Oops: invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 0 UID: 0 PID: 6008 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 RIP: 0010:__list_del_entry_valid_or_report+0x15a/0x190 lib/list_debug.c:62 Code: e8 7b 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 7c ee 92 fd 49 8b 17 48 c7 c7 80 0a bf 8b 48 89 de 4c 89 f9 e8 07 c6 94 fc 90 <0f> 0b 4c 89 f7 e8 4c 26 71 fd 43 80 3c 2c 00 74 08 4c 89 ff e8 4d RSP: 0018:ffffc90003067ad8 EFLAGS: 00010246 RAX: 000000000000006d RBX: ffff8880799e9148 RCX: b056988859ee6e00 RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffc90003067807 R09: 1ffff9200060cf00 R10: dffffc0000000000 R11: fffff5200060cf01 R12: 1ffff1100668fb3f R13: dffffc0000000000 R14: ffff88803347d9f8 R15: ffff88803347d9f8 FS: 00005555823e5500(0000) GS:ffff88812613e000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000200000000480 CR3: 00000000741ce000 CR4: 00000000003526f0 Call Trace: __list_del_entry_valid include/linux/list.h:132 [inline] __list_del_entry include/linux/list.h:223 [inline] list_del include/linux/list.h:237 [inline] sctp_destroy_sock+0xb4/0x370 net/sctp/socket.c:5163 
sk_common_release+0x75/0x310 net/core/sock.c:3961 sctp_close+0x77e/0x900 net/sctp/socket.c:1550 inet_release+0x144/0x190 net/ipv4/af_inet.c:437 __sock_release net/socket.c:662 [inline] sock_close+0xc3/0x240 net/socket.c:1455 __fput+0x44c/0xa70 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 16942cf4d3e3 ("sctp: Use sk_clone() in sctp_accept().") Reported-by: syzbot+ba535cb417f106327741@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/690d2185.a70a0220.22f260.000e.GAE@google.com/ Signed-off-by: Kuniyuki Iwashima Acked-by: Xin Long Link: https://patch.msgid.link/20251106223418.1455510-1-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/sctp/structs.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 5900196d65fd..affee44bd38e 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -228,10 +228,6 @@ struct sctp_sock { atomic_t pd_mode; - /* Fields after this point will be skipped on copies, like on accept - * and peeloff operations - */ - /* Receive to here while partial delivery is in effect. 
*/ struct sk_buff_head pd_lobby; -- cgit v1.2.3 From 7ff14c52049eafecdd72cd0a12cae6905876566a Mon Sep 17 00:00:00 2001 From: Simon Schippers Date: Thu, 6 Nov 2025 18:56:15 +0100 Subject: usbnet: Add support for Byte Queue Limits (BQL) In the current implementation, usbnet uses a fixed tx_qlen of: USB2: 60 * 1518 bytes = 91.08 KB USB3: 60 * 5 * 1518 bytes = 454.80 KB Such large transmit queues can be problematic, especially for cellular modems. For example, with a typical cellular link speed of 10 Mbit/s, a fully occupied USB3 transmit queue results in: 454.80 KB / (10 Mbit/s / 8 bit/byte) = 363.84 ms of additional latency. This patch adds support for Byte Queue Limits (BQL) [1] to dynamically manage the transmit queue size and reduce latency without sacrificing throughput. Testing was performed on various devices using the usbnet driver for packet transmission: - DELOCK 66045: USB3 to 2.5 GbE adapter (ax88179_178a) - DELOCK 61969: USB2 to 1 GbE adapter (asix) - Quectel RM520: 5G modem (qmi_wwan) - USB2 Android tethering (cdc_ncm) No performance degradation was observed for iperf3 TCP or UDP traffic, while latency for a prioritized ping application was significantly reduced. For example, using the USB3 to 2.5 GbE adapter, which was fully utilized by iperf3 UDP traffic, the prioritized ping was improved from 1.6 ms to 0.6 ms. With the same setup but with a 100 Mbit/s Ethernet connection, the prioritized ping was improved from 35 ms to 5 ms.
[1] https://lwn.net/Articles/469652/ Signed-off-by: Simon Schippers Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20251106175615.26948-1-simon.schippers@tu-dortmund.de Signed-off-by: Jakub Kicinski --- include/linux/usb/usbnet.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index a2d54122823d..2945923a8a95 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -14,6 +14,7 @@ #include #include #include +#include /* interface from usbnet core to each USB networking link we handle */ struct usbnet { @@ -59,6 +60,7 @@ struct usbnet { struct mutex interrupt_mutex; struct usb_anchor deferred; struct work_struct bh_work; + spinlock_t bql_spinlock; struct work_struct kevent; unsigned long flags; -- cgit v1.2.3 From 62ed1b58224636185fa689db81224b8c8af46473 Mon Sep 17 00:00:00 2001 From: Li Nan Date: Mon, 3 Nov 2025 20:57:57 +0800 Subject: md: allow configuring logical block size Previously, raid array used the maximum logical block size (LBS) of all member disks. Adding a larger LBS disk at runtime could unexpectedly increase RAID's LBS, risking corruption of existing partitions. This can be reproduced by: ``` # LBS of sd[de] is 512 bytes, sdf is 4096 bytes. mdadm -CRq /dev/md0 -l1 -n3 /dev/sd[de] missing --assume-clean # LBS is 512 cat /sys/block/md0/queue/logical_block_size # create partition md0p1 parted -s /dev/md0 mklabel gpt mkpart primary 1MiB 100% lsblk | grep md0p1 # LBS becomes 4096 after adding sdf mdadm --add -q /dev/md0 /dev/sdf cat /sys/block/md0/queue/logical_block_size # partition lost partprobe /dev/md0 lsblk | grep md0p1 ``` Simply restricting larger-LBS disks is inflexible. In some scenarios, only disks with 512 bytes LBS are available currently, but later, disks with 4KB LBS may be added to the array. Making LBS configurable is the best way to solve this scenario. 
After this patch, the raid will: - store LBS in disk metadata - add a read-write sysfs 'mdX/logical_block_size' Future mdadm should support setting LBS via metadata field during RAID creation and the new sysfs. Though the kernel allows runtime LBS changes, users should avoid modifying it after creating partitions or filesystems to prevent compatibility issues. Only 1.x metadata supports configurable LBS. 0.90 metadata inits all fields to default values at auto-detect. Supporting 0.90 would require more extensive changes and no such use case has been observed. Note that many RAID paths rely on PAGE_SIZE alignment, including for metadata I/O. A larger LBS than PAGE_SIZE will result in metadata read/write failures. So this config should be prevented. Link: https://lore.kernel.org/linux-raid/20251103125757.1405796-6-linan666@huaweicloud.com Signed-off-by: Li Nan Reviewed-by: Xiao Ni Signed-off-by: Yu Kuai --- include/uapi/linux/raid/md_p.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index ac74133a4768..310068bb2a1d 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -291,7 +291,8 @@ struct mdp_superblock_1 { __le64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ __le32 sb_csum; /* checksum up to devs[max_dev] */ __le32 max_dev; /* size of devs[] array to consider */ - __u8 pad3[64-32]; /* set to 0 when writing */ + __le32 logical_block_size; /* same as q->limits->logical_block_size */ + __u8 pad3[64-36]; /* set to 0 when writing */ /* device state information. Indexed by dev_number. * 2 bytes per device -- cgit v1.2.3 From 2b9a0f21fbb8a3b7df7faa5b7534897a86c44b98 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:13 +0100 Subject: ns: move namespace types into separate header Add a dedicated header for namespace types. 
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-1-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 205 +++++++++++++++++++++++++++++++++++++ include/linux/ns_common.h | 196 +---------------------------------- 2 files changed, 206 insertions(+), 195 deletions(-) create mode 100644 include/linux/ns/ns_common_types.h (limited to 'include') diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h new file mode 100644 index 000000000000..ccd1d1e116f6 --- /dev/null +++ b/include/linux/ns/ns_common_types.h @@ -0,0 +1,205 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NS_COMMON_TYPES_H +#define _LINUX_NS_COMMON_TYPES_H + +#include +#include +#include +#include + +struct cgroup_namespace; +struct dentry; +struct ipc_namespace; +struct mnt_namespace; +struct net; +struct pid_namespace; +struct proc_ns_operations; +struct time_namespace; +struct user_namespace; +struct uts_namespace; + +extern struct cgroup_namespace init_cgroup_ns; +extern struct ipc_namespace init_ipc_ns; +extern struct mnt_namespace init_mnt_ns; +extern struct net init_net; +extern struct pid_namespace init_pid_ns; +extern struct time_namespace init_time_ns; +extern struct user_namespace init_user_ns; +extern struct uts_namespace init_uts_ns; + +extern const struct proc_ns_operations cgroupns_operations; +extern const struct proc_ns_operations ipcns_operations; +extern const struct proc_ns_operations mntns_operations; +extern const struct proc_ns_operations netns_operations; +extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; +extern const struct proc_ns_operations timens_operations; +extern const struct proc_ns_operations timens_for_children_operations; +extern const struct proc_ns_operations userns_operations; +extern const struct proc_ns_operations utsns_operations; + +/* + * Namespace lifetimes are managed via a 
two-tier reference counting model: + * + * (1) __ns_ref (refcount_t): Main reference count tracking memory + * lifetime. Controls when the namespace structure itself is freed. + * It also pins the namespace on the namespace trees whereas (2) + * only regulates their visibility to userspace. + * + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. + * Controls visibility of the namespace in the namespace trees. + * Any live task that uses the namespace (via nsproxy or cred) holds + * an active reference. Any open file descriptor or bind-mount of + * the namespace holds an active reference. Once all tasks have + * called exited their namespaces and all file descriptors and + * bind-mounts have been released the active reference count drops + * to zero and the namespace becomes inactive. IOW, the namespace + * cannot be listed or opened via file handles anymore. + * + * Note that it is valid to transition from active to inactive and + * back from inactive to active e.g., when resurrecting an inactive + * namespace tree via the SIOCGSKNS ioctl(). + * + * Relationship and lifecycle states: + * + * - Active (__ns_ref_active > 0): + * Namespace is actively used and visible to userspace. The namespace + * can be reopened via /proc//ns/, via namespace file + * handles, or discovered via listns(). + * + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): + * No tasks are actively using the namespace and it isn't pinned by + * any bind-mounts or open file descriptors anymore. But the namespace + * is still kept alive by internal references. For example, the user + * namespace could be pinned by an open file through file->f_cred + * references when one of the now defunct tasks had opened a file and + * handed the file descriptor off to another process via a UNIX + * sockets. Such references keep the namespace structure alive through + * __ns_ref but will not hold an active reference. + * + * - Destroyed (__ns_ref == 0): + * No references remain. 
The namespace is removed from the tree and freed. + * + * State transitions: + * + * Active -> Inactive: + * When the last task using the namespace exits it drops its active + * references to all namespaces. However, user and pid namespaces + * remain accessible until the task has been reaped. + * + * Inactive -> Active: + * An inactive namespace tree might be resurrected due to e.g., the + * SIOCGSKNS ioctl() on a socket. + * + * Inactive -> Destroyed: + * When __ns_ref drops to zero the namespace is removed from the + * namespaces trees and the memory is freed (after RCU grace period). + * + * Initial namespaces: + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with + * __ns_ref_active = 1 and remain active forever. + */ +struct ns_common { + u32 ns_type; + struct dentry *stashed; + const struct proc_ns_operations *ops; + unsigned int inum; + refcount_t __ns_ref; /* do not use directly */ + union { + struct { + u64 ns_id; + struct /* global namespace rbtree and list */ { + struct rb_node ns_unified_tree_node; + struct list_head ns_unified_list_node; + }; + struct /* per type rbtree and list */ { + struct rb_node ns_tree_node; + struct list_head ns_list_node; + }; + struct /* namespace ownership rbtree and list */ { + struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ + struct list_head ns_owner; /* list of namespaces owned by this namespace */ + struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ + struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ + }; + atomic_t __ns_ref_active; /* do not use directly */ + }; + struct rcu_head ns_rcu; + }; +}; + +#define to_ns_common(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &(__ns)->ns, \ + const struct cgroup_namespace *: &(__ns)->ns, \ + struct ipc_namespace *: &(__ns)->ns, \ + const struct ipc_namespace *: &(__ns)->ns, \ + struct mnt_namespace *: &(__ns)->ns, \ + const struct mnt_namespace *: &(__ns)->ns, \ + 
struct net *: &(__ns)->ns, \ + const struct net *: &(__ns)->ns, \ + struct pid_namespace *: &(__ns)->ns, \ + const struct pid_namespace *: &(__ns)->ns, \ + struct time_namespace *: &(__ns)->ns, \ + const struct time_namespace *: &(__ns)->ns, \ + struct user_namespace *: &(__ns)->ns, \ + const struct user_namespace *: &(__ns)->ns, \ + struct uts_namespace *: &(__ns)->ns, \ + const struct uts_namespace *: &(__ns)->ns) + +#define ns_init_inum(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ + struct ipc_namespace *: IPC_NS_INIT_INO, \ + struct mnt_namespace *: MNT_NS_INIT_INO, \ + struct net *: NET_NS_INIT_INO, \ + struct pid_namespace *: PID_NS_INIT_INO, \ + struct time_namespace *: TIME_NS_INIT_INO, \ + struct user_namespace *: USER_NS_INIT_INO, \ + struct uts_namespace *: UTS_NS_INIT_INO) + +#define ns_init_ns(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: &init_cgroup_ns, \ + struct ipc_namespace *: &init_ipc_ns, \ + struct mnt_namespace *: &init_mnt_ns, \ + struct net *: &init_net, \ + struct pid_namespace *: &init_pid_ns, \ + struct time_namespace *: &init_time_ns, \ + struct user_namespace *: &init_user_ns, \ + struct uts_namespace *: &init_uts_ns) + +#define ns_init_id(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ + struct ipc_namespace *: IPC_NS_INIT_ID, \ + struct mnt_namespace *: MNT_NS_INIT_ID, \ + struct net *: NET_NS_INIT_ID, \ + struct pid_namespace *: PID_NS_INIT_ID, \ + struct time_namespace *: TIME_NS_INIT_ID, \ + struct user_namespace *: USER_NS_INIT_ID, \ + struct uts_namespace *: UTS_NS_INIT_ID) + +#define to_ns_operations(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ + struct mnt_namespace *: &mntns_operations, \ + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? 
&netns_operations : NULL), \ + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) + +#define ns_common_type(__ns) \ + _Generic((__ns), \ + struct cgroup_namespace *: CLONE_NEWCGROUP, \ + struct ipc_namespace *: CLONE_NEWIPC, \ + struct mnt_namespace *: CLONE_NEWNS, \ + struct net *: CLONE_NEWNET, \ + struct pid_namespace *: CLONE_NEWPID, \ + struct time_namespace *: CLONE_NEWTIME, \ + struct user_namespace *: CLONE_NEWUSER, \ + struct uts_namespace *: CLONE_NEWUTS) + +#endif /* _LINUX_NS_COMMON_TYPES_H */ diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 66ea09b48377..6a4ca8c3b9c4 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -2,133 +2,12 @@ #ifndef _LINUX_NS_COMMON_H #define _LINUX_NS_COMMON_H +#include #include -#include #include #include #include -struct proc_ns_operations; - -struct cgroup_namespace; -struct ipc_namespace; -struct mnt_namespace; -struct net; -struct pid_namespace; -struct time_namespace; -struct user_namespace; -struct uts_namespace; - -extern struct cgroup_namespace init_cgroup_ns; -extern struct ipc_namespace init_ipc_ns; -extern struct mnt_namespace init_mnt_ns; -extern struct net init_net; -extern struct pid_namespace init_pid_ns; -extern struct time_namespace init_time_ns; -extern struct user_namespace init_user_ns; -extern struct uts_namespace init_uts_ns; - -extern const struct proc_ns_operations netns_operations; -extern const struct proc_ns_operations utsns_operations; -extern const struct proc_ns_operations ipcns_operations; -extern const struct proc_ns_operations pidns_operations; -extern const struct proc_ns_operations pidns_for_children_operations; -extern const struct proc_ns_operations 
userns_operations; -extern const struct proc_ns_operations mntns_operations; -extern const struct proc_ns_operations cgroupns_operations; -extern const struct proc_ns_operations timens_operations; -extern const struct proc_ns_operations timens_for_children_operations; - -/* - * Namespace lifetimes are managed via a two-tier reference counting model: - * - * (1) __ns_ref (refcount_t): Main reference count tracking memory - * lifetime. Controls when the namespace structure itself is freed. - * It also pins the namespace on the namespace trees whereas (2) - * only regulates their visibility to userspace. - * - * (2) __ns_ref_active (atomic_t): Reference count tracking active users. - * Controls visibility of the namespace in the namespace trees. - * Any live task that uses the namespace (via nsproxy or cred) holds - * an active reference. Any open file descriptor or bind-mount of - * the namespace holds an active reference. Once all tasks have - * called exited their namespaces and all file descriptors and - * bind-mounts have been released the active reference count drops - * to zero and the namespace becomes inactive. IOW, the namespace - * cannot be listed or opened via file handles anymore. - * - * Note that it is valid to transition from active to inactive and - * back from inactive to active e.g., when resurrecting an inactive - * namespace tree via the SIOCGSKNS ioctl(). - * - * Relationship and lifecycle states: - * - * - Active (__ns_ref_active > 0): - * Namespace is actively used and visible to userspace. The namespace - * can be reopened via /proc//ns/, via namespace file - * handles, or discovered via listns(). - * - * - Inactive (__ns_ref_active == 0, __ns_ref > 0): - * No tasks are actively using the namespace and it isn't pinned by - * any bind-mounts or open file descriptors anymore. But the namespace - * is still kept alive by internal references. 
For example, the user - * namespace could be pinned by an open file through file->f_cred - * references when one of the now defunct tasks had opened a file and - * handed the file descriptor off to another process via a UNIX - * sockets. Such references keep the namespace structure alive through - * __ns_ref but will not hold an active reference. - * - * - Destroyed (__ns_ref == 0): - * No references remain. The namespace is removed from the tree and freed. - * - * State transitions: - * - * Active -> Inactive: - * When the last task using the namespace exits it drops its active - * references to all namespaces. However, user and pid namespaces - * remain accessible until the task has been reaped. - * - * Inactive -> Active: - * An inactive namespace tree might be resurrected due to e.g., the - * SIOCGSKNS ioctl() on a socket. - * - * Inactive -> Destroyed: - * When __ns_ref drops to zero the namespace is removed from the - * namespaces trees and the memory is freed (after RCU grace period). - * - * Initial namespaces: - * Boot-time namespaces (init_net, init_pid_ns, etc.) start with - * __ns_ref_active = 1 and remain active forever. 
- */ -struct ns_common { - u32 ns_type; - struct dentry *stashed; - const struct proc_ns_operations *ops; - unsigned int inum; - refcount_t __ns_ref; /* do not use directly */ - union { - struct { - u64 ns_id; - struct /* global namespace rbtree and list */ { - struct rb_node ns_unified_tree_node; - struct list_head ns_unified_list_node; - }; - struct /* per type rbtree and list */ { - struct rb_node ns_tree_node; - struct list_head ns_list_node; - }; - struct /* namespace ownership rbtree and list */ { - struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ - struct list_head ns_owner; /* list of namespaces owned by this namespace */ - struct rb_node ns_owner_tree_node; /* node in the owner namespace's rbtree */ - struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ - }; - atomic_t __ns_ref_active; /* do not use directly */ - }; - struct rcu_head ns_rcu; - }; -}; - bool is_current_namespace(struct ns_common *ns); int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); void __ns_common_free(struct ns_common *ns); @@ -147,79 +26,6 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) return ns->ns_id <= NS_LAST_INIT_ID; } -#define to_ns_common(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &(__ns)->ns, \ - const struct cgroup_namespace *: &(__ns)->ns, \ - struct ipc_namespace *: &(__ns)->ns, \ - const struct ipc_namespace *: &(__ns)->ns, \ - struct mnt_namespace *: &(__ns)->ns, \ - const struct mnt_namespace *: &(__ns)->ns, \ - struct net *: &(__ns)->ns, \ - const struct net *: &(__ns)->ns, \ - struct pid_namespace *: &(__ns)->ns, \ - const struct pid_namespace *: &(__ns)->ns, \ - struct time_namespace *: &(__ns)->ns, \ - const struct time_namespace *: &(__ns)->ns, \ - struct user_namespace *: &(__ns)->ns, \ - const struct user_namespace *: &(__ns)->ns, \ - struct uts_namespace *: &(__ns)->ns, \ - const struct uts_namespace *: 
&(__ns)->ns) - -#define ns_init_inum(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ - struct ipc_namespace *: IPC_NS_INIT_INO, \ - struct mnt_namespace *: MNT_NS_INIT_INO, \ - struct net *: NET_NS_INIT_INO, \ - struct pid_namespace *: PID_NS_INIT_INO, \ - struct time_namespace *: TIME_NS_INIT_INO, \ - struct user_namespace *: USER_NS_INIT_INO, \ - struct uts_namespace *: UTS_NS_INIT_INO) - -#define ns_init_ns(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: &init_cgroup_ns, \ - struct ipc_namespace *: &init_ipc_ns, \ - struct mnt_namespace *: &init_mnt_ns, \ - struct net *: &init_net, \ - struct pid_namespace *: &init_pid_ns, \ - struct time_namespace *: &init_time_ns, \ - struct user_namespace *: &init_user_ns, \ - struct uts_namespace *: &init_uts_ns) - -#define ns_init_id(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ - struct ipc_namespace *: IPC_NS_INIT_ID, \ - struct mnt_namespace *: MNT_NS_INIT_ID, \ - struct net *: NET_NS_INIT_ID, \ - struct pid_namespace *: PID_NS_INIT_ID, \ - struct time_namespace *: TIME_NS_INIT_ID, \ - struct user_namespace *: USER_NS_INIT_ID, \ - struct uts_namespace *: UTS_NS_INIT_ID) - -#define to_ns_operations(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ - struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ - struct mnt_namespace *: &mntns_operations, \ - struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ - struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ - struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ - struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ - struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? 
&utsns_operations : NULL)) - -#define ns_common_type(__ns) \ - _Generic((__ns), \ - struct cgroup_namespace *: CLONE_NEWCGROUP, \ - struct ipc_namespace *: CLONE_NEWIPC, \ - struct mnt_namespace *: CLONE_NEWNS, \ - struct net *: CLONE_NEWNET, \ - struct pid_namespace *: CLONE_NEWPID, \ - struct time_namespace *: CLONE_NEWTIME, \ - struct user_namespace *: CLONE_NEWUSER, \ - struct uts_namespace *: CLONE_NEWUTS) #define NS_COMMON_INIT(nsname, refs) \ { \ -- cgit v1.2.3 From ea1549e628ec51dcbea1d158301993364b754d75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:14 +0100 Subject: nstree: decouple from ns_common header Forward declare struct ns_common and remove the include of ns_common.h. We want ns_common.h to possibly include nstree structures but not the other way around. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-2-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/nstree.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 38674c6fa4f7..25040a98a92b 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -3,7 +3,6 @@ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H -#include #include #include #include @@ -11,6 +10,8 @@ #include #include +struct ns_common; + extern struct ns_tree cgroup_ns_tree; extern struct ns_tree ipc_ns_tree; extern struct ns_tree mnt_ns_tree; -- cgit v1.2.3 From 1c64fb02ac46f5ca93ac9f5470f124921b4713b7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:15 +0100 Subject: nstree: move nstree types into separate header Introduce two new fundamental data structures for namespace tree management in a separate header file.
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-3-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns/nstree_types.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/nstree.h | 1 + 2 files changed, 37 insertions(+) create mode 100644 include/linux/ns/nstree_types.h (limited to 'include') diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h new file mode 100644 index 000000000000..6ee0c39686f8 --- /dev/null +++ b/include/linux/ns/nstree_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2025 Christian Brauner */ +#ifndef _LINUX_NSTREE_TYPES_H +#define _LINUX_NSTREE_TYPES_H + +#include +#include + +/** + * struct ns_tree_root - Root of a namespace tree + * @ns_rb: Red-black tree root for efficient lookups + * @ns_list_head: List head for sequential iteration + * + * Each namespace tree maintains both an rbtree (for O(log n) lookups) + * and a list (for efficient sequential iteration). The list is kept in + * the same sorted order as the rbtree. + */ +struct ns_tree_root { + struct rb_root ns_rb; + struct list_head ns_list_head; +}; + +/** + * struct ns_tree_node - Node in a namespace tree + * @ns_node: Red-black tree node + * @ns_list_entry: List entry for sequential iteration + * + * Represents a namespace's position in a tree. Each namespace has + * multiple tree nodes for different trees (unified, per-type, owner). 
+ */ +struct ns_tree_node { + struct rb_node ns_node; + struct list_head ns_list_entry; +}; + +#endif /* _LINUX_NSTREE_TYPES_H */ diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 25040a98a92b..0e275df7e99a 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -3,6 +3,7 @@ #ifndef _LINUX_NSTREE_H #define _LINUX_NSTREE_H +#include #include #include #include -- cgit v1.2.3 From d12ea8062fd31f02beeeb76a7884ab9bc4f5b197 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:16 +0100 Subject: nstree: add helper to operate on struct ns_tree_{node,root} Add helpers that work on the combined rbtree and rculist. This will make the code a lot more manageable and legible. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-4-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/nstree.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 0e275df7e99a..98b848cf2f1c 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -22,6 +22,14 @@ extern struct ns_tree time_ns_tree; extern struct ns_tree user_ns_tree; extern struct ns_tree uts_ns_tree; +void ns_tree_node_init(struct ns_tree_node *node); +void ns_tree_root_init(struct ns_tree_root *root); +bool ns_tree_node_empty(const struct ns_tree_node *node); +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)); +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); + #define to_ns_tree(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(cgroup_ns_tree), \ -- cgit v1.2.3 From a657bc8a75cf40c3d0814fe6488ba4af56528f42 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:17 +0100 Subject: nstree: switch to new structures Switch the nstree management to the new combined structures.
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-5-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns/ns_common_types.h | 27 +++++++++------------------ include/linux/ns/nstree_types.h | 19 +++++++++++++++++++ include/linux/ns_common.h | 27 +++++++++++++-------------- include/linux/nstree.h | 26 +++++++++++++------------- 4 files changed, 54 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/ns/ns_common_types.h b/include/linux/ns/ns_common_types.h index ccd1d1e116f6..b332b019b29c 100644 --- a/include/linux/ns/ns_common_types.h +++ b/include/linux/ns/ns_common_types.h @@ -3,6 +3,7 @@ #define _LINUX_NS_COMMON_TYPES_H #include +#include #include #include #include @@ -98,6 +99,13 @@ extern const struct proc_ns_operations utsns_operations; * Initial namespaces: * Boot-time namespaces (init_net, init_pid_ns, etc.) start with * __ns_ref_active = 1 and remain active forever. + * + * @ns_type: type of namespace (e.g., CLONE_NEWNET) + * @stashed: cached dentry to be used by the vfs + * @ops: namespace operations + * @inum: namespace inode number (quickly recycled for non-initial namespaces) + * @__ns_ref: main reference count (do not use directly) + * @ns_tree: namespace tree nodes and active reference count */ struct ns_common { u32 ns_type; @@ -106,24 +114,7 @@ struct ns_common { unsigned int inum; refcount_t __ns_ref; /* do not use directly */ union { - struct { - u64 ns_id; - struct /* global namespace rbtree and list */ { - struct rb_node ns_unified_tree_node; - struct list_head ns_unified_list_node; - }; - struct /* per type rbtree and list */ { - struct rb_node ns_tree_node; - struct list_head ns_list_node; - }; - struct /* namespace ownership rbtree and list */ { - struct rb_root ns_owner_tree; /* rbtree of namespaces owned by this namespace */ - struct list_head ns_owner; /* list of namespaces owned by this namespace */ - struct rb_node ns_owner_tree_node; /* node in the owner 
namespace's rbtree */ - struct list_head ns_owner_entry; /* node in the owner namespace's ns_owned list */ - }; - atomic_t __ns_ref_active; /* do not use directly */ - }; + struct ns_tree; struct rcu_head ns_rcu; }; }; diff --git a/include/linux/ns/nstree_types.h b/include/linux/ns/nstree_types.h index 6ee0c39686f8..2fb28ee31efb 100644 --- a/include/linux/ns/nstree_types.h +++ b/include/linux/ns/nstree_types.h @@ -33,4 +33,23 @@ struct ns_tree_node { struct list_head ns_list_entry; }; +/** + * struct ns_tree - Namespace tree nodes and active reference count + * @ns_id: Unique namespace identifier + * @__ns_ref_active: Active reference count (do not use directly) + * @ns_unified_node: Node in the global namespace tree + * @ns_tree_node: Node in the per-type namespace tree + * @ns_owner_node: Node in the owner namespace's tree of owned namespaces + * @ns_owner_root: Root of the tree of namespaces owned by this namespace + * (only used when this namespace is an owner) + */ +struct ns_tree { + u64 ns_id; + atomic_t __ns_ref_active; + struct ns_tree_node ns_unified_node; + struct ns_tree_node ns_tree_node; + struct ns_tree_node ns_owner_node; + struct ns_tree_root ns_owner_root; +}; + #endif /* _LINUX_NSTREE_TYPES_H */ diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 6a4ca8c3b9c4..f90509ee0900 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -26,20 +26,19 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) return ns->ns_id <= NS_LAST_INIT_ID; } - -#define NS_COMMON_INIT(nsname, refs) \ -{ \ - .ns_type = ns_common_type(&nsname), \ - .ns_id = ns_init_id(&nsname), \ - .inum = ns_init_inum(&nsname), \ - .ops = to_ns_operations(&nsname), \ - .stashed = NULL, \ - .__ns_ref = REFCOUNT_INIT(refs), \ - .__ns_ref_active = ATOMIC_INIT(1), \ - .ns_list_node = LIST_HEAD_INIT(nsname.ns.ns_list_node), \ - .ns_owner_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_entry), \ - .ns_owner = LIST_HEAD_INIT(nsname.ns.ns_owner), 
\ - .ns_unified_list_node = LIST_HEAD_INIT(nsname.ns.ns_unified_list_node), \ +#define NS_COMMON_INIT(nsname, refs) \ +{ \ + .ns_type = ns_common_type(&nsname), \ + .ns_id = ns_init_id(&nsname), \ + .inum = ns_init_inum(&nsname), \ + .ops = to_ns_operations(&nsname), \ + .stashed = NULL, \ + .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref_active = ATOMIC_INIT(1), \ + .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ + .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ + .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ + .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ } #define ns_common_init(__ns) \ diff --git a/include/linux/nstree.h b/include/linux/nstree.h index 98b848cf2f1c..175e4625bfa6 100644 --- a/include/linux/nstree.h +++ b/include/linux/nstree.h @@ -13,14 +13,14 @@ struct ns_common; -extern struct ns_tree cgroup_ns_tree; -extern struct ns_tree ipc_ns_tree; -extern struct ns_tree mnt_ns_tree; -extern struct ns_tree net_ns_tree; -extern struct ns_tree pid_ns_tree; -extern struct ns_tree time_ns_tree; -extern struct ns_tree user_ns_tree; -extern struct ns_tree uts_ns_tree; +extern struct ns_tree_root cgroup_ns_tree; +extern struct ns_tree_root ipc_ns_tree; +extern struct ns_tree_root mnt_ns_tree; +extern struct ns_tree_root net_ns_tree; +extern struct ns_tree_root pid_ns_tree; +extern struct ns_tree_root time_ns_tree; +extern struct ns_tree_root user_ns_tree; +extern struct ns_tree_root uts_ns_tree; void ns_tree_node_init(struct ns_tree_node *node); void ns_tree_root_init(struct ns_tree_root *root); @@ -46,14 +46,14 @@ void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); (((__ns) == ns_init_ns(__ns)) ? 
ns_init_id(__ns) : 0)) u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree); +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree); struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, + struct ns_tree_root *ns_tree, bool previous); -static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, u64 id) +static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id) { __ns_tree_gen_id(ns, id); __ns_tree_add_raw(ns, ns_tree); @@ -91,6 +91,6 @@ static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree, #define ns_tree_adjoined_rcu(__ns, __previous) \ __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) -#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) +#define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node)) #endif /* _LINUX_NSTREE_H */ -- cgit v1.2.3 From ed93c0697a8dcb70972a77bca2522a6a23ba6658 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:20 +0100 Subject: ns: make is_initial_namespace() argument const We don't modify the data structure at all so pass it as const. 
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-8-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index f90509ee0900..7e4df96b7411 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -13,7 +13,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope void __ns_common_free(struct ns_common *ns); struct ns_common *__must_check ns_owner(struct ns_common *ns); -static __always_inline bool is_initial_namespace(struct ns_common *ns) +static __always_inline bool is_initial_namespace(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->inum == 0); return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, -- cgit v1.2.3 From 6bf253855aa8c970d2191f87ee23f9f184ddaa79 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:21 +0100 Subject: ns: rename is_initial_namespace() Rename is_initial_namespace() to is_ns_init_inum() and make it symmetrical with the ns id variant.
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-9-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 7e4df96b7411..b9e8f21a6984 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -13,7 +13,7 @@ int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_ope void __ns_common_free(struct ns_common *ns); struct ns_common *__must_check ns_owner(struct ns_common *ns); -static __always_inline bool is_initial_namespace(const struct ns_common *ns) +static __always_inline bool is_ns_init_inum(const struct ns_common *ns) { VFS_WARN_ON_ONCE(ns->inum == 0); return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, -- cgit v1.2.3 From 657aeb436d70c66583cb2b5b6c65ca64bcf503a8 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:24 +0100 Subject: ns: make all reference counts on initial namespace a nop They are always active so no need to needlessly cacheline ping-pong. 
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-12-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index b9e8f21a6984..5b8f2f0163d7 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -62,6 +62,8 @@ static __always_inline __must_check int __ns_ref_active_read(const struct ns_com static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { + if (is_ns_init_id(ns)) + return false; if (refcount_dec_and_test(&ns->__ns_ref)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return true; @@ -71,6 +73,8 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { + if (is_ns_init_id(ns)) + return true; if (refcount_inc_not_zero(&ns->__ns_ref)) return true; VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); @@ -82,12 +86,27 @@ static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns return refcount_read(&ns->__ns_ref); } +static __always_inline void __ns_ref_inc(struct ns_common *ns) +{ + if (is_ns_init_id(ns)) + return; + refcount_inc(&ns->__ns_ref); +} + +static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, + spinlock_t *ns_lock) +{ + if (is_ns_init_id(ns)) + return false; + return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); +} + #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) -#define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) +#define ns_ref_inc(__ns) __ns_ref_inc(to_ns_common((__ns))) #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) -#define ns_ref_put_and_lock(__ns, __lock) \ - refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) +#define 
ns_ref_put_and_lock(__ns, __ns_lock) \ + __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) #define ns_ref_active_read(__ns) \ ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) -- cgit v1.2.3 From 2b60d56acc5b4fcab29fc323e6b82597ec78596f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:25 +0100 Subject: ns: add asserts for initial namespace reference counts They always remain fixed at one. Notice when that assumption is broken. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-13-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 5b8f2f0163d7..dfb6b798ba82 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -60,10 +60,17 @@ static __always_inline __must_check int __ns_ref_active_read(const struct ns_com return atomic_read(&ns->__ns_ref_active); } +static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) +{ + return refcount_read(&ns->__ns_ref); +} + static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return false; + } if (refcount_dec_and_test(&ns->__ns_ref)) { VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return true; @@ -73,31 +80,32 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return true; + } if (refcount_inc_not_zero(&ns->__ns_ref)) return true; VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); return false; } -static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) -{ - return
refcount_read(&ns->__ns_ref); -} - static __always_inline void __ns_ref_inc(struct ns_common *ns) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return; + } refcount_inc(&ns->__ns_ref); } static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, spinlock_t *ns_lock) { - if (is_ns_init_id(ns)) + if (is_ns_init_id(ns)) { + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); return false; + } return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); } -- cgit v1.2.3 From 7118daabb65585163fd70eb782f1fbbdb64968a6 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:26 +0100 Subject: ns: add asserts for initial namespace active reference counts They always remain fixed at one. Notice when that assumption is broken. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-14-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index dfb6b798ba82..43f709ab846a 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -69,6 +69,7 @@ static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } if (refcount_dec_and_test(&ns->__ns_ref)) { @@ -82,6 +83,7 @@ static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return true; } if (refcount_inc_not_zero(&ns->__ns_ref)) @@ -94,6 +96,7 @@ static __always_inline void __ns_ref_inc(struct ns_common *ns) { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return; } refcount_inc(&ns->__ns_ref); @@ -104,6
+107,7 @@ static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common { if (is_ns_init_id(ns)) { VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); return false; } return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); -- cgit v1.2.3 From 282879afa01936954a570e15b4088a89b6e1b549 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:27 +0100 Subject: pid: rely on common reference count behavior Now that we changed the generic reference counting mechanism for all namespaces to never manipulate reference counts of initial namespaces we can drop the special handling for pid namespaces. Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-15-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 445517a72ad0..0e7ae12c96d2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -61,8 +61,7 @@ static inline struct pid_namespace *to_pid_ns(struct ns_common *ns) static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { - if (ns != &init_pid_ns) - ns_ref_inc(ns); + ns_ref_inc(ns); return ns; } -- cgit v1.2.3 From c2bbd2db521b018c59fb0ff8e1cdfa8ee907ba88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 10 Nov 2025 16:08:28 +0100 Subject: ns: drop custom reference count initialization for initial namespaces Initial namespaces don't modify their reference count anymore. They remain fixed at one so drop the custom refcount initializations. 
Link: https://patch.msgid.link/20251110-work-namespace-nstree-fixes-v1-16-e8a9264e0fb9@kernel.org Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 43f709ab846a..136f6a322e53 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -26,14 +26,14 @@ static __always_inline bool is_ns_init_id(const struct ns_common *ns) return ns->ns_id <= NS_LAST_INIT_ID; } -#define NS_COMMON_INIT(nsname, refs) \ +#define NS_COMMON_INIT(nsname) \ { \ .ns_type = ns_common_type(&nsname), \ .ns_id = ns_init_id(&nsname), \ .inum = ns_init_inum(&nsname), \ .ops = to_ns_operations(&nsname), \ .stashed = NULL, \ - .__ns_ref = REFCOUNT_INIT(refs), \ + .__ns_ref = REFCOUNT_INIT(1), \ .__ns_ref_active = ATOMIC_INIT(1), \ .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ -- cgit v1.2.3 From 8da7bea7db692e786165b71729fb68b7ff65ee56 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Fri, 31 Oct 2025 18:33:28 +0800 Subject: xsk: add indirect call for xsk_destruct_skb Since Eric proposed an idea about adding indirect call wrappers for UDP and managed to see a huge improvement[1], the same situation can also be applied in xsk scenario. This patch adds an indirect call for xsk and helps current copy mode improve the performance by around 1% stably which was observed with IXGBE at 10Gb/sec loaded. If the throughput grows, the positive effect will be magnified. I applied this patch on top of batch xmit series[2], and was able to see <5% improvement from our internal application which is a little bit unstable though. Use INDIRECT wrappers to keep xsk_destruct_skb static as it used to be when the mitigation config is off. 
Be aware of the freeing path that can be very hot since the frequency can reach around 2,000,000 times per second with the xdpsock test. [1]: https://lore.kernel.org/netdev/20251006193103.2684156-2-edumazet@google.com/ [2]: https://lore.kernel.org/all/20251021131209.41491-1-kerneljasonxing@gmail.com/ Suggested-by: Alexander Lobakin Signed-off-by: Jason Xing Reviewed-by: Alexander Lobakin Link: https://patch.msgid.link/20251031103328.95468-1-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni --- include/net/xdp_sock.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index ce587a225661..23e8861e8b25 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -125,6 +125,7 @@ struct xsk_tx_metadata_ops { int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(struct list_head *flush_list); +INDIRECT_CALLABLE_DECLARE(void xsk_destruct_skb(struct sk_buff *)); /** * xsk_tx_metadata_to_compl - Save enough relevant metadata information @@ -218,6 +219,12 @@ static inline void __xsk_map_flush(struct list_head *flush_list) { } +#ifdef CONFIG_MITIGATION_RETPOLINE +static inline void xsk_destruct_skb(struct sk_buff *skb) +{ +} +#endif + static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, struct xsk_tx_metadata_compl *compl) { -- cgit v1.2.3 From dca3aa666fbd71118905d88bb1c353881002b647 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sun, 9 Nov 2025 13:19:31 +0100 Subject: fs: move inode fields used during fast path lookup closer together This should avoid *some* cache misses. Successful path lookup is guaranteed to load at least ->i_mode, ->i_opflags and ->i_acl. At the same time the common case will avoid looking at more fields. 
struct inode is not guaranteed to have any particular alignment, notably ext4 has it only aligned to 8 bytes meaning nearby fields might happen to be on the same or only adjacent cache lines depending on luck (or no luck). According to pahole: umode_t i_mode; /* 0 2 */ short unsigned int i_opflags; /* 2 2 */ kuid_t i_uid; /* 4 4 */ kgid_t i_gid; /* 8 4 */ unsigned int i_flags; /* 12 4 */ struct posix_acl * i_acl; /* 16 8 */ struct posix_acl * i_default_acl; /* 24 8 */ ->i_acl is unnecessarily separated by 8 bytes from the other fields. With struct inode being offset 48 bytes into the cacheline this means an avoidable miss. Note it will still be there for the 56 byte case. New layout: umode_t i_mode; /* 0 2 */ short unsigned int i_opflags; /* 2 2 */ unsigned int i_flags; /* 4 4 */ struct posix_acl * i_acl; /* 8 8 */ struct posix_acl * i_default_acl; /* 16 8 */ kuid_t i_uid; /* 24 4 */ kgid_t i_gid; /* 28 4 */ I verified with pahole there are no size or hole changes. This is a stopgap until someone(tm) sanitizes the layout in the first place, allocation methods aside.
Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251109121931.1285366-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c0c0095b2b60..64dc2e2c281f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -781,14 +781,13 @@ enum inode_state_flags_t { struct inode { umode_t i_mode; unsigned short i_opflags; - kuid_t i_uid; - kgid_t i_gid; unsigned int i_flags; - #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif + kuid_t i_uid; + kgid_t i_gid; const struct inode_operations *i_op; struct super_block *i_sb; -- cgit v1.2.3 From e18efacc9c2f17b12c6e019cabad70a2989bd3a9 Mon Sep 17 00:00:00 2001 From: Pagadala Yesu Anjaneyulu Date: Mon, 10 Nov 2025 14:10:29 +0200 Subject: wifi: cfg80211/mac80211: clean up duplicate ap_power handling Move duplicated ap_power type handling code to an inline function in cfg80211. 
Signed-off-by: Pagadala Yesu Anjaneyulu Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251110140806.959948da1cb5.I893b5168329fb3232f249c182a35c99804112da6@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d87c18e1b133..1b257eaf8de5 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10134,6 +10134,30 @@ static inline int cfg80211_color_change_notify(struct net_device *dev, 0, 0, link_id); } +/** + * cfg80211_6ghz_power_type - determine AP regulatory power type + * @control: control flags + * + * Return: regulatory power type from &enum ieee80211_ap_reg_power + */ +static inline enum ieee80211_ap_reg_power +cfg80211_6ghz_power_type(u8 control) +{ + switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + return IEEE80211_REG_LPI_AP; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: + return IEEE80211_REG_SP_AP; + case IEEE80211_6GHZ_CTRL_REG_VLP_AP: + return IEEE80211_REG_VLP_AP; + default: + return IEEE80211_REG_UNSET_AP; + } +} + /** * cfg80211_links_removed - Notify about removed STA MLD setup links. * @dev: network device. -- cgit v1.2.3 From b54cf0f4495a8f3fa94245cdda7716792400299e Mon Sep 17 00:00:00 2001 From: Pagadala Yesu Anjaneyulu Date: Mon, 10 Nov 2025 14:10:30 +0200 Subject: wifi: cfg80211/mac80211: Add fallback mechanism for INDOOR_SP connection Implement fallback to LPI mode when SP mode is not permitted by regulatory constraints for INDOOR_SP connections. Limit fallback mechanism to client mode. 
Signed-off-by: Pagadala Yesu Anjaneyulu Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251110140806.8b43201a34ae.I37fc7bb5892eb9d044d619802e8f2095fde6b296@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b257eaf8de5..625cb2c78361 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10137,22 +10137,27 @@ static inline int cfg80211_color_change_notify(struct net_device *dev, /** * cfg80211_6ghz_power_type - determine AP regulatory power type * @control: control flags + * @client_flags: &enum ieee80211_channel_flags for station mode to enable + * SP to LPI fallback, zero otherwise. * * Return: regulatory power type from &enum ieee80211_ap_reg_power */ static inline enum ieee80211_ap_reg_power -cfg80211_6ghz_power_type(u8 control) +cfg80211_6ghz_power_type(u8 control, u32 client_flags) { switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: return IEEE80211_REG_SP_AP; case IEEE80211_6GHZ_CTRL_REG_VLP_AP: return IEEE80211_REG_VLP_AP; + case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: + if (client_flags & IEEE80211_CHAN_NO_6GHZ_AFC_CLIENT) + return IEEE80211_REG_LPI_AP; + return IEEE80211_REG_SP_AP; default: return IEEE80211_REG_UNSET_AP; } -- cgit v1.2.3 From f5a538c07df26f5c601e41f7b9c7ade3e1e75803 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 31 Oct 2025 13:54:24 +0100 Subject: sched/deadline: Fix dl_server stop condition Gabriel reported that the dl_server doesn't stop as expected. The problem was found to be the fact that idle time and fair runtime are treated equally. 
Both will count towards dl_server runtime and push the activation forwards when it is in the zero-laxity wait state. Notably: dl_server_update_idle() update_curr_dl_se() if (dl_defer && dl_throttled && dl_runtime_exceeded()) hrtimer_try_to_cancel(); // stop timer replenish_dl_new_period() deadline = now + dl_deadline; // fwd period runtime = dl_runtime; start_dl_timer(); // restart timer And while we do want idle time accounted towards the *current* activation of the dl_server -- after all, a fair task could've run if we had any -- we don't necessarily want idle time to cause or push forward an activation. Introduce dl_defer_idle to make this distinction. It will be set once idle time pushed the activation forward; once set, idle time will only be allowed to consume any runtime but not push the activation. This will then cause dl_server_timer() to fire, which will stop the dl_server. Any non-idle time accounting during this phase will clear dl_defer_idle, so only a full period of idle will cause the dl_server to stop. Reported-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251101000057.GA2184199@noisy.programming.kicks-ass.net --- include/linux/sched.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 07576479c0ed..bb436ee1942d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -685,20 +685,22 @@ struct sched_dl_entity { * * @dl_server tells if this is a server entity. * - * @dl_defer tells if this is a deferred or regular server. For - * now only defer server exists. - * - * @dl_defer_armed tells if the deferrable server is waiting - * for the replenishment timer to activate it. - * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue.
In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * + * @dl_defer tells if this is a deferred or regular server. For + * now only defer server exists. + * + * @dl_defer_armed tells if the deferrable server is waiting + * for the replenishment timer to activate it. + * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. + * + * @dl_defer_idle tracks idle state */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; @@ -709,6 +711,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_defer_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its -- cgit v1.2.3 From 0e535824d0bcf7c9bb0532d902283c31c78cd6f3 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 7 Nov 2025 23:04:02 -0800 Subject: devlink: Introduce switchdev_inactive eswitch mode Adds DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE attribute to UAPI and documentation. Before having traffic flow through an eswitch, a user may want to have the ability to block traffic towards the FDB until FDB is fully programmed and the user is ready to send traffic to it. For example: when two eswitches are present for vports in a multi-PF setup, one eswitch may take over the traffic from the other when the user chooses. Before this take over, a user may want to first program the inactive eswitch and then once ready redirect traffic to this new eswitch. switchdev modes transition semantics: legacy->switchdev_inactive: Create switchdev mode normally, traffic not allowed to flow yet. switchdev_inactive->switchdev: Enable traffic to flow. switchdev->switchdev_inactive: Block traffic on the FDB, FDB and representors state and content is preserved.
When eswitch is configured to this mode, traffic is ignored/dropped on this eswitch FDB, while current configuration is kept, e.g. FDB rules and netdev representors are kept available, FDB programming is allowed. Example: # start inactive switchdev devlink dev eswitch set pci/0000:08:00.1 mode switchdev_inactive # setup TC rules, representors etc .. # activate devlink dev eswitch set pci/0000:08:00.1 mode switchdev Signed-off-by: Saeed Mahameed Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20251108070404.1551708-2-saeed@kernel.org Signed-off-by: Paolo Abeni --- include/uapi/linux/devlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index bcad11a787a5..157f11d3fb72 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -181,6 +181,7 @@ enum devlink_sb_threshold_type { enum devlink_eswitch_mode { DEVLINK_ESWITCH_MODE_LEGACY, DEVLINK_ESWITCH_MODE_SWITCHDEV, + DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE, }; enum devlink_eswitch_inline_mode { -- cgit v1.2.3 From 9da611df15aa8d519f9947b88a5c733267cba888 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Fri, 7 Nov 2025 23:04:04 -0800 Subject: net/mlx5: E-Switch, support eswitch inactive mode Add support for eswitch switchdev inactive mode Inactive mode: Drop all traffic going to FDB, Remove mpfs l2 rules and disconnect adjacent vports. Active mode: Traffic flows through FDB, mpfs table populated, and adjacent vports are connected.
Signed-off-by: Saeed Mahameed Signed-off-by: Adithya Jayachandran Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20251108070404.1551708-4-saeed@kernel.org Signed-off-by: Paolo Abeni --- include/linux/mlx5/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 6ac76a0c3827..7bf2449c53b2 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -116,6 +116,7 @@ enum mlx5_flow_namespace_type { }; enum { + FDB_DROP_ROOT, FDB_BYPASS_PATH, FDB_CRYPTO_INGRESS, FDB_TC_OFFLOAD, -- cgit v1.2.3 From 4f739ed19d222de33b19ca639a34523fbbec20d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Oct 2025 07:51:56 +0200 Subject: rv: Pass va_list to reactors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The only thing the reactors can do with the passed in varargs is to convert it into a va_list. Do that in a central helper instead. It simplifies the reactors, removes some hairy macro-generated code and introduces a convenient hook point to modify reactor behavior. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-1-0b9e51919ea8@linutronix.de Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 11 +++++++++-- include/rv/da_monitor.h | 35 ++++++++++------------------------- include/rv/ltl_monitor.h | 18 +++++------------- 3 files changed, 24 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/rv.h b/include/linux/rv.h index 9520aab34bcb..b567b0191e67 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -88,7 +88,7 @@ union rv_task_monitor { struct rv_reactor { const char *name; const char *description; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); struct list_head list; }; #endif @@ -102,7 +102,7 @@ struct rv_monitor { void (*reset)(void); #ifdef CONFIG_RV_REACTORS struct rv_reactor *reactor; - __printf(1, 2) void (*react)(const char *msg, ...); + __printf(1, 0) void (*react)(const char *msg, va_list args); #endif struct list_head list; struct rv_monitor *parent; @@ -119,11 +119,18 @@ void rv_put_task_monitor_slot(int slot); bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); +__printf(2, 3) +void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else static inline bool rv_reacting_on(void) { return false; } + +__printf(2, 3) +static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) 
+{ +} #endif /* CONFIG_RV_REACTORS */ #endif /* CONFIG_RV */ diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h index 17fa4f6e5ea6..0cef64366538 100644 --- a/include/rv/da_monitor.h +++ b/include/rv/da_monitor.h @@ -16,34 +16,19 @@ #include #include -#ifdef CONFIG_RV_REACTORS - -#define DECLARE_RV_REACTING_HELPERS(name, type) \ -static void cond_react_##name(type curr_state, type event) \ -{ \ - if (!rv_reacting_on() || !rv_##name.react) \ - return; \ - rv_##name.react("rv: monitor %s does not allow event %s on state %s\n", \ - #name, \ - model_get_event_name_##name(event), \ - model_get_state_name_##name(curr_state)); \ -} - -#else /* CONFIG_RV_REACTOR */ - -#define DECLARE_RV_REACTING_HELPERS(name, type) \ -static void cond_react_##name(type curr_state, type event) \ -{ \ - return; \ -} -#endif - /* * Generic helpers for all types of deterministic automata monitors. */ #define DECLARE_DA_MON_GENERIC_HELPERS(name, type) \ \ -DECLARE_RV_REACTING_HELPERS(name, type) \ +static void react_##name(type curr_state, type event) \ +{ \ + rv_react(&rv_##name, \ + "rv: monitor %s does not allow event %s on state %s\n", \ + #name, \ + model_get_event_name_##name(event), \ + model_get_state_name_##name(curr_state)); \ +} \ \ /* \ * da_monitor_reset_##name - reset a monitor and setting it to init state \ @@ -126,7 +111,7 @@ da_event_##name(struct da_monitor *da_mon, enum events_##name event) \ for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ next_state = model_get_next_state_##name(curr_state, event); \ if (next_state == INVALID_STATE) { \ - cond_react_##name(curr_state, event); \ + react_##name(curr_state, event); \ trace_error_##name(model_get_state_name_##name(curr_state), \ model_get_event_name_##name(event)); \ return false; \ @@ -165,7 +150,7 @@ static inline bool da_event_##name(struct da_monitor *da_mon, struct task_struct for (int i = 0; i < MAX_DA_RETRY_RACING_EVENTS; i++) { \ next_state = model_get_next_state_##name(curr_state, event); \ if 
(next_state == INVALID_STATE) { \ - cond_react_##name(curr_state, event); \ + react_##name(curr_state, event); \ trace_error_##name(tsk->pid, \ model_get_state_name_##name(curr_state), \ model_get_event_name_##name(event)); \ diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h index 5368cf5fd623..00c42b36f961 100644 --- a/include/rv/ltl_monitor.h +++ b/include/rv/ltl_monitor.h @@ -16,21 +16,12 @@ #error "Please include $(MODEL_NAME).h generated by rvgen" #endif -#ifdef CONFIG_RV_REACTORS #define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME) -static struct rv_monitor RV_MONITOR_NAME; -static void rv_cond_react(struct task_struct *task) -{ - if (!rv_reacting_on() || !RV_MONITOR_NAME.react) - return; - RV_MONITOR_NAME.react("rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n", - task->comm, task->pid); -} +#ifdef CONFIG_RV_REACTORS +static struct rv_monitor RV_MONITOR_NAME; #else -static void rv_cond_react(struct task_struct *task) -{ -} +extern struct rv_monitor RV_MONITOR_NAME; #endif static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT; @@ -98,7 +89,8 @@ static void ltl_monitor_destroy(void) static void ltl_illegal_state(struct task_struct *task, struct ltl_monitor *mon) { CONCATENATE(trace_error_, MONITOR_NAME)(task); - rv_cond_react(task); + rv_react(&RV_MONITOR_NAME, "rv: "__stringify(MONITOR_NAME)": %s[%d]: violation detected\n", + task->comm, task->pid); } static void ltl_attempt_start(struct task_struct *task, struct ltl_monitor *mon) -- cgit v1.2.3 From 68f63cea46d3a410a41d9ab74d338038a22bc2ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Oct 2025 07:51:57 +0200 Subject: rv: Make rv_reacting_on() static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no external users left. 
Signed-off-by: Thomas Weißschuh Link: https://lore.kernel.org/r/20251014-rv-lockdep-v1-2-0b9e51919ea8@linutronix.de Signed-off-by: Gabriele Monaco --- include/linux/rv.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/rv.h b/include/linux/rv.h index b567b0191e67..92fd467547e7 100644 --- a/include/linux/rv.h +++ b/include/linux/rv.h @@ -116,17 +116,11 @@ int rv_get_task_monitor_slot(void); void rv_put_task_monitor_slot(int slot); #ifdef CONFIG_RV_REACTORS -bool rv_reacting_on(void); int rv_unregister_reactor(struct rv_reactor *reactor); int rv_register_reactor(struct rv_reactor *reactor); __printf(2, 3) void rv_react(struct rv_monitor *monitor, const char *msg, ...); #else -static inline bool rv_reacting_on(void) -{ - return false; -} - __printf(2, 3) static inline void rv_react(struct rv_monitor *monitor, const char *msg, ...) { -- cgit v1.2.3 From 69f3474a01e9867dd99fc4b703fa834ea1835c7d Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Tue, 11 Nov 2025 13:08:49 +0000 Subject: ASoC: cs35l56: Add control to read CAL_SET_STATUS Create an ALSA control to read the value of the firmware CAL_SET_STATUS control. This reports whether the firmware is using a calibration blob or the default calibration from the .bin file. The firmware only reports a valid value in this register while audio is actually playing and the internal PLL is locked to the audio clock. Otherwise it returns a status of "unknown". 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251111130850.513969-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 0a740a99ad31..bd13958bf19d 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -16,6 +16,8 @@ #include #include +struct snd_ctl_elem_value; + #define CS35L56_DEVID 0x0000000 #define CS35L56_REVID 0x0000004 #define CS35L56_RELID 0x000000C @@ -268,6 +270,10 @@ #define CS35L56_CAL_STATUS_SUCCESS 1 #define CS35L56_CAL_STATUS_OUT_OF_RANGE 3 +#define CS35L56_CAL_SET_STATUS_UNKNOWN 0 +#define CS35L56_CAL_SET_STATUS_DEFAULT 1 +#define CS35L56_CAL_SET_STATUS_SET 2 + #define CS35L56_CONTROL_PORT_READY_US 2200 #define CS35L56_HALO_STATE_POLL_US 1000 #define CS35L56_HALO_STATE_TIMEOUT_US 250000 @@ -363,6 +369,7 @@ extern const struct regmap_config cs35l63_regmap_i2c; extern const struct regmap_config cs35l63_regmap_sdw; extern const struct cirrus_amp_cal_controls cs35l56_calibration_controls; +extern const char * const cs35l56_cal_set_status_text[3]; extern const char * const cs35l56_tx_input_texts[CS35L56_NUM_INPUT_SRC]; extern const unsigned int cs35l56_tx_input_values[CS35L56_NUM_INPUT_SRC]; @@ -396,6 +403,8 @@ ssize_t cs35l56_cal_data_debugfs_write(struct cs35l56_base *cs35l56_base, void cs35l56_create_cal_debugfs(struct cs35l56_base *cs35l56_base, const struct cs35l56_cal_debugfs_fops *fops); void cs35l56_remove_cal_debugfs(struct cs35l56_base *cs35l56_base); +int cs35l56_cal_set_status_get(struct cs35l56_base *cs35l56_base, + struct snd_ctl_elem_value *uvalue); int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base, bool *fw_missing, unsigned int *fw_version); void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp); -- cgit v1.2.3 From 32172cf3cb543a04c41a1677c97a38e60cad05b6 Mon Sep 17 00:00:00 2001 From: Richard 
Fitzgerald Date: Tue, 11 Nov 2025 13:08:50 +0000 Subject: ASoC: cs35l56: Allow restoring factory calibration through ALSA control Add an ALSA control (CAL_DATA) that can be used to restore amp calibration, instead of using debugfs. A readback control (CAL_DATA_RB) is also added for factory testing. On ChromeOS the process that restores amp calibration from NVRAM has limited permissions and cannot access debugfs. It requires an ALSA control that it can write the calibration blob into. ChromeOS also restricts access to ALSA controls, which avoids the risk of accidental or malicious overwriting of good calibration data with bad data. As this control is not needed for normal Linux-based distros it is a Kconfig option. A separate control, CAL_DATA_RB, provides a readback of the current calibration data, which could be either from a write to CAL_DATA or the result of factory production-line calibration. The write and read are intentionally separate controls to defeat "dumb" save-and-restore tools like alsa-restore that assume it is safe to save all control values and write them back in any order at some undefined future time. Such behavior carries the risk of restoring stale or bad data over the top of good data. 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251111130850.513969-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index bd13958bf19d..883f6a7e50aa 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -388,6 +388,8 @@ int cs35l56_runtime_suspend_common(struct cs35l56_base *cs35l56_base); int cs35l56_runtime_resume_common(struct cs35l56_base *cs35l56_base, bool is_soundwire); void cs35l56_init_cs_dsp(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp); int cs35l56_get_calibration(struct cs35l56_base *cs35l56_base); +int cs35l56_stash_calibration(struct cs35l56_base *cs35l56_base, + const struct cirrus_amp_cal_data *data); ssize_t cs35l56_calibrate_debugfs_write(struct cs35l56_base *cs35l56_base, const char __user *from, size_t count, loff_t *ppos); -- cgit v1.2.3 From c07a491c1b735e0c27454ea5c27a446d43401b1e Mon Sep 17 00:00:00 2001 From: David Wei Date: Fri, 31 Oct 2025 19:24:48 -0700 Subject: net: export netdev_get_by_index_lock() Need to call netdev_get_by_index_lock() from io_uring/zcrx.c, but it is currently private to net. Export the function in linux/netdevice.h. 
Signed-off-by: David Wei Acked-by: Jakub Kicinski Signed-off-by: Jens Axboe --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d1a687444b27..77c46a2823ec 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3401,6 +3401,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, -- cgit v1.2.3 From 3b521bf8c51246466e2c337f1f2b60acfdfe82d6 Mon Sep 17 00:00:00 2001 From: Laurentiu Mihalcea Date: Tue, 4 Nov 2025 04:02:55 -0800 Subject: dt-bindings: clock: document 8ULP's SIM LPAV Add documentation for i.MX8ULP's SIM LPAV module. 
Reviewed-by: Krzysztof Kozlowski Reviewed-by: Daniel Baluta Signed-off-by: Laurentiu Mihalcea Link: https://lore.kernel.org/r/20251104120301.913-3-laurentiumihalcea111@gmail.com Signed-off-by: Abel Vesa --- include/dt-bindings/clock/imx8ulp-clock.h | 5 +++++ include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h | 16 ++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h (limited to 'include') diff --git a/include/dt-bindings/clock/imx8ulp-clock.h b/include/dt-bindings/clock/imx8ulp-clock.h index 827404fadf5c..c62d84d093a9 100644 --- a/include/dt-bindings/clock/imx8ulp-clock.h +++ b/include/dt-bindings/clock/imx8ulp-clock.h @@ -255,4 +255,9 @@ #define IMX8ULP_CLK_PCC5_END 56 +/* LPAV SIM */ +#define IMX8ULP_CLK_SIM_LPAV_HIFI_CORE 0 +#define IMX8ULP_CLK_SIM_LPAV_HIFI_PBCLK 1 +#define IMX8ULP_CLK_SIM_LPAV_HIFI_PLAT 2 + #endif diff --git a/include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h b/include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h new file mode 100644 index 000000000000..adf95bb26d21 --- /dev/null +++ b/include/dt-bindings/reset/fsl,imx8ulp-sim-lpav.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright 2025 NXP + */ + +#ifndef DT_BINDING_RESET_IMX8ULP_SIM_LPAV_H +#define DT_BINDING_RESET_IMX8ULP_SIM_LPAV_H + +#define IMX8ULP_SIM_LPAV_HIFI4_DSP_DBG_RST 0 +#define IMX8ULP_SIM_LPAV_HIFI4_DSP_RST 1 +#define IMX8ULP_SIM_LPAV_HIFI4_DSP_STALL 2 +#define IMX8ULP_SIM_LPAV_DSI_RST_BYTE_N 3 +#define IMX8ULP_SIM_LPAV_DSI_RST_ESC_N 4 +#define IMX8ULP_SIM_LPAV_DSI_RST_DPI_N 5 + +#endif /* DT_BINDING_RESET_IMX8ULP_SIM_LPAV_H */ -- cgit v1.2.3 From 781f60e45bdfe351aad692ac0fa89e36f8bf4a36 Mon Sep 17 00:00:00 2001 From: Conor Dooley Date: Mon, 10 Nov 2025 11:23:50 +0000 Subject: reset: mpfs: add non-auxiliary bus probing While the auxiliary bus was a nice bandaid, and meant that re-writing the representation of the clock regions in devicetree was not required, it has run its 
course. The "mss_top_sysreg" region that contains the clock and reset regions, also contains pinctrl and an interrupt controller, so the time has come to rewrite the devicetree and probe the reset controller from an mfd devicetree node, rather than implement those drivers using the auxiliary bus. Wanting to avoid propagating this naive/incorrect description of the hardware to the new pic64gx SoC is a major motivating factor here. Reviewed-by: Philipp Zabel Acked-by: Philipp Zabel Signed-off-by: Conor Dooley --- include/soc/microchip/mpfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/soc/microchip/mpfs.h b/include/soc/microchip/mpfs.h index 0bd67e10b704..ec04c98a8b63 100644 --- a/include/soc/microchip/mpfs.h +++ b/include/soc/microchip/mpfs.h @@ -14,6 +14,7 @@ #include #include +#include struct mpfs_sys_controller; @@ -44,7 +45,7 @@ struct mtd_info *mpfs_sys_controller_get_flash(struct mpfs_sys_controller *mpfs_ #if IS_ENABLED(CONFIG_MCHP_CLK_MPFS) #if IS_ENABLED(CONFIG_RESET_POLARFIRE_SOC) -int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base); +int mpfs_reset_controller_register(struct device *clk_dev, struct regmap *map); #else static inline int mpfs_reset_controller_register(struct device *clk_dev, void __iomem *base) { return 0; } #endif /* if IS_ENABLED(CONFIG_RESET_POLARFIRE_SOC) */ -- cgit v1.2.3 From cb46a58d77e5b433e9f4538faaa2a73970157e8d Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Oct 2025 03:44:04 -0700 Subject: efi/memattr: Convert efi_memattr_init() return type to void The efi_memattr_init() function's return values (0 and -ENOMEM) are never checked by callers. Convert the function to return void since the return status is unused.
Signed-off-by: Breno Leitao Acked-by: Ard Biesheuvel Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..0b9eb3d2ff97 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -772,7 +772,7 @@ extern unsigned long efi_mem_attr_table; */ typedef int (*efi_memattr_perm_setter)(struct mm_struct *, efi_memory_desc_t *, bool); -extern int efi_memattr_init(void); +extern void efi_memattr_init(void); extern int efi_memattr_apply_permissions(struct mm_struct *mm, efi_memattr_perm_setter fn); -- cgit v1.2.3 From a2860501203cf7a2116adf3bb4e4c456c5750872 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 15 Oct 2025 22:56:37 +0200 Subject: efi/runtime-wrappers: Keep track of the efi_runtime_lock owner The EFI runtime wrappers use a file local semaphore to serialize access to the EFI runtime services. This means that any calls to the arch wrappers around the runtime services will also be serialized, removing the need for redundant locking. For robustness, add a facility that allows those arch wrappers to assert that the semaphore was taken by the current task. 
Signed-off-by: Ard Biesheuvel Acked-by: Catalin Marinas Signed-off-by: Catalin Marinas --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index a98cc39e7aaa..b23ff8b83219 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1126,6 +1126,8 @@ static inline bool efi_runtime_disabled(void) { return true; } extern void efi_call_virt_check_flags(unsigned long flags, const void *caller); extern unsigned long efi_call_virt_save_flags(void); +void efi_runtime_assert_lock_held(void); + enum efi_secureboot_mode { efi_secureboot_mode_unset, efi_secureboot_mode_unknown, -- cgit v1.2.3 From 3d176751e541362ff40c2478d6a2de41f8c62318 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:17 -0800 Subject: lib/crypto: polyval: Add POLYVAL library Add support for POLYVAL to lib/crypto/. This will replace the polyval crypto_shash algorithm and its use in the hctr2 template, simplifying the code and reducing overhead. Specifically, this commit introduces the POLYVAL library API and a generic implementation of it. Later commits will migrate the existing architecture-optimized implementations of POLYVAL into lib/crypto/ and add a KUnit test suite. I've also rewritten the generic implementation completely, using a more modern approach instead of the traditional table-based approach. It's now constant-time, requires no precomputation or dynamic memory allocations, decreases the per-key memory usage from 4096 bytes to 16 bytes, and is faster than the old polyval-generic even on bulk data reusing the same key (at least on x86_64, where I measured 15% faster). We should do this for GHASH too, but for now just do it for POLYVAL. 
Reviewed-by: Ard Biesheuvel Tested-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/polyval.h | 171 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index d2e63743e592..5ba4c248cad1 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -1,14 +1,179 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Common values for the Polyval hash algorithm + * POLYVAL library API * - * Copyright 2021 Google LLC + * Copyright 2025 Google LLC */ #ifndef _CRYPTO_POLYVAL_H #define _CRYPTO_POLYVAL_H +#include +#include + #define POLYVAL_BLOCK_SIZE 16 #define POLYVAL_DIGEST_SIZE 16 +/** + * struct polyval_elem - An element of the POLYVAL finite field + * @bytes: View of the element as a byte array (unioned with @lo and @hi) + * @lo: The low 64 terms of the element's polynomial + * @hi: The high 64 terms of the element's polynomial + * + * This represents an element of the finite field GF(2^128), using the POLYVAL + * convention: little-endian byte order and natural bit order. + */ +struct polyval_elem { + union { + u8 bytes[POLYVAL_BLOCK_SIZE]; + struct { + __le64 lo; + __le64 hi; + }; + }; +}; + +/** + * struct polyval_key - Prepared key for POLYVAL + * + * This may contain just the raw key H, or it may contain precomputed key + * powers, depending on the platform's POLYVAL implementation. Use + * polyval_preparekey() to initialize this. + */ +struct polyval_key { +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#error "Unhandled arch" +#else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ + /** @h: The hash key H */ + struct polyval_elem h; +#endif /* !CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ +}; + +/** + * struct polyval_ctx - Context for computing a POLYVAL value + * @key: Pointer to the prepared POLYVAL key. 
The user of the API is + * responsible for ensuring that the key lives as long as the context. + * @acc: The accumulator + * @partial: Number of data bytes processed so far modulo POLYVAL_BLOCK_SIZE + */ +struct polyval_ctx { + const struct polyval_key *key; + struct polyval_elem acc; + size_t partial; +}; + +/** + * polyval_preparekey() - Prepare a POLYVAL key + * @key: (output) The key structure to initialize + * @raw_key: The raw hash key + * + * Initialize a POLYVAL key structure from a raw key. This may be a simple + * copy, or it may involve precomputing powers of the key, depending on the + * platform's POLYVAL implementation. + * + * Context: Any context. + */ +#ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]); + +#else +static inline void polyval_preparekey(struct polyval_key *key, + const u8 raw_key[POLYVAL_BLOCK_SIZE]) +{ + /* Just a simple copy, so inline it. */ + memcpy(key->h.bytes, raw_key, POLYVAL_BLOCK_SIZE); +} #endif + +/** + * polyval_init() - Initialize a POLYVAL context for a new message + * @ctx: The context to initialize + * @key: The key to use. Note that a pointer to the key is saved in the + * context, so the key must live at least as long as the context. + */ +static inline void polyval_init(struct polyval_ctx *ctx, + const struct polyval_key *key) +{ + *ctx = (struct polyval_ctx){ .key = key }; +} + +/** + * polyval_import_blkaligned() - Import a POLYVAL accumulator value + * @ctx: The context to initialize + * @key: The key to import. Note that a pointer to the key is saved in the + * context, so the key must live at least as long as the context. + * @acc: The accumulator value to import. + * + * This imports an accumulator that was saved by polyval_export_blkaligned(). + * The same key must be used. 
+ */ +static inline void +polyval_import_blkaligned(struct polyval_ctx *ctx, + const struct polyval_key *key, + const struct polyval_elem *acc) +{ + *ctx = (struct polyval_ctx){ .key = key, .acc = *acc }; +} + +/** + * polyval_export_blkaligned() - Export a POLYVAL accumulator value + * @ctx: The context to export the accumulator value from + * @acc: (output) The exported accumulator value + * + * This exports the accumulator from a POLYVAL context. The number of data + * bytes processed so far must be a multiple of POLYVAL_BLOCK_SIZE. + */ +static inline void polyval_export_blkaligned(const struct polyval_ctx *ctx, + struct polyval_elem *acc) +{ + *acc = ctx->acc; +} + +/** + * polyval_update() - Update a POLYVAL context with message data + * @ctx: The context to update; must have been initialized + * @data: The message data + * @len: The data length in bytes. Doesn't need to be block-aligned. + * + * This can be called any number of times. + * + * Context: Any context. + */ +void polyval_update(struct polyval_ctx *ctx, const u8 *data, size_t len); + +/** + * polyval_final() - Finish computing a POLYVAL value + * @ctx: The context to finalize + * @out: The output value + * + * If the total data length isn't a multiple of POLYVAL_BLOCK_SIZE, then the + * final block is automatically zero-padded. + * + * After finishing, this zeroizes @ctx. So the caller does not need to do it. + * + * Context: Any context. + */ +void polyval_final(struct polyval_ctx *ctx, u8 out[POLYVAL_BLOCK_SIZE]); + +/** + * polyval() - Compute a POLYVAL value + * @key: The prepared key + * @data: The message data + * @len: The data length in bytes. Doesn't need to be block-aligned. + * @out: The output value + * + * Context: Any context. 
+ */ +static inline void polyval(const struct polyval_key *key, + const u8 *data, size_t len, + u8 out[POLYVAL_BLOCK_SIZE]) +{ + struct polyval_ctx ctx; + + polyval_init(&ctx, key); + polyval_update(&ctx, data, len); + polyval_final(&ctx, out); +} + +#endif /* _CRYPTO_POLYVAL_H */ -- cgit v1.2.3 From 37919e239ebb2cba573cca56292f7c39fa6d7415 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:19 -0800 Subject: lib/crypto: arm64/polyval: Migrate optimized code into library Migrate the arm64 implementation of POLYVAL into lib/crypto/, wiring it up to the POLYVAL library interface. This makes the POLYVAL library be properly optimized on arm64. This drops the arm64 optimizations of polyval in the crypto_shash API. That's fine, since polyval will be removed from crypto_shash entirely since it is unneeded there. But even if it comes back, the crypto_shash API could just be implemented on top of the library API, as usual. Adjust the names and prototypes of the assembly functions to align more closely with the rest of the library code. Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/polyval.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index 5ba4c248cad1..f8aaf4275fbd 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -39,10 +39,18 @@ struct polyval_elem { * This may contain just the raw key H, or it may contain precomputed key * powers, depending on the platform's POLYVAL implementation. Use * polyval_preparekey() to initialize this. + * + * By H^i we mean H^(i-1) * H * x^-128, with base case H^1 = H. I.e. the + * exponentiation repeats the POLYVAL dot operation, with its "extra" x^-128. 
*/ struct polyval_key { #ifdef CONFIG_CRYPTO_LIB_POLYVAL_ARCH +#ifdef CONFIG_ARM64 + /** @h_powers: Powers of the hash key H^8 through H^1 */ + struct polyval_elem h_powers[8]; +#else #error "Unhandled arch" +#endif #else /* CONFIG_CRYPTO_LIB_POLYVAL_ARCH */ /** @h: The hash key H */ struct polyval_elem h; -- cgit v1.2.3 From 4d8da35579daad0392d238460ed7e9629d49ca35 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 9 Nov 2025 15:47:20 -0800 Subject: lib/crypto: x86/polyval: Migrate optimized code into library Migrate the x86_64 implementation of POLYVAL into lib/crypto/, wiring it up to the POLYVAL library interface. This makes the POLYVAL library be properly optimized on x86_64. This drops the x86_64 optimizations of polyval in the crypto_shash API. That's fine, since polyval will be removed from crypto_shash entirely since it is unneeded there. But even if it comes back, the crypto_shash API could just be implemented on top of the library API, as usual. Adjust the names and prototypes of the assembly functions to align more closely with the rest of the library code. Also replace a movaps instruction with movups to remove the assumption that the key struct is 16-byte aligned. Users can still align the key if they want (and at least in this case, movups is just as fast as movaps), but it's inconvenient to require it. 
Reviewed-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20251109234726.638437-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/polyval.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/crypto/polyval.h b/include/crypto/polyval.h index f8aaf4275fbd..b28b8ef11353 100644 --- a/include/crypto/polyval.h +++ b/include/crypto/polyval.h @@ -48,6 +48,9 @@ struct polyval_key { #ifdef CONFIG_ARM64 /** @h_powers: Powers of the hash key H^8 through H^1 */ struct polyval_elem h_powers[8]; +#elif defined(CONFIG_X86) + /** @h_powers: Powers of the hash key H^8 through H^1 */ + struct polyval_elem h_powers[8]; #else #error "Unhandled arch" #endif -- cgit v1.2.3 From 693d1eaca940f277af24c74873ef2313816ff444 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Tue, 11 Nov 2025 18:58:35 +0000 Subject: coresight: Change device mode to atomic type The device mode is defined as a local_t type, which cannot guarantee SMP-safe access. Change it to an atomic type and impose acquire/release ordering, which ensures SMP-safe synchronisation and ordering between the mode setting and the relevant operations. Fixes: 22fd532eaa0c ("coresight: etm3x: adding operation mode for etm_enable()") Reviewed-by: Mike Leach Tested-by: James Clark Signed-off-by: Leo Yan Signed-off-by: Suzuki K Poulose Link: https://lore.kernel.org/r/20251111-arm_coresight_power_management_fix-v6-1-f55553b6c8b3@arm.com --- include/linux/coresight.h | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 56d0108658db..2b48be97fcd0 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -251,15 +251,11 @@ struct coresight_trace_id_map { * by @coresight_ops. * @access: Device i/o access abstraction for this device. * @dev: The device entity associated to this component. - * @mode: This tracer's mode, i.e sysFS, Perf or disabled. 
This is - * actually an 'enum cs_mode', but is stored in an atomic type. - * This is always accessed through local_read() and local_set(), - * but wherever it's done from within the Coresight device's lock, - * a non-atomic read would also work. This is the main point of - * synchronisation between code happening inside the sysfs mode's - * coresight_mutex and outside when running in Perf mode. A compare - * and exchange swap is done to atomically claim one mode or the - * other. + * @mode: The device mode, i.e sysFS, Perf or disabled. This is actually + * an 'enum cs_mode' but stored in an atomic type. Access is always + * through atomic APIs, ensuring SMP-safe synchronisation between + * racing from sysFS and Perf mode. A compare-and-exchange + * operation is done to atomically claim one mode or the other. * @refcnt: keep track of what is in use. Only access this outside of the * device's spinlock when the coresight_mutex held and mode == * CS_MODE_SYSFS. Otherwise it must be accessed from inside the @@ -288,7 +284,7 @@ struct coresight_device { const struct coresight_ops *ops; struct csdev_access access; struct device dev; - local_t mode; + atomic_t mode; int refcnt; bool orphan; /* sink specific fields */ @@ -624,13 +620,14 @@ static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) static inline bool coresight_take_mode(struct coresight_device *csdev, enum cs_mode new_mode) { - return local_cmpxchg(&csdev->mode, CS_MODE_DISABLED, new_mode) == - CS_MODE_DISABLED; + int curr = CS_MODE_DISABLED; + + return atomic_try_cmpxchg_acquire(&csdev->mode, &curr, new_mode); } static inline enum cs_mode coresight_get_mode(struct coresight_device *csdev) { - return local_read(&csdev->mode); + return atomic_read_acquire(&csdev->mode); } static inline void coresight_set_mode(struct coresight_device *csdev, @@ -646,7 +643,7 @@ static inline void coresight_set_mode(struct coresight_device *csdev, WARN(new_mode != CS_MODE_DISABLED && current_mode != 
CS_MODE_DISABLED && current_mode != new_mode, "Device already in use\n"); - local_set(&csdev->mode, new_mode); + atomic_set_release(&csdev->mode, new_mode); } struct coresight_device *coresight_register(struct coresight_desc *desc); -- cgit v1.2.3 From 5422318e27d7a4662701f518e2e51b9f73a331b1 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 11 Nov 2025 14:24:48 +0200 Subject: net/mlx5: Expose definition for 1600Gbps link mode This patch exposes a new link mode for 1600Gbps, utilizing 8 lanes at 200Gbps per lane. Co-developed-by: Yael Chemla Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1762863888-1092798-1-git-send-email-tariqt@nvidia.com Signed-off-by: Leon Romanovsky --- include/linux/mlx5/port.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/port.h b/include/linux/mlx5/port.h index 58770b86f793..1df9d9a57bbc 100644 --- a/include/linux/mlx5/port.h +++ b/include/linux/mlx5/port.h @@ -112,6 +112,7 @@ enum mlx5e_ext_link_mode { MLX5E_400GAUI_2_400GBASE_CR2_KR2 = 17, MLX5E_800GAUI_8_800GBASE_CR8_KR8 = 19, MLX5E_800GAUI_4_800GBASE_CR4_KR4 = 20, + MLX5E_1600TAUI_8_1600TBASE_CR8_KR8 = 23, MLX5E_EXT_LINK_MODES_NUMBER, }; -- cgit v1.2.3 From 4be9f3cc582a24b08f6580f65fa48a4d70332ab5 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:43 -0500 Subject: filelock: rework the __break_lease API to use flags Currently __break_lease takes both a type and an openmode. With the addition of directory leases, that makes less sense. Declare a set of LEASE_BREAK_* flags that can be used to control how lease breaks work instead of requiring a type and an openmode. 
Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-2-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/filelock.h | 52 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index c2ce8ba05d06..47da6aa28d8d 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -212,7 +212,14 @@ int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); void locks_init_lease(struct file_lease *); void locks_free_lease(struct file_lease *fl); struct file_lease *locks_alloc_lease(void); -int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); + +#define LEASE_BREAK_LEASE BIT(0) // break leases and delegations +#define LEASE_BREAK_DELEG BIT(1) // break delegations only +#define LEASE_BREAK_LAYOUT BIT(2) // break layouts only +#define LEASE_BREAK_NONBLOCK BIT(3) // non-blocking break +#define LEASE_BREAK_OPEN_RDONLY BIT(4) // readonly open event + +int __break_lease(struct inode *inode, unsigned int flags); void lease_get_mtime(struct inode *, struct timespec64 *time); int generic_setlease(struct file *, int, struct file_lease **, void **priv); int kernel_setlease(struct file *, int, struct file_lease **, void **); @@ -367,7 +374,7 @@ static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *f return -ENOLCK; } -static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) +static inline int __break_lease(struct inode *inode, unsigned int flags) { return 0; } @@ -428,6 +435,17 @@ static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) } #ifdef CONFIG_FILE_LOCKING +static inline unsigned int openmode_to_lease_flags(unsigned int mode) +{ + unsigned int flags = 0; + + if ((mode & O_ACCMODE) == O_RDONLY) + flags |= 
LEASE_BREAK_OPEN_RDONLY; + if (mode & O_NONBLOCK) + flags |= LEASE_BREAK_NONBLOCK; + return flags; +} + static inline int break_lease(struct inode *inode, unsigned int mode) { struct file_lock_context *flctx; @@ -443,11 +461,11 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return 0; smp_mb(); if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_LEASE); + return __break_lease(inode, LEASE_BREAK_LEASE | openmode_to_lease_flags(mode)); return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { struct file_lock_context *flctx; @@ -461,8 +479,10 @@ static inline int break_deleg(struct inode *inode, unsigned int mode) if (!flctx) return 0; smp_mb(); - if (!list_empty_careful(&flctx->flc_lease)) - return __break_lease(inode, mode, FL_DELEG); + if (!list_empty_careful(&flctx->flc_lease)) { + flags |= LEASE_BREAK_DELEG; + return __break_lease(inode, flags); + } return 0; } @@ -470,7 +490,7 @@ static inline int try_break_deleg(struct inode *inode, struct inode **delegated_ { int ret; - ret = break_deleg(inode, O_WRONLY|O_NONBLOCK); + ret = break_deleg(inode, LEASE_BREAK_NONBLOCK); if (ret == -EWOULDBLOCK && delegated_inode) { *delegated_inode = inode; ihold(inode); @@ -482,7 +502,7 @@ static inline int break_deleg_wait(struct inode **delegated_inode) { int ret; - ret = break_deleg(*delegated_inode, O_WRONLY); + ret = break_deleg(*delegated_inode, 0); iput(*delegated_inode); *delegated_inode = NULL; return ret; @@ -491,20 +511,24 @@ static inline int break_deleg_wait(struct inode **delegated_inode) static inline int break_layout(struct inode *inode, bool wait) { smp_mb(); - if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) - return __break_lease(inode, - wait ? 
O_WRONLY : O_WRONLY | O_NONBLOCK, - FL_LAYOUT); + if (inode->i_flctx && !list_empty_careful(&inode->i_flctx->flc_lease)) { + unsigned int flags = LEASE_BREAK_LAYOUT; + + if (!wait) + flags |= LEASE_BREAK_NONBLOCK; + + return __break_lease(inode, flags); + } return 0; } #else /* !CONFIG_FILE_LOCKING */ -static inline int break_lease(struct inode *inode, unsigned int mode) +static inline int break_lease(struct inode *inode, bool wait) { return 0; } -static inline int break_deleg(struct inode *inode, unsigned int mode) +static inline int break_deleg(struct inode *inode, unsigned int flags) { return 0; } -- cgit v1.2.3 From 6976ed2dd0d59086d16d853ac9b21776be68aaad Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:44 -0500 Subject: filelock: add struct delegated_inode The current API requires a pointer to an inode pointer. It's easy for callers to get this wrong. Add a new delegated_inode structure and use that to pass back any inode that needs to be waited on. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-3-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/filelock.h | 36 +++++++++++++++++++++++++++--------- include/linux/fs.h | 9 +++++---- include/linux/xattr.h | 4 ++-- 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 47da6aa28d8d..208d108df2d7 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -486,25 +486,35 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +struct delegated_inode { + struct inode *di_inode; +}; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return di->di_inode; +} + +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *di) { int ret; ret 
= break_deleg(inode, LEASE_BREAK_NONBLOCK); - if (ret == -EWOULDBLOCK && delegated_inode) { - *delegated_inode = inode; + if (ret == -EWOULDBLOCK && di) { + di->di_inode = inode; ihold(inode); } return ret; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *di) { int ret; - ret = break_deleg(*delegated_inode, 0); - iput(*delegated_inode); - *delegated_inode = NULL; + ret = break_deleg(di->di_inode, 0); + iput(di->di_inode); + di->di_inode = NULL; return ret; } @@ -523,6 +533,13 @@ static inline int break_layout(struct inode *inode, bool wait) } #else /* !CONFIG_FILE_LOCKING */ +struct delegated_inode { }; + +static inline bool is_delegated(struct delegated_inode *di) +{ + return false; +} + static inline int break_lease(struct inode *inode, bool wait) { return 0; @@ -533,12 +550,13 @@ static inline int break_deleg(struct inode *inode, unsigned int flags) return 0; } -static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode) +static inline int try_break_deleg(struct inode *inode, + struct delegated_inode *delegated_inode) { return 0; } -static inline int break_deleg_wait(struct inode **delegated_inode) +static inline int break_deleg_wait(struct delegated_inode *delegated_inode) { BUG(); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..909a88e3979d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -80,6 +80,7 @@ struct fs_context; struct fs_parameter_spec; struct file_kattr; struct iomap_ops; +struct delegated_inode; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2119,10 +2120,10 @@ int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, - struct dentry *, struct inode **); + struct dentry *, struct delegated_inode *); int 
vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, - struct inode **); + struct delegated_inode *); /** * struct renamedata - contains all information required for renaming @@ -2140,7 +2141,7 @@ struct renamedata { struct dentry *old_dentry; struct dentry *new_parent; struct dentry *new_dentry; - struct inode **delegated_inode; + struct delegated_inode *delegated_inode; unsigned int flags; } __randomize_layout; @@ -3071,7 +3072,7 @@ static inline int bmap(struct inode *inode, sector_t *block) #endif int notify_change(struct mnt_idmap *, struct dentry *, - struct iattr *, struct inode **); + struct iattr *, struct delegated_inode *); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 86b0d47984a1..64e9afe7d647 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -85,12 +85,12 @@ int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int, - struct inode **); + struct delegated_inode *); int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *, - const char *, struct inode **); + const char *, struct delegated_inode *); int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -- cgit v1.2.3 From e12d203b8c880061c0bf0339cad51e5851a33442 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:47 -0500 Subject: vfs: allow mkdir to wait for 
delegation break on parent In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a new delegated_inode parameter to vfs_mkdir. All of the existing callers set that to NULL for now, except for do_mkdirat which will properly block until the lease is gone. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-6-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 909a88e3979d..20bb4c8a4e8e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2114,7 +2114,7 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); + struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, -- cgit v1.2.3 From 4fa76319cd0cc97ca54ff71c94814dc5b1983ad2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:48 -0500 Subject: vfs: allow rmdir to wait for delegation break on parent In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a delegated_inode struct to vfs_rmdir() and populate that pointer with the parent inode if it's non-NULL. Most existing in-kernel callers pass in a NULL pointer. 
Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-7-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 20bb4c8a4e8e..12873214e1c7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2121,7 +2121,8 @@ int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); -int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); +int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, + struct delegated_inode *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); -- cgit v1.2.3 From 85bbffcad7307e2ca6136be657cc21b0e1c42241 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:50 -0500 Subject: vfs: clean up argument list for vfs_create() As Neil points out: "I would be in favour of dropping the "dir" arg because it is always d_inode(dentry->d_parent) which is stable." ...and... "Also *every* caller of vfs_create() passes ".excl = true". So maybe we don't need that arg at all." Drop both arguments from vfs_create() and fix up the callers. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-9-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 12873214e1c7..21876ef1fec9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2111,8 +2111,7 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, /* * VFS helper functions.. 
*/ -int vfs_create(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t, bool); +int vfs_create(struct mnt_idmap *, struct dentry *, umode_t); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, -- cgit v1.2.3 From c826229c6a82fe1fe7b7752692f87a881eb4b545 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:51 -0500 Subject: vfs: make vfs_create break delegations on parent directory In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a delegated_inode parameter to vfs_create. Most callers are converted to pass in NULL, but do_mknodat() is changed to wait for a delegation break if there is one. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-10-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 21876ef1fec9..83b05aec4e10 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2111,7 +2111,8 @@ bool inode_owner_or_capable(struct mnt_idmap *idmap, /* * VFS helper functions.. 
*/ -int vfs_create(struct mnt_idmap *, struct dentry *, umode_t); +int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, + struct delegated_inode *); struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, -- cgit v1.2.3 From e8960c1b2ee9ba75d65492b8e90e851d11e5f215 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:52 -0500 Subject: vfs: make vfs_mknod break delegations on parent directory In order to add directory delegation support, we need to break delegations on the parent whenever there is going to be a change in the directory. Add a new delegated_inode pointer to vfs_mknod() and have the appropriate callers wait when there is an outstanding delegation. All other callers just set the pointer to NULL. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-11-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 83b05aec4e10..1a5d86cfafaa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2116,7 +2116,7 @@ int vfs_create(struct mnt_idmap *, struct dentry *, umode_t, struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, struct delegated_inode *); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t, dev_t); + umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, @@ -2152,7 +2152,7 @@ static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, - WHITEOUT_DEV); + WHITEOUT_DEV, 
NULL); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, -- cgit v1.2.3 From 92bf53577f01aad988f7f39f69163b41f94cfb7d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:53 -0500 Subject: vfs: make vfs_symlink break delegations on parent dir In order to add directory delegation support, we must break delegations on the parent on any change to the directory. Add a delegated_inode parameter to vfs_symlink() and have it break the delegation. do_symlinkat() can then wait on the delegation break before proceeding. Reviewed-by: Jan Kara Reviewed-by: NeilBrown Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-12-52f3feebb2f2@kernel.org Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a5d86cfafaa..64323e618724 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2118,7 +2118,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *, int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t, struct delegated_inode *); int vfs_symlink(struct mnt_idmap *, struct inode *, - struct dentry *, const char *); + struct dentry *, const char *, struct delegated_inode *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct delegated_inode *); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *, -- cgit v1.2.3 From 1602bad16d7df82faca6d7c70821117684a66f49 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 11 Nov 2025 09:12:58 -0500 Subject: vfs: expose delegation support to userland Now that support for recallable directory delegations is available, expose this functionality to userland with new F_SETDELEG and F_GETDELEG commands for fcntl(). Note that this also allows userland to request a FL_DELEG type lease on files too. 
Userland applications that do will get signalled when there are metadata changes in addition to just data changes (which is a limitation of FL_LEASE leases). These commands accept a new "struct delegation" argument that contains a flags field for future expansion. Signed-off-by: Jeff Layton Link: https://patch.msgid.link/20251111-dir-deleg-ro-v6-17-52f3feebb2f2@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/filelock.h | 12 ++++++++++++ include/uapi/linux/fcntl.h | 11 +++++++++++ 2 files changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/filelock.h b/include/linux/filelock.h index 208d108df2d7..54b824c05299 100644 --- a/include/linux/filelock.h +++ b/include/linux/filelock.h @@ -159,6 +159,8 @@ int fcntl_setlk64(unsigned int, struct file *, unsigned int, int fcntl_setlease(unsigned int fd, struct file *filp, int arg); int fcntl_getlease(struct file *filp); +int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg); +int fcntl_getdeleg(struct file *filp, struct delegation *deleg); static inline bool lock_is_unlock(struct file_lock *fl) { @@ -278,6 +280,16 @@ static inline int fcntl_getlease(struct file *filp) return F_UNLCK; } +static inline int fcntl_setdeleg(unsigned int fd, struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + +static inline int fcntl_getdeleg(struct file *filp, struct delegation *deleg) +{ + return -EINVAL; +} + static inline bool lock_is_unlock(struct file_lock *fl) { return false; diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 3741ea1b73d8..008fac15e573 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -79,6 +79,17 @@ */ #define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET +/* Set/Get delegations */ +#define F_GETDELEG (F_LINUX_SPECIFIC_BASE + 15) +#define F_SETDELEG (F_LINUX_SPECIFIC_BASE + 16) + +/* Argument structure for F_GETDELEG and F_SETDELEG */ +struct delegation { + uint32_t 
d_flags; /* Must be 0 */ + uint16_t d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + uint16_t __pad; /* Must be 0 */ +}; + /* * Types of directory notifications that may be requested. */ -- cgit v1.2.3 From a3f8f8662771285511ae26c4c8d3ba1cd22159b9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 5 Nov 2025 14:39:45 +0100 Subject: power: always freeze efivarfs The efivarfs filesystems must always be frozen and thawed to resync variable state. Make it so. Link: https://patch.msgid.link/20251105-vorbild-zutreffen-fe00d1dd98db@brauner Signed-off-by: Christian Brauner --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 3ea98c6cce81..249a1da8440e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2689,6 +2689,7 @@ struct file_system_type { #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_LBS 128 /* FS supports LBS */ +#define FS_POWER_FREEZE 256 /* Always freeze on suspend/hibernate */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. 
*/ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; @@ -3606,7 +3607,7 @@ extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -void filesystems_freeze(void); +void filesystems_freeze(bool freeze_all); void filesystems_thaw(void); extern int dcache_dir_open(struct inode *, struct file *); -- cgit v1.2.3 From ad9c62bd8946621ed02ac94131a921222508a8bc Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Mon, 13 Oct 2025 18:59:01 +0000 Subject: KVM: arm64: VM exit to userspace to handle SEA When APEI fails to handle a stage-2 synchronous external abort (SEA), today KVM injects an asynchronous SError to the VCPU then resumes it, which usually results in unpleasant guest kernel panic. One major situation of guest SEA is when vCPU consumes recoverable uncorrected memory error (UER). Although SError and guest kernel panic effectively stops the propagation of corrupted memory, guest may re-use the corrupted memory if auto-rebooted; in worse case, guest boot may run into poisoned memory. So there is room to recover from an UER in a more graceful manner. Alternatively KVM can redirect the synchronous SEA event to VMM to - Reduce blast radius if possible. VMM can inject a SEA to VCPU via KVM's existing KVM_SET_VCPU_EVENTS API. If the memory poison consumption or fault is not from guest kernel, blast radius can be limited to the triggering thread in guest userspace, so VM can keep running. - Allow VMM to protect from future memory poison consumption by unmapping the page from stage-2, or to interrupt guest of the poisoned page so guest kernel can unmap it from stage-1 page table. 
- Allow VMM to track SEA events that VM customers care about, to restart VM when a certain number of distinct poison events have happened, to provide observability to customers in log management UI. Introduce a userspace-visible feature to enable VMM to handle SEA: - KVM_CAP_ARM_SEA_TO_USER. As the alternative fallback behavior when host APEI fails to claim a SEA, userspace can opt in to this new capability to let KVM exit to userspace during SEA if it is not owned by host. - KVM_EXIT_ARM_SEA. A new exit reason is introduced for this. KVM fills kvm_run.arm_sea with as much information as possible about the SEA, enabling VMM to emulate SEA to guest by itself. - Sanitized ESR_EL2. The general rule is to keep only the bits useful for userspace and relevant to guest memory. - Flags indicating if faulting guest physical address is valid. - Faulting guest physical and virtual addresses if valid. Signed-off-by: Jiaqi Yan Co-developed-by: Oliver Upton Signed-off-by: Oliver Upton Link: https://msgid.link/20251013185903.1372553-2-jiaqiyan@google.com Signed-off-by: Oliver Upton --- include/uapi/linux/kvm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..1e541193e98d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -179,6 +179,7 @@ struct kvm_xen_exit { #define KVM_EXIT_LOONGARCH_IOCSR 38 #define KVM_EXIT_MEMORY_FAULT 39 #define KVM_EXIT_TDX 40 +#define KVM_EXIT_ARM_SEA 41 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -473,6 +474,14 @@ struct kvm_run { } setup_event_notify; }; } tdx; + /* KVM_EXIT_ARM_SEA */ + struct { +#define KVM_EXIT_ARM_SEA_FLAG_GPA_VALID (1ULL << 0) + __u64 flags; + __u64 esr; + __u64 gva; + __u64 gpa; + } arm_sea; /* Fix the size of the union.
*/ char padding[256]; }; @@ -963,6 +972,7 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_ARM_SEA_TO_USER 245 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From 4e5cba5bb6f37ceaba6a2628a171cbede02f969c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 11 Nov 2025 22:29:08 -0800 Subject: RDMA/cm: Correct typedef and bad line warnings In include/rdma/ib_cm.h: Correct a typedef's kernel-doc notation by adding the 'typedef' keyword to it to avoid a warning. Add a leading " *" to a kernel-doc line to avoid a warning. Warning: ib_cm.h:289 function parameter 'ib_cm_handler' not described in 'int' Warning: ib_cm.h:289 expecting prototype for ib_cm_handler(). Prototype was for int() instead Warning: ib_cm.h:484 bad line: connection message in case duplicates are received. Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251112062908.2711007-1-rdunlap@infradead.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_cm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index 1fa3786f82f4..4808a355de41 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -271,7 +271,7 @@ struct ib_cm_event { #define CM_APR_ATTR_ID cpu_to_be16(0x001A) /** - * ib_cm_handler - User-defined callback to process communication events. + * typedef ib_cm_handler - User-defined callback to process communication events. * @cm_id: Communication identifier associated with the reported event. * @event: Information about the communication event. * @@ -482,7 +482,7 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, /** * ib_prepare_cm_mra - Prepares to send a message receipt acknowledgment to a - connection message in case duplicates are received. + * connection message in case duplicates are received. 
* @cm_id: Connection identifier associated with the connection message. */ int ib_prepare_cm_mra(struct ib_cm_id *cm_id); -- cgit v1.2.3 From 78f0e33cd6c939a555aa80dbed2fec6b333a7660 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 11 Nov 2025 06:28:15 +0000 Subject: fs/namespace: correctly handle errors returned by grab_requested_mnt_ns grab_requested_mnt_ns was changed to return error codes on failure, but its callers were not updated to check for error pointers, still checking only for a NULL return value. This commit updates the callers to use IS_ERR() or IS_ERR_OR_NULL() and PTR_ERR() to correctly check for and propagate errors. This also makes sure that the logic actually works and mount namespace file descriptors can be used to refer to mounts. Christian Brauner says: Rework the patch to be more ergonomic and in line with our overall error handling patterns. Fixes: 7b9d14af8777 ("fs: allow mount namespace fd") Cc: Christian Brauner Signed-off-by: Andrei Vagin Link: https://patch.msgid.link/20251111062815.2546189-1-avagin@google.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/uapi/linux/mount.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h index 7fa67c2031a5..5d3f8c9e3a62 100644 --- a/include/uapi/linux/mount.h +++ b/include/uapi/linux/mount.h @@ -197,7 +197,7 @@ struct statmount { */ struct mnt_id_req { __u32 size; - __u32 spare; + __u32 mnt_ns_fd; __u64 mnt_id; __u64 param; __u64 mnt_ns_id; -- cgit v1.2.3 From 12741624645e098b2234a5ae341045a97473caf1 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 22:20:24 +0100 Subject: fs: add iput_not_last() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105212025.807549-1-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h
b/include/linux/fs.h index 249a1da8440e..dd3b57cfadee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2824,6 +2824,7 @@ extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); +void iput_not_last(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); -- cgit v1.2.3 From 7e6cea5ae2f5e62112fce69acc07ee8b694b6dd0 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:52 -0800 Subject: docs: document iomap writeback's iomap_finish_folio_write() requirement Document that iomap_finish_folio_write() must be called after writeback on the range completes. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-4-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 8b1ac08c7474..a5032e456079 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -435,6 +435,10 @@ struct iomap_writeback_ops { * An existing mapping from a previous call to this method can be reused * by the file system if it is still valid. * + * If this succeeds, iomap_finish_folio_write() must be called once + * writeback completes for the range, regardless of whether the + * writeback succeeded or failed. + * * Returns the number of bytes processed or a negative errno. */ ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc, -- cgit v1.2.3 From 6b1fd2281fb0873ec56f8791d4e4898302070804 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:53 -0800 Subject: iomap: optimize pending async writeback accounting Pending writebacks must be accounted for to determine when all requests have completed and writeback on the folio should be ended. 
Currently this is done by atomically incrementing ifs->write_bytes_pending for every range to be written back. Instead, the number of atomic operations can be minimized by setting ifs->write_bytes_pending to the folio size, internally tracking how many bytes are written back asynchronously, and then after sending off all the requests, decrementing ifs->write_bytes_pending by the number of bytes not written back asynchronously. Now, for N ranges written back, only N + 2 atomic operations are required instead of 2N + 2. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-5-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a5032e456079..b49e47f069db 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -478,8 +478,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error); void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error); -void iomap_start_folio_write(struct inode *inode, struct folio *folio, - size_t len); void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len); -- cgit v1.2.3 From f8eaf79406fe9415db0e7a5c175b50cb01265199 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Tue, 11 Nov 2025 11:36:54 -0800 Subject: iomap: simplify ->read_folio_range() error handling for reads Instead of requiring that the caller calls iomap_finish_folio_read() even if the ->read_folio_range() callback returns an error, account for this internally in iomap instead, which makes the interface simpler and makes it match writeback's ->read_folio_range() error handling expectations. Signed-off-by: Joanne Koong Link: https://patch.msgid.link/20251111193658.3495942-6-joannelkoong@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Signed-off-by: Christian Brauner --- include/linux/iomap.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b49e47f069db..520e967cb501 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -495,9 +495,8 @@ struct iomap_read_ops { /* * Read in a folio range. * - * The caller is responsible for calling iomap_finish_folio_read() after - * reading in the folio range. This should be done even if an error is - * encountered during the read. + * If this succeeds, iomap_finish_folio_read() must be called after the + * range is read in, regardless of whether the read succeeded or failed. * * Returns 0 on success or a negative error on failure. */ -- cgit v1.2.3 From 395b95530343e7f4bdd2870190d985a222997fb6 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Tue, 16 Sep 2025 14:53:07 +0100 Subject: dcache: export shrink_dentry_list() and add new helper d_dispose_if_unused() Add and export a new helper d_dispose_if_unused() which is simply a wrapper around to_shrink_list(), to add an entry to a dispose list if it's not used anymore. Also export shrink_dentry_list() to kill all dentries in a dispose list. 
Suggested-by: Miklos Szeredi Signed-off-by: Luis Henriques Signed-off-by: Miklos Szeredi --- include/linux/dcache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..2bc1339bf6d0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -268,6 +268,8 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); +extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); -- cgit v1.2.3 From 854e8df2ce6b02c8be40d6f26bd8aa700b375bb2 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 23 Oct 2025 10:21:42 +0200 Subject: fs/pipe: stop duplicating union pipe_index declaration Now that we build with -fms-extensions, union pipe_index can be included as an anonymous member in struct pipe_inode_info, avoiding the duplication. Signed-off-by: Rasmus Villemoes Link: https://patch.msgid.link/20251023082142.2104456-1-linux@rasmusvillemoes.dk Signed-off-by: Nathan Chancellor Signed-off-by: Christian Brauner --- include/linux/pipe_fs_i.h | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 9d42d473d201..7f6a92ac9704 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -44,11 +44,11 @@ typedef unsigned int pipe_index_t; typedef unsigned short pipe_index_t; #endif -/* - * We have to declare this outside 'struct pipe_inode_info', - * but then we can't use 'union pipe_index' for an anonymous - * union, so we end up having to duplicate this declaration - * below. Annoying. 
+/** + * struct pipe_index - pipe indeces + * @head: The point of buffer production + * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail */ union pipe_index { unsigned long head_tail; @@ -63,9 +63,7 @@ union pipe_index { * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe - * @head: The point of buffer production - * @tail: The point of buffer consumption - * @head_tail: unsigned long union of @head and @tail + * @pipe_index: the pipe indeces * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) @@ -87,14 +85,7 @@ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - /* This has to match the 'union pipe_index' above */ - union { - unsigned long head_tail; - struct { - pipe_index_t head; - pipe_index_t tail; - }; - }; + union pipe_index; unsigned int max_usage; unsigned int ring_size; -- cgit v1.2.3 From e631df89cd5d638a9d7c152dd9b0a92643efab3e Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 7 Nov 2025 15:21:47 +0100 Subject: fs: speed up path lookup with cheaper handling of MAY_EXEC The generic inode_permission() routine does work which is known to be of no significance for lookup. There are checks for MAY_WRITE, while the requested permission is MAY_EXEC. Additionally devcgroup_inode_permission() is called to check for devices, but it is an invariant the inode is a directory. Absent a ->permission func, execution lands in generic_permission() which checks upfront if the requested permission is granted for everyone. We can elide the branches which are guaranteed to be false and cut straight to the check if everyone happens to be allowed MAY_EXEC on the inode (which holds true most of the time). 
Moreover, filesystems which provide their own ->permission routine can take advantage of the optimization by setting the IOP_FASTPERM_MAY_EXEC flag on their inodes, which they can legitimately do if their MAY_EXEC handling matches generic_permission(). As a simple benchmark, as part of compilation gcc issues access(2) on numerous long paths, for example /usr/lib/gcc/x86_64-linux-gnu/12/crtendS.o Issuing access(2) on it in a loop on ext4 on Sapphire Rapids (ops/s): before: 3797556 after: 3987789 (+5%) Note: this depends on the not-yet-landed ext4 patch to mark inodes with cache_no_acl() Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251107142149.989998-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..ff69734b9fde 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -659,13 +659,14 @@ is_uncached_acl(struct posix_acl *acl) return (long)acl & 1; } -#define IOP_FASTPERM 0x0001 -#define IOP_LOOKUP 0x0002 -#define IOP_NOFOLLOW 0x0004 -#define IOP_XATTR 0x0008 +#define IOP_FASTPERM 0x0001 +#define IOP_LOOKUP 0x0002 +#define IOP_NOFOLLOW 0x0004 +#define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 -#define IOP_MGTIME 0x0020 -#define IOP_CACHED_LINK 0x0040 +#define IOP_MGTIME 0x0020 +#define IOP_CACHED_LINK 0x0040 +#define IOP_FASTPERM_MAY_EXEC 0x0080 /* * Inode state bits. 
Protected by inode->i_lock -- cgit v1.2.3 From 21b561dab1406e63740ebe240c7b69f19e1bcf58 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Nov 2025 16:36:22 +0100 Subject: fs: hide dentry_cache behind runtime const machinery Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251105153622.758836-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/asm-generic/vmlinux.lds.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8a9a2e732a65..20939d2445e7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -955,7 +955,8 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG) #define RUNTIME_CONST_VARIABLES \ RUNTIME_CONST(shift, d_hash_shift) \ - RUNTIME_CONST(ptr, dentry_hashtable) + RUNTIME_CONST(ptr, dentry_hashtable) \ + RUNTIME_CONST(ptr, __dentry_cache) /* Alignment must be consistent with (kunit_suite *) in include/kunit/test.h */ #define KUNIT_TABLE() \ -- cgit v1.2.3 From f99eb098090e4c8bfca4190b545e20450fee8250 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:12 +0100 Subject: platform/x86: asus-armoury: move existing tunings to asus-armoury module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fw_attributes_class provides a much cleaner interface to all of the attributes introduced to asus-wmi. This patch moves all of these extra attributes over to fw_attributes_class, and shifts the bulk of these definitions to a new kernel module to reduce the clutter of asus-wmi with the intention of deprecating the asus-wmi attributes in future. The work applies only to WMI methods which don't have a clearly defined place within the sysfs and as a result ended up lumped together in /sys/devices/platform/asus-nb-wmi/ with no standard API. 
Where possible the fw attrs now implement defaults, min, max, scalar, choices, etc. As an example, dgpu_disable becomes: /sys/class/firmware-attributes/asus-armoury/attributes/dgpu_disable/ ├── current_value ├── display_name ├── possible_values └── type as do other attributes. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Link: https://patch.msgid.link/20251102215319.3126879-3-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- .../linux/platform_data/x86/asus-wmi-leds-ids.h | 50 ++++++++++++++++++++++ include/linux/platform_data/x86/asus-wmi.h | 44 ++----------------- 2 files changed, 53 insertions(+), 41 deletions(-) create mode 100644 include/linux/platform_data/x86/asus-wmi-leds-ids.h (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi-leds-ids.h b/include/linux/platform_data/x86/asus-wmi-leds-ids.h new file mode 100644 index 000000000000..034a039c4e37 --- /dev/null +++ b/include/linux/platform_data/x86/asus-wmi-leds-ids.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H +#define __PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H + +#include +#include + +/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ +#if IS_REACHABLE(CONFIG_ASUS_WMI) || IS_REACHABLE(CONFIG_HID_ASUS) +static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GA403U"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "GU605M"), + }, + }, + { + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "RC71L"), + }, + }, + { }, +}; +#endif + +#endif /*
__PLATFORM_DATA_X86_ASUS_WMI_LEDS_IDS_H */ diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index dbd44d9fbb6f..8ea8925a0fc5 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -4,7 +4,9 @@ #include #include -#include + +#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" +#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" /* WMI Methods */ #define ASUS_WMI_METHODID_SPEC 0x43455053 /* BIOS SPECification */ @@ -191,44 +193,4 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, } #endif -/* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ -static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Zephyrus"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Strix"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ROG Flow"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_PRODUCT_FAMILY, "ProArt P16"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "GA403U"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "GU605M"), - }, - }, - { - .matches = { - DMI_MATCH(DMI_BOARD_NAME, "RC71L"), - }, - }, - { }, -}; - #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ -- cgit v1.2.3 From 628cb03b15f2a0f10534979b3ea9c8befe87c381 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:13 +0100 Subject: platform/x86: asus-armoury: add panel_hd_mode attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add panel_hd_mode to toggle the panel mode between single and high definition modes. Signed-off-by: Denis Benato Signed-off-by: Luke D. 
Jones Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251102215319.3126879-4-denis.benato@linux.dev Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 8ea8925a0fc5..3cc235b20be4 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -75,6 +75,7 @@ #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019 /* Misc */ +#define ASUS_WMI_DEVID_PANEL_HD 0x0005001C #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 #define ASUS_WMI_DEVID_CAMERA 0x00060013 #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 -- cgit v1.2.3 From bfd3749d489ec0df27ed94ee3dfd9475fea27bf9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:04 -1000 Subject: sched_ext: Use shorter slice in bypass mode There have been reported cases of bypass mode not making forward progress fast enough. The 20ms default slice is unnecessarily long for bypass mode where the primary goal is ensuring all tasks can make forward progress. Introduce SCX_SLICE_BYPASS set to 5ms and make the scheduler automatically switch to it when entering bypass mode. Also make the bypass slice value tunable through the slice_bypass_us module parameter (adjustable between 100us and 100ms) to make it easier to test whether slice durations are a factor in problem cases. v3: Use READ_ONCE/WRITE_ONCE for scx_slice_dfl access (Dan). v2: Removed slice_dfl_us module parameter. Fixed typos (Andrea). 
Reviewed-by: Emil Tsalapatis Reviewed-by: Andrea Righi Cc: Dan Schatzberg Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index eb776b094d36..60285c3d07cf 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -17,7 +17,18 @@ enum scx_public_consts { SCX_OPS_NAME_LEN = 128, + /* + * %SCX_SLICE_DFL is used to refill slices when the BPF scheduler misses + * to set the slice for a task that is selected for execution. + * %SCX_EV_REFILL_SLICE_DFL counts the number of times the default slice + * refill has been triggered. + * + * %SCX_SLICE_BYPASS is used as the slice for all tasks in the bypass + * mode. As making forward progress for all tasks is the main goal of + * the bypass mode, a shorter slice is used. + */ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ + SCX_SLICE_BYPASS = 5 * 1000000, /* 5ms */ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ }; -- cgit v1.2.3 From 61debc251c1c9150c7bdfd5c028bc2d078e17d22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:06 -1000 Subject: sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode Bypass mode routes tasks through fallback dispatch queues. Originally a single global DSQ, b7b3b2dbae73 ("sched_ext: Split the global DSQ per NUMA node") changed this to per-node DSQs to resolve NUMA-related livelocks. Dan Schatzberg found per-node DSQs can still livelock when many threads are pinned to different small CPU subsets: each CPU must scan many incompatible tasks to find runnable ones, causing severe contention with high CPU counts. Switch to per-CPU bypass DSQs. Each task queues on its current CPU. Default idle CPU selection and direct dispatch handle most cases well. This introduces a failure mode when tasks concentrate on one CPU in over-saturated systems. 
If the BPF scheduler severely skews placement before triggering bypass, that CPU's queue may be too long to drain, causing RCU stalls. A load balancer in a future patch will address this. The bypass DSQ is separate from local DSQ to enable load balancing: local DSQs use rq locks, preventing efficient scanning and transfer across CPUs, especially problematic when systems are already contended. v2: Clarified why bypass DSQ is separate from local DSQ (Andrea Righi). Reported-by: Dan Schatzberg Reviewed-by: Dan Schatzberg Reviewed-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 60285c3d07cf..3d3216ff9188 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -57,6 +57,7 @@ enum scx_dsq_id_flags { SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, + SCX_DSQ_BYPASS = SCX_DSQ_FLAG_BUILTIN | 3, SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, }; -- cgit v1.2.3 From 582f700e1bdc5978f41e3d8d65d3e16e34e9be8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:12 -1000 Subject: sched_ext: Hook up hardlockup detector A poorly behaving BPF scheduler can trigger hard lockup. For example, on a large system with many tasks pinned to different subsets of CPUs, if the BPF scheduler puts all tasks in a single DSQ and lets all CPUs at it, the DSQ lock can be contended to the point where hardlockup triggers. Unfortunately, hardlockup can be the first signal out of such situations, thus requiring hardlockup handling. Hook scx_hardlockup() into the hardlockup detector to try kicking out the current scheduler in an attempt to recover the system to a good state. 
The handling strategy can delay watchdog taking its own action by one polling period; however, given that the only remediation for hardlockup is crash, this is likely an acceptable trade-off. v2: Add missing dummy scx_hardlockup() definition for !CONFIG_SCHED_CLASS_EXT (kernel test bot). Reported-by: Dan Schatzberg Cc: Emil Tsalapatis Cc: Douglas Anderson Cc: Andrew Morton Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 3d3216ff9188..d6c152475f5b 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -223,6 +223,7 @@ struct sched_ext_entity { void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); +bool scx_hardlockup(void); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ @@ -230,6 +231,7 @@ bool scx_rcu_cpu_stall(void); static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} +static inline bool scx_hardlockup(void) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ -- cgit v1.2.3 From d2974cc79f7139cc851b84ad4f77805e93c40fe1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:14 -1000 Subject: sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR macro in preparation for additional users. 
Reviewed-by: Emil Tsalapatis Cc: Dan Schatzberg Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index d6c152475f5b..70ee5c28a74d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -149,6 +149,13 @@ struct scx_dsq_list_node { u32 priv; /* can be used by iter cursor */ }; +#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv) \ + (struct scx_dsq_list_node) { \ + .node = LIST_HEAD_INIT((__node).node), \ + .flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags), \ + .priv = (__priv), \ + } + /* * The following is embedded in task_struct and contains all fields necessary * for a task to be scheduled by SCX. -- cgit v1.2.3 From 95d1df610cdc7497510cc710435a5c8c4e3db606 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Nov 2025 09:18:16 -1000 Subject: sched_ext: Implement load balancer for bypass mode In bypass mode, tasks are queued on per-CPU bypass DSQs. While this works well in most cases, there is a failure mode where a BPF scheduler can skew task placement severely before triggering bypass in highly over-saturated systems. If most tasks end up concentrated on a few CPUs, those CPUs can accumulate queues that are too long to drain in a reasonable time, leading to RCU stalls and hung tasks. Implement a simple timer-based load balancer that redistributes tasks across CPUs within each NUMA node. The balancer runs periodically (default 500ms, tunable via bypass_lb_intv_us module parameter) and moves tasks from overloaded CPUs to underloaded ones. When moving tasks between bypass DSQs, the load balancer holds nested DSQ locks to avoid dropping and reacquiring the donor DSQ lock on each iteration, as donor DSQs can be very long and highly contended. Add the SCX_ENQ_NESTED flag and use raw_spin_lock_nested() in dispatch_enqueue() to support this. 
The load balancer timer function reads scx_bypass_depth locklessly to check whether bypass mode is active. Use WRITE_ONCE() when updating scx_bypass_depth to pair with the READ_ONCE() in the timer function. This has been tested on a 192 CPU dual socket AMD EPYC machine with ~20k runnable tasks running scx_cpu0. As scx_cpu0 queues all tasks to CPU0, almost all tasks end up on CPU0 creating severe imbalance. Without the load balancer, disabling the scheduler can lead to RCU stalls and hung tasks, taking a very long time to complete. With the load balancer, disable completes in about a second. The load balancing operation can be monitored using the sched_ext_bypass_lb tracepoint and disabled by setting bypass_lb_intv_us to 0. v2: Lock both rq and DSQ in bypass_lb_cpu() and use dispatch_dequeue_locked() to prevent races with dispatch_dequeue() (Andrea Righi). Cc: Andrea Righi Cc: Dan Schatzberg Cc: Emil Tsalapatis Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- include/trace/events/sched_ext.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include') diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h index 50e4b712735a..d1bf5acd59c5 100644 --- a/include/trace/events/sched_ext.h +++ b/include/trace/events/sched_ext.h @@ -45,6 +45,45 @@ TRACE_EVENT(sched_ext_event, ) ); +TRACE_EVENT(sched_ext_bypass_lb, + + TP_PROTO(__u32 node, __u32 nr_cpus, __u32 nr_tasks, __u32 nr_balanced, + __u32 before_min, __u32 before_max, + __u32 after_min, __u32 after_max), + + TP_ARGS(node, nr_cpus, nr_tasks, nr_balanced, + before_min, before_max, after_min, after_max), + + TP_STRUCT__entry( + __field( __u32, node ) + __field( __u32, nr_cpus ) + __field( __u32, nr_tasks ) + __field( __u32, nr_balanced ) + __field( __u32, before_min ) + __field( __u32, before_max ) + __field( __u32, after_min ) + __field( __u32, after_max ) + ), + + TP_fast_assign( + __entry->node = node; + __entry->nr_cpus = nr_cpus; +
__entry->nr_tasks = nr_tasks; + __entry->nr_balanced = nr_balanced; + __entry->before_min = before_min; + __entry->before_max = before_max; + __entry->after_min = after_min; + __entry->after_max = after_max; + ), + + TP_printk("node %u: nr_cpus=%u nr_tasks=%u nr_balanced=%u min=%u->%u max=%u->%u", + __entry->node, __entry->nr_cpus, + __entry->nr_tasks, __entry->nr_balanced, + __entry->before_min, __entry->after_min, + __entry->before_max, __entry->after_max + ) +); + #endif /* _TRACE_SCHED_EXT_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 05d6f1cc2dc214c1491181be13f37d2a3a26f694 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 13 Oct 2025 11:12:02 +0200 Subject: compiler.h: remove ARCH_SEL() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Its last user was removed in commit 8ea815399c3f ("compiler: remove __ADDRESSABLE_ASM{_STR,}() again"). Link: https://lkml.kernel.org/r/20251013-arch-sel-v1-1-7eef9b22ceb0@linutronix.de Signed-off-by: Thomas Weißschuh Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton --- include/linux/compiler.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 5b45ea7dff3e..a9a2f8aae821 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -269,12 +269,6 @@ static inline void *offset_to_ptr(const int *off) #endif /* __ASSEMBLY__ */ -#ifdef CONFIG_64BIT -#define ARCH_SEL(a,b) a -#else -#define ARCH_SEL(a,b) b -#endif - /* * Force the compiler to emit 'sym' as a symbol, so that we can reference * it from inline assembler. 
Necessary in case 'sym' could be inlined -- cgit v1.2.3 From adc15829fb73e402903b7030729263b6ee4a7232 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Thu, 16 Oct 2025 19:58:31 +0530 Subject: crash: let architecture decide crash memory export to iomem_resource With the generic crashkernel reservation, the kernel emits the following warning on powerpc: WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/mem.c:341 add_system_ram_resources+0xfc/0x180 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.17.0-auto-12607-g5472d60c129f #1 VOLUNTARY Hardware name: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1110.01 (NH1110_069) hv:phyp pSeries NIP: c00000000201de3c LR: c00000000201de34 CTR: 0000000000000000 REGS: c000000127cef8a0 TRAP: 0700 Not tainted (6.17.0-auto-12607-g5472d60c129f) MSR: 8000000002029033 CR: 84000840 XER: 20040010 CFAR: c00000000017eed0 IRQMASK: 0 GPR00: c00000000201de34 c000000127cefb40 c0000000016a8100 0000000000000001 GPR04: c00000012005aa00 0000000020000000 c000000002b705c8 0000000000000000 GPR08: 000000007fffffff fffffffffffffff0 c000000002db8100 000000011fffffff GPR12: c00000000201dd40 c000000002ff0000 c0000000000112bc 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 c0000000015a3808 GPR24: c00000000200468c c000000001699888 0000000000000106 c0000000020d1950 GPR28: c0000000014683f8 0000000081000200 c0000000015c1868 c000000002b9f710 NIP [c00000000201de3c] add_system_ram_resources+0xfc/0x180 LR [c00000000201de34] add_system_ram_resources+0xf4/0x180 Call Trace: add_system_ram_resources+0xf4/0x180 (unreliable) do_one_initcall+0x60/0x36c do_initcalls+0x120/0x220 kernel_init_freeable+0x23c/0x390 kernel_init+0x34/0x26c ret_from_kernel_user_thread+0x14/0x1c This warning occurs due to a conflict between crashkernel and System RAM iomem resources. 
The generic crashkernel reservation adds the crashkernel memory range to /proc/iomem during early initialization. Later, all memblock ranges are added to /proc/iomem as System RAM. If the crashkernel region overlaps with any memblock range, it causes a conflict while adding those memblock regions as iomem resources, triggering the above warning. The conflicting memblock regions are then omitted from /proc/iomem. For example, if the following crashkernel region is added to /proc/iomem: 20000000-11fffffff : Crash kernel then the following memblock System RAM regions fail to be inserted: 00000000-7fffffff : System RAM 80000000-257fffffff : System RAM Fix this by not adding the crashkernel memory to /proc/iomem on powerpc. Introduce an architecture hook to let each architecture decide whether to export the crashkernel region to /proc/iomem. For more info check out commit c40dd2f766440 ("powerpc: Add System RAM to /proc/iomem") and commit bce074bdbc36 ("powerpc: insert System RAM resource to prevent crashkernel conflict"). Note: Before switching to the generic crashkernel reservation, powerpc never exported the crashkernel region to /proc/iomem. Link: https://lkml.kernel.org/r/20251016142831.144515-1-sourabhjain@linux.ibm.com Fixes: e3185ee438c2 ("powerpc/crash: use generic crashkernel reservation").
Signed-off-by: Sourabh Jain Reported-by: Venkat Rao Bagalkote Closes: https://lore.kernel.org/all/90937fe0-2e76-4c82-b27e-7b8a7fe3ac69@linux.ibm.com/ Tested-by: Venkat Rao Bagalkote Cc: Baoquan he Cc: Hari Bathini Cc: Madhavan Srinivasan Cc: Mahesh Salgaonkar Cc: Michael Ellerman Cc: Ritesh Harjani (IBM) Cc: Vivek Goyal Cc: Dave Young Cc: Mike Rapoport Cc: Signed-off-by: Andrew Morton --- include/linux/crash_reserve.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/crash_reserve.h b/include/linux/crash_reserve.h index 7b44b41d0a20..f0dc03d94ca2 100644 --- a/include/linux/crash_reserve.h +++ b/include/linux/crash_reserve.h @@ -32,6 +32,12 @@ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, void __init reserve_crashkernel_cma(unsigned long long cma_size); #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION +#ifndef arch_add_crash_res_to_iomem +static inline bool arch_add_crash_res_to_iomem(void) +{ + return true; +} +#endif #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE #define DEFAULT_CRASH_KERNEL_LOW_SIZE (128UL << 20) #endif -- cgit v1.2.3 From 37ade54f386c829597f74b54bad335c12bd2a698 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Wed, 22 Oct 2025 10:28:04 +0200 Subject: taint/module: remove unnecessary taint_flag.module field The TAINT_RANDSTRUCT and TAINT_FWCTL flags are mistakenly set in the taint_flags table as per-module flags. While this can be trivially corrected, the issue can be avoided altogether by removing the taint_flag.module field. This is possible because, since commit 7fd8329ba502 ("taint/module: Clean up global and module taint flags handling") in 2016, the handling of module taint flags has been fully generic. Specifically, module_flags_taint() can print all flags, and the required output buffer size is properly defined in terms of TAINT_FLAGS_COUNT. The actual per-module flags are always those added to module.taints by calls to add_taint_module(). 
Link: https://lkml.kernel.org/r/20251022082938.26670-1-petr.pavlu@suse.com Signed-off-by: Petr Pavlu Acked-by: Petr Mladek Reviewed-by: Randy Dunlap Cc: Aaron Tomlin Cc: Luis Chamberlain Cc: Petr Pavlu Cc: Sami Tolvanen Signed-off-by: Andrew Morton --- include/linux/panic.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/panic.h b/include/linux/panic.h index 6f972a66c13e..a00bc0937698 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -86,7 +86,6 @@ static inline void set_arch_panic_timeout(int timeout, int arch_default_timeout) struct taint_flag { char c_true; /* character printed when tainted */ char c_false; /* character printed when not tainted */ - bool module; /* also show as a per-module taint flag */ const char *desc; /* verbose description of the set taint flag */ }; -- cgit v1.2.3 From d99dc586ca7c7729450af2ed39ca1483c0eb7b5c Mon Sep 17 00:00:00 2001 From: "Yury Norov (NVIDIA)" Date: Thu, 23 Oct 2025 13:16:06 -0400 Subject: uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1f9a8286bc0c ("uaccess: always export _copy_[from|to]_user with CONFIG_RUST") exports _copy_{from,to}_user() unconditionally, if RUST is enabled. This pollutes exported symbols namespace, and spreads RUST ifdefery in core files. It's better to declare a corresponding helper under the rust/helpers, similarly to how non-underscored copy_{from,to}_user() is handled.
[yury.norov@gmail.com: drop rust part of comment for _copy_from_user(), per Alice] Link: https://lkml.kernel.org/r/20251024154754.99768-1-yury.norov@gmail.com Link: https://lkml.kernel.org/r/20251023171607.1171534-1-yury.norov@gmail.com Signed-off-by: Yury Norov (NVIDIA) Acked-by: Arnd Bergmann Acked-by: Miguel Ojeda Reviewed-by: Alice Ryhl Tested-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: John Hubbard Cc: Trevor Gross Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 1beb5b395d81..01cbd7dd0ba3 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -152,8 +152,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) * directly in the normal copy_to/from_user(), the other ones go * through an extern _copy_to/from_user(), which expands the same code * here. - * - * Rust code always uses the extern definition. */ static inline __must_check unsigned long _inline_copy_from_user(void *to, const void __user *from, unsigned long n) -- cgit v1.2.3 From 6c2e6e2c1af1809d1d9cdbd50ac80f54f5995bdb Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 25 Oct 2025 16:00:03 +0800 Subject: dynamic_debug: add support for print stack In practical problem diagnosis, especially during the boot phase, it is often desirable to know the call sequence. However, currently, apart from adding print statements and recompiling the kernel, there seems to be no good alternative. If dynamic_debug supported printing the call stack, it would be very helpful for diagnosing issues. This patch adds support for '+d' to dump the stack.
Link: https://lkml.kernel.org/r/20251025080003.312536-1-yebin@huaweicloud.com Signed-off-by: Ye Bin Cc: Jason Baron Cc: Jim Cromie Signed-off-by: Andrew Morton --- include/linux/dynamic_debug.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index ff44ec346162..05743900a116 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -38,11 +38,12 @@ struct _ddebug { #define _DPRINTK_FLAGS_INCL_LINENO (1<<3) #define _DPRINTK_FLAGS_INCL_TID (1<<4) #define _DPRINTK_FLAGS_INCL_SOURCENAME (1<<5) +#define _DPRINTK_FLAGS_INCL_STACK (1<<6) #define _DPRINTK_FLAGS_INCL_ANY \ (_DPRINTK_FLAGS_INCL_MODNAME | _DPRINTK_FLAGS_INCL_FUNCNAME |\ _DPRINTK_FLAGS_INCL_LINENO | _DPRINTK_FLAGS_INCL_TID |\ - _DPRINTK_FLAGS_INCL_SOURCENAME) + _DPRINTK_FLAGS_INCL_SOURCENAME | _DPRINTK_FLAGS_INCL_STACK) #if defined DEBUG #define _DPRINTK_FLAGS_DEFAULT _DPRINTK_FLAGS_PRINT @@ -160,6 +161,12 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, const struct ib_device *ibdev, const char *fmt, ...); +#define __dynamic_dump_stack(desc) \ +{ \ + if (desc.flags & _DPRINTK_FLAGS_INCL_STACK) \ + dump_stack(); \ +} + #define DEFINE_DYNAMIC_DEBUG_METADATA_CLS(name, cls, fmt) \ static struct _ddebug __aligned(8) \ __section("__dyndbg") name = { \ @@ -220,8 +227,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, */ #define __dynamic_func_call_cls(id, cls, fmt, func, ...) do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(&id, ##__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call(id, fmt, func, ...) \ __dynamic_func_call_cls(id, _DPRINTK_CLASS_DFLT, fmt, \ @@ -229,8 +238,10 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor, #define __dynamic_func_call_cls_no_desc(id, cls, fmt, func, ...) 
do { \ DEFINE_DYNAMIC_DEBUG_METADATA_CLS(id, cls, fmt); \ - if (DYNAMIC_DEBUG_BRANCH(id)) \ + if (DYNAMIC_DEBUG_BRANCH(id)) { \ func(__VA_ARGS__); \ + __dynamic_dump_stack(id); \ + } \ } while (0) #define __dynamic_func_call_no_desc(id, fmt, func, ...) \ __dynamic_func_call_cls_no_desc(id, _DPRINTK_CLASS_DFLT, \ -- cgit v1.2.3 From a0b8c6af29a4be3ca2ff9a95cf71e54db5d73e65 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 24 Oct 2025 21:51:20 +0100 Subject: lib/xxhash: remove more unused xxh functions xxh32_reset() and xxh32_copy_state() are unused, and with those gone, the xxh32_state struct is also unused. xxh64_copy_state() is also unused. Remove them all. (Also fixes a comment above the xxh64_state that referred to it as xxh32_state). Link: https://lkml.kernel.org/r/20251024205120.454508-1-linux@treblig.org Signed-off-by: Dr. David Alan Gilbert Suggested-by: Christoph Hellwig Reviewed-by: Kuan-Wei Chiu Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/xxhash.h | 46 +--------------------------------------------- 1 file changed, 1 insertion(+), 45 deletions(-) (limited to 'include') diff --git a/include/linux/xxhash.h b/include/linux/xxhash.h index 27f57eca8cb1..587122e2c29c 100644 --- a/include/linux/xxhash.h +++ b/include/linux/xxhash.h @@ -141,21 +141,7 @@ static inline unsigned long xxhash(const void *input, size_t length, */ /** - * struct xxh32_state - private xxh32 state, do not use members directly - */ -struct xxh32_state { - uint32_t total_len_32; - uint32_t large_len; - uint32_t v1; - uint32_t v2; - uint32_t v3; - uint32_t v4; - uint32_t mem32[4]; - uint32_t memsize; -}; - -/** - * struct xxh32_state - private xxh64 state, do not use members directly + * struct xxh64_state - private xxh64 state, do not use members directly */ struct xxh64_state { uint64_t total_len; @@ -167,16 +153,6 @@ struct xxh64_state { uint32_t memsize; }; -/** - * xxh32_reset() - reset the xxh32 state to start a new hashing operation - 
* - * @state: The xxh32 state to reset. - * @seed: Initialize the hash state with this seed. - * - * Call this function on any xxh32_state to prepare for a new hashing operation. - */ -void xxh32_reset(struct xxh32_state *state, uint32_t seed); - /** * xxh64_reset() - reset the xxh64 state to start a new hashing operation * @@ -210,24 +186,4 @@ int xxh64_update(struct xxh64_state *state, const void *input, size_t length); */ uint64_t xxh64_digest(const struct xxh64_state *state); -/*-************************** - * Utils - ***************************/ - -/** - * xxh32_copy_state() - copy the source state into the destination state - * - * @src: The source xxh32 state. - * @dst: The destination xxh32 state. - */ -void xxh32_copy_state(struct xxh32_state *dst, const struct xxh32_state *src); - -/** - * xxh64_copy_state() - copy the source state into the destination state - * - * @src: The source xxh64 state. - * @dst: The destination xxh64 state. - */ -void xxh64_copy_state(struct xxh64_state *dst, const struct xxh64_state *src); - #endif /* XXHASH_H */ -- cgit v1.2.3 From 113557b0406818a8a5df3479b0a89125d2b2a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:17 -0400 Subject: vfio: Provide a get_region_info op Instead of hooking the general ioctl op, have the core code directly decode VFIO_DEVICE_GET_REGION_INFO and call an op just for it. This is intended to allow mechanical changes to the drivers to pull their VFIO_DEVICE_GET_REGION_INFO into a function. Later patches will improve the function signature to consolidate more code.
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 ++ include/linux/vfio_pci_core.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..be5fcf8432e8 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -132,6 +132,8 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); + int (*get_region_info)(struct vfio_device *vdev, + struct vfio_region_info __user *arg); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..160bc2e31ece 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -115,6 +115,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg); int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); +int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info __user *arg); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From d604e1ec246d236deff57ac7e89e073dd911d60b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 31 Oct 2025 13:39:09 -0700 Subject: scsi: core: Support allocating reserved commands Quite some drivers are using management commands internally. These commands typically use the same tag pool as regular SCSI commands. 
Tags for these management commands are set aside before allocating the block-mq tag bitmap for regular SCSI commands. The block layer already supports this via the reserved tag mechanism. Add a new field 'nr_reserved_cmds' to the SCSI host template to instruct the block layer to set aside a tag space for these management commands by using reserved tags. Exclude reserved commands from .can_queue because .can_queue is visible in sysfs. [ bvanassche: modified patch title and patch description. Left out the following statements: "if (sht->nr_reserved_cmds)" and also "if (sdev->host->nr_reserved_cmds) flags |= BLK_MQ_REQ_RESERVED;". Moved nr_reserved_cmds declarations and statements close to the corresponding can_queue declarations and statements. See also https://lore.kernel.org/linux-scsi/20210503150333.130310-11-hare@suse.de/ ] Signed-off-by: Hannes Reinecke Reviewed-by: John Garry Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251031204029.2883185-2-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/scsi/scsi_host.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index f5a243261236..7b8f144ccf7d 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -375,10 +375,19 @@ struct scsi_host_template { /* * This determines if we will use a non-interrupt driven * or an interrupt driven scheme. It is set to the maximum number - * of simultaneous commands a single hw queue in HBA will accept. + * of simultaneous commands a single hw queue in HBA will accept + * excluding internal commands. */ int can_queue; + /* + * This determines how many commands the HBA will set aside + * for internal commands. This number will be added to + * @can_queue to calculate the maximum number of simultaneous + * commands sent to the host. 
+ */ + int nr_reserved_cmds; + /* * In many instances, especially where disconnect / reconnect are * supported, our host also has an ID on the SCSI bus. If this is @@ -611,7 +620,17 @@ struct Scsi_Host { unsigned short max_cmd_len; int this_id; + + /* + * Number of commands this host can handle at the same time. + * This excludes reserved commands as specified by nr_reserved_cmds. + */ int can_queue; + /* + * Number of reserved commands to allocate, if any. + */ + unsigned int nr_reserved_cmds; + short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; -- cgit v1.2.3 From d630fbf6fc8ce2fc95de7784de5499387b682dc1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 31 Oct 2025 13:39:12 -0700 Subject: scsi: core: Support allocating a pseudo SCSI device Allocate a pseudo SCSI device if 'nr_reserved_cmds' has been set. Pseudo SCSI devices have the SCSI ID :U64_MAX so they won't clash with any devices the LLD might create. Pseudo SCSI devices are excluded from scanning and will not show up in sysfs. Additionally, pseudo SCSI devices are skipped by shost_for_each_device(). This prevents that the SCSI error handler tries to submit a reset to a non-existent logical unit. Do not allocate a budget map for pseudo SCSI devices since the cmd_per_lun limit does not apply to pseudo SCSI devices. Do not perform queue depth ramp up / ramp down for pseudo SCSI devices. Pseudo SCSI devices will be used to send internal commands to a storage device. [ bvanassche: edited patch description / renamed host_sdev into pseudo_sdev / unexported scsi_get_host_dev() / modified error path in scsi_get_pseudo_dev() / skip pseudo devices in __scsi_iterate_devices() and also when calling sdev_init(), sdev_configure() and sdev_destroy(). 
See also https://lore.kernel.org/linux-scsi/20211125151048.103910-2-hare@suse.de/ ] Reviewed-by: John Garry Signed-off-by: Hannes Reinecke Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251031204029.2883185-5-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/scsi/scsi_device.h | 16 ++++++++++++++++ include/scsi/scsi_host.h | 6 ++++++ 2 files changed, 22 insertions(+) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 4c106342c4ae..918631088711 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -589,6 +589,22 @@ static inline unsigned int sdev_id(struct scsi_device *sdev) #define scmd_id(scmd) sdev_id((scmd)->device) #define scmd_channel(scmd) sdev_channel((scmd)->device) +/** + * scsi_device_is_pseudo_dev() - Whether a device is a pseudo SCSI device. + * @sdev: SCSI device to examine + * + * A pseudo SCSI device can be used to allocate SCSI commands but does not show + * up in sysfs. Additionally, the logical unit information in *@sdev is made up. + * + * This function tests the LUN number instead of comparing @sdev with + * @sdev->host->pseudo_sdev because this function may be called before + * @sdev->host->pseudo_sdev has been initialized. + */ +static inline bool scsi_device_is_pseudo_dev(struct scsi_device *sdev) +{ + return sdev->lun == U64_MAX; +} + /* * checks for positions of the SCSI state machine */ diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 7b8f144ccf7d..4f945a20d198 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -721,6 +721,12 @@ struct Scsi_Host { /* ldm bits */ struct device shost_gendev, shost_dev; + /* + * A SCSI device structure used for sending internal commands to the + * HBA. There is no corresponding logical unit inside the SCSI device. 
+ */ + struct scsi_device *pseudo_sdev; + /* * Points to the transport data (if any) which is allocated * separately -- cgit v1.2.3 From 11ea1de3fc4ba94127034cb01df63a666c4c9836 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 31 Oct 2025 13:39:13 -0700 Subject: scsi: core: Introduce .queue_reserved_command() Reserved commands will be used by SCSI LLDs for submitting internal commands. Since the SCSI host, target and device limits do not apply to the reserved command use cases, bypass the SCSI host limit checks for reserved commands. Introduce the .queue_reserved_command() callback for reserved commands. Additionally, do not activate the SCSI error handler if a reserved command fails such that reserved commands can be submitted from inside the SCSI error handler. [ bvanassche: modified patch title and patch description. Renamed .reserved_queuecommand() into .queue_reserved_command(). Changed the second argument of __blk_mq_end_request() from 0 into error code in the completion path if cmd->result != 0. Rewrote the scsi_queue_rq() changes. See also https://lore.kernel.org/linux-scsi/1666693096-180008-5-git-send-email-john.garry@huawei.com/ ] Cc: Hannes Reinecke Signed-off-by: John Garry Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Link: https://patch.msgid.link/20251031204029.2883185-6-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/scsi/scsi_host.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 4f945a20d198..e87cf7eadd26 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -86,6 +86,12 @@ struct scsi_host_template { */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); + /* + * Queue a reserved command (BLK_MQ_REQ_RESERVED). The .queuecommand() + * documentation also applies to the .queue_reserved_command() callback. 
+ */ + int (*queue_reserved_command)(struct Scsi_Host *, struct scsi_cmnd *); + /* * The commit_rqs function is used to trigger a hardware * doorbell after some requests have been queued with -- cgit v1.2.3 From a2ab4e33286de37f3fe8f28f86f5f71d6b0ae3b0 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 31 Oct 2025 13:39:14 -0700 Subject: scsi: core: Add scsi_{get,put}_internal_cmd() helpers Add helper functions to allow LLDDs to allocate and free internal commands. [ bvanassche: changed the 'nowait' argument into a 'flags' argument. See also https://lore.kernel.org/linux-scsi/20211125151048.103910-3-hare@suse.de/ ] Reviewed-by: John Garry Signed-off-by: Hannes Reinecke Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251031204029.2883185-7-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/scsi/scsi_device.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 918631088711..1e2e599517e9 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -558,6 +558,10 @@ int scsi_execute_cmd(struct scsi_device *sdev, const unsigned char *cmd, const struct scsi_exec_args *args); void scsi_failures_reset_retries(struct scsi_failures *failures); +struct scsi_cmnd *scsi_get_internal_cmd(struct scsi_device *sdev, + enum dma_data_direction data_direction, + blk_mq_req_flags_t flags); +void scsi_put_internal_cmd(struct scsi_cmnd *scmd); extern void sdev_disable_disk_events(struct scsi_device *sdev); extern void sdev_enable_disk_events(struct scsi_device *sdev); extern int scsi_vpd_lun_id(struct scsi_device *, char *, size_t); -- cgit v1.2.3 From 22089c218037ca7cd50d4fa20e8b5bd746a9b397 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 31 Oct 2025 13:39:31 -0700 Subject: scsi: ufs: core: Optimize the hot path Set .cmd_size in the SCSI host template such that the SCSI core makes struct scsi_cmnd and struct ufshcd_lrb adjacent. 
Convert the cmd->lrbp and lrbp->cmd memory loads into pointer offset calculations. Remove the data structure members that became superfluous, namely ufshcd_lrb.cmd and ufs_hba.lrb. Since ufshcd_lrb.cmd is removed, this pointer cannot be used anymore to test whether or not a command is a SCSI command. Introduce a new function for this purpose, namely ufshcd_is_scsi_cmd(). Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251031204029.2883185-24-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 00152e135fc9..fbed47b6c61f 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -161,7 +161,6 @@ struct ufs_pm_lvl_states { * @ucd_prdt_dma_addr: PRDT dma address for debug * @ucd_rsp_dma_addr: UPIU response dma address for debug * @ucd_req_dma_addr: UPIU request dma address for debug - * @cmd: pointer to SCSI command * @scsi_status: SCSI status of the command * @command_type: SCSI, UFS, Query. * @task_tag: Task tag of the command @@ -186,7 +185,6 @@ struct ufshcd_lrb { dma_addr_t ucd_rsp_dma_addr; dma_addr_t ucd_prdt_dma_addr; - struct scsi_cmnd *cmd; int scsi_status; int command_type; @@ -833,7 +831,6 @@ enum ufshcd_mcq_opr { * @spm_lvl: desired UFS power management level during system PM. * @pm_op_in_progress: whether or not a PM operation is in progress. * @ahit: value of Auto-Hibernate Idle Timer register. - * @lrb: local reference block * @outstanding_tasks: Bits representing outstanding task requests * @outstanding_lock: Protects @outstanding_reqs. 
* @outstanding_reqs: Bits representing outstanding transfer requests @@ -976,8 +973,6 @@ struct ufs_hba { /* Auto-Hibernate Idle Timer register value */ u32 ahit; - struct ufshcd_lrb *lrb; - unsigned long outstanding_tasks; spinlock_t outstanding_lock; unsigned long outstanding_reqs; -- cgit v1.2.3 From 9a2c9500921d5ebbe96f7531adc73d9205c76485 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 31 Oct 2025 13:39:33 -0700 Subject: scsi: ufs: core: Remove the ufshcd_lrb task_tag member Remove the ufshcd_lrb task_tag member and use scsi_cmd_to_rq(cmd)->tag instead. Use rq->tag instead of lrbp->task_tag. This patch reduces the size of struct ufshcd_lrb. Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251031204029.2883185-26-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index fbed47b6c61f..a92062f65455 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -188,7 +188,6 @@ struct ufshcd_lrb { int scsi_status; int command_type; - int task_tag; u8 lun; /* UPIU LUN id field is only 8-bit wide */ bool intr_cmd; bool req_abort_skip; -- cgit v1.2.3 From 08b12cda6c44dc015bcc152613c35ee0ae8f37b9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 31 Oct 2025 13:39:36 -0700 Subject: scsi: ufs: core: Switch to scsi_get_internal_cmd() Instead of storing the tag of the reserved command in hba->reserved_slot, use scsi_get_internal_cmd() and scsi_put_internal_cmd() to allocate the tag for the reserved command dynamically. Add ufshcd_queue_reserved_command() for submitting reserved commands. Add support in ufshcd_abort() for device management commands. Use blk_execute_rq() for submitting reserved commands. Remove the code and data structures that became superfluous. This includes ufshcd_wait_for_dev_cmd(), hba->reserved_slot and ufs_dev_cmd.complete. 
Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20251031204029.2883185-29-bvanassche@acm.org Signed-off-by: Martin K. Petersen --- include/ufs/ufshcd.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index a92062f65455..c07ba003a5cb 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -236,13 +236,11 @@ struct ufs_query { * struct ufs_dev_cmd - all assosiated fields with device management commands * @type: device management command type - Query, NOP OUT * @lock: lock to allow one command at a time - * @complete: internal commands completion * @query: Device management query information */ struct ufs_dev_cmd { enum dev_cmd_type type; struct mutex lock; - struct completion complete; struct ufs_query query; }; @@ -838,7 +836,6 @@ enum ufshcd_mcq_opr { * @nutrs: Transfer Request Queue depth supported by controller * @nortt - Max outstanding RTTs supported by controller * @nutmrs: Task Management Queue depth supported by controller - * @reserved_slot: Used to submit device commands. Protected by @dev_cmd.lock. 
* @ufs_version: UFS Version to which controller complies * @vops: pointer to variant specific operations * @vps: pointer to variant specific parameters @@ -929,7 +926,6 @@ enum ufshcd_mcq_opr { * @res: array of resource info of MCQ registers * @mcq_base: Multi circular queue registers base address * @uhq: array of supported hardware queues - * @dev_cmd_queue: Queue for issuing device management commands * @mcq_opr: MCQ operation and runtime registers * @ufs_rtc_update_work: A work for UFS RTC periodic update * @pm_qos_req: PM QoS request handle @@ -981,7 +977,6 @@ struct ufs_hba { int nortt; u32 mcq_capabilities; int nutmrs; - u32 reserved_slot; u32 ufs_version; const struct ufs_hba_variant_ops *vops; struct ufs_hba_variant_params *vps; @@ -1099,7 +1094,6 @@ struct ufs_hba { bool mcq_esi_enabled; void __iomem *mcq_base; struct ufs_hw_queue *uhq; - struct ufs_hw_queue *dev_cmd_queue; struct ufshcd_mcq_opr_info_t mcq_opr[OPR_MAX]; struct delayed_work ufs_rtc_update_work; -- cgit v1.2.3 From 775f726a742a60d8d0ed2b4733a5b6a796d9d1dd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:31 -0400 Subject: vfio: Add get_region_info_caps op This op does the copy to/from user for the info and can return back a cap chain through a vfio_info_cap * result. 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index be5fcf8432e8..6311ddc83770 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -21,6 +21,7 @@ struct kvm; struct iommufd_ctx; struct iommufd_device; struct iommufd_access; +struct vfio_info_cap; /* * VFIO devices can be placed in a set, this allows all devices to share this @@ -134,6 +135,9 @@ struct vfio_device_ops { unsigned long arg); int (*get_region_info)(struct vfio_device *vdev, struct vfio_region_info __user *arg); + int (*get_region_info_caps)(struct vfio_device *vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps); int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma); void (*request)(struct vfio_device *vdev, unsigned int count); int (*match)(struct vfio_device *vdev, char *buf); -- cgit v1.2.3 From 1b0ecb5baf4af3baa8627144bbcf9848806aa5f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:35 -0400 Subject: vfio/pci: Convert all PCI drivers to get_region_info_caps Since the core function signature changes it has to flow up to all drivers. 
Reviewed-by: Kevin Tian Reviewed-by: Pranjal Shrivastava Reviewed-by: Brett Creeley Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/19-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 160bc2e31ece..e74f94c17fbe 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -116,7 +116,8 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz); int vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - struct vfio_region_info __user *arg); + struct vfio_region_info *info, + struct vfio_info_cap *caps); ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, -- cgit v1.2.3 From 56c069307dfd0a5e39b685e0aeee6c40d1d7ddfc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Nov 2025 13:41:38 -0400 Subject: vfio: Remove the get_region_info op No driver uses it now, all are using get_region_info_caps(). 
Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/22-v2-2a9e24d62f1b+e10a-vfio_get_region_info_op_jgg@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 6311ddc83770..8e1ddb48b9b5 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -133,8 +133,6 @@ struct vfio_device_ops { size_t count, loff_t *size); long (*ioctl)(struct vfio_device *vdev, unsigned int cmd, unsigned long arg); - int (*get_region_info)(struct vfio_device *vdev, - struct vfio_region_info __user *arg); int (*get_region_info_caps)(struct vfio_device *vdev, struct vfio_region_info *info, struct vfio_info_cap *caps); -- cgit v1.2.3 From 044b9f1a7f4f3d41563007d0762c83a7d7505ac0 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 12 Nov 2025 08:46:14 +0100 Subject: PCI/PTM: Enable only if device advertises relevant role We have a Switch Upstream Port (2b:00.0) that has a PTM Capability, but doesn't advertise support for any PTM roles: Capabilities: [220 v1] Precision Time Measurement PTMCap: Requester- Responder- Root- Linux enables PTM without looking into what roles it actually supports, and apparently the Port immediately sends PTM Requests even though it doesn't support the PTM Requester role. 
The messages include an invalid bus number, so the Root Port detects an ACS Violation (see the PCIe r7.0, sec 6.12.1.1, implementation note): pci 0000:2b:00.0: [8086:5786] type 01 class 0x060400 PCIe Switch Upstream Port pci 0000:2b:00.0: PTM enabled, 4ns granularity pcieport 0000:00:07.1: AER: Multiple Uncorrectable (Non-Fatal) error message received from 0000:00:07.1 pcieport 0000:00:07.1: PCIe Bus Error: severity=Uncorrectable (Non-Fatal), type=Transaction Layer, (Receiver ID) pcieport 0000:00:07.1: device [8086:e44f] error status/mask=00200000/00000000 pcieport 0000:00:07.1: [21] ACSViol (First) pcieport 0000:00:07.1: AER: TLP Header: 0x34000000 0x00000052 0x00000000 0x00000000 The TLP Header shows a 4 DW header, no data (001b) Msg with Local routing (1 0100b) with Requester ID 0x0000 and PTM Request code (0x52). Fix this by enabling PTM only if the following conditions are true (see sec 6.21.1 figure 6-21): - Endpoint must advertise PTM Requester Capable - Switch Upstream Port must advertise PTM Responder Capable - Root Port must advertise PTM Root Capable Signed-off-by: Mika Westerberg [bhelgaas: commit log, comments] Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251112074614.1440266-1-mika.westerberg@linux.intel.com --- include/linux/pci.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..d5018cb5c331 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -500,6 +500,8 @@ struct pci_dev { #ifdef CONFIG_PCIE_PTM u16 ptm_cap; /* PTM Capability */ unsigned int ptm_root:1; + unsigned int ptm_responder:1; + unsigned int ptm_requester:1; unsigned int ptm_enabled:1; u8 ptm_granularity; #endif -- cgit v1.2.3 From 6276c67f2bc4aeaf350a7cf889c33c38b3330ea9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 12 Nov 2025 09:39:44 -0800 Subject: x86: Restrict KVM-induced symbol exports to KVM modules where obvious/possible Extend KVM's export macro 
framework to provide EXPORT_SYMBOL_FOR_KVM(), and use the helper macro to export symbols for KVM throughout x86 if and only if KVM will build one or more modules, and only for those modules. To avoid unnecessary exports when CONFIG_KVM=m but kvm.ko will not be built (because no vendor modules are selected), let arch code #define EXPORT_SYMBOL_FOR_KVM to suppress/override the exports. Note, the set of symbols to restrict to KVM was generated by manual search and audit; any "misses" are due to human error, not some grand plan. Signed-off-by: Sean Christopherson Signed-off-by: Dave Hansen Acked-by: Kai Huang Tested-by: Kai Huang Link: https://patch.msgid.link/20251112173944.1380633-5-seanjc%40google.com --- include/linux/kvm_types.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 490464c205b4..a568d8e6f4e8 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -11,8 +11,22 @@ #ifdef KVM_SUB_MODULES #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) \ EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES)) +#define EXPORT_SYMBOL_FOR_KVM(symbol) \ + EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES)) #else #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol) +/* + * Allow architectures to provide a custom EXPORT_SYMBOL_FOR_KVM, but only + * if there are no submodules, e.g. to allow suppressing exports if KVM=m, but + * kvm.ko won't actually be built (due to lack of at least one submodule). 
+ */ +#ifndef EXPORT_SYMBOL_FOR_KVM +#if IS_MODULE(CONFIG_KVM) +#define EXPORT_SYMBOL_FOR_KVM(symbol) EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm") +#else +#define EXPORT_SYMBOL_FOR_KVM(symbol) +#endif /* IS_MODULE(CONFIG_KVM) */ +#endif /* EXPORT_SYMBOL_FOR_KVM */ #endif #ifndef __ASSEMBLER__ -- cgit v1.2.3 From f6a8919d61484ae9ca6b1855035fcfb2ba6e2af9 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 12 Nov 2025 15:47:48 -0800 Subject: vmlinux.lds: Fix TEXT_MAIN to include .text.start and friends Since: 6568f14cb5ae ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN") the TEXT_MAIN macro uses a series of patterns to prevent the .text.startup[.*] and .text.exit[.*] sections from getting linked into the vmlinux runtime .text. That commit is a tad too aggressive: it also inadvertently filters out valid runtime text sections like .text.start and .text.start.constprop.0, which can be generated for a function named start() when -ffunction-sections is enabled. As a result, those sections become orphans when building with CONFIG_LD_DEAD_CODE_DATA_ELIMINATION for arm: arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/usb/host/sl811-hcd.o' being placed in section `.text.start.constprop.0' arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/media/dvb-frontends/drxk_hard.o' being placed in section `.text.start.constprop.0' arm-linux-gnueabi-ld: warning: orphan section `.text.start' from `drivers/media/dvb-frontends/stv0910.o' being placed in section `.text.start' arm-linux-gnueabi-ld: warning: orphan section `.text.start.constprop.0' from `drivers/media/pci/ddbridge/ddbridge-sx8.o' being placed in section `.text.start.constprop.0' Fix that by explicitly adding the partial "substring" sections (.text.s, .text.st, .text.sta, etc) and their cloned derivatives. 
While this unfortunately means that TEXT_MAIN continues to grow, these changes are ultimately necessary for proper support of -ffunction-sections. Fixes: 6568f14cb5ae ("vmlinux.lds: Exclude .text.startup and .text.exit from TEXT_MAIN") Reported-by: kernel test robot Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: live-patching@vger.kernel.org Cc: Linus Torvalds Link: https://patch.msgid.link/cd588144e63df901a656b06b566855019c4a931d.1762991150.git.jpoimboe@kernel.org Closes: https://lore.kernel.org/oe-kbuild-all/202511040812.DFGedJiy-lkp@intel.com/ --- include/asm-generic/vmlinux.lds.h | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index cc060adfdc75..8f92d665cb0f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -90,8 +90,9 @@ * Support -ffunction-sections by matching .text and .text.*, * but exclude '.text..*', .text.startup[.*], and .text.exit[.*]. * - * .text.startup and .text.startup.* are matched later by INIT_TEXT. - * .text.exit and .text.exit.* are matched later by EXIT_TEXT. + * .text.startup and .text.startup.* are matched later by INIT_TEXT, and + * .text.exit and .text.exit.* are matched later by EXIT_TEXT, so they must be + * explicitly excluded here. 
* * Other .text.* sections that are typically grouped separately, such as * .text.unlikely or .text.hot, must be matched explicitly before using @@ -100,16 +101,16 @@ #define TEXT_MAIN \ .text \ .text.[_0-9A-Za-df-rt-z]* \ - .text.s[_0-9A-Za-su-z]* \ - .text.st[_0-9A-Zb-z]* \ - .text.sta[_0-9A-Za-qs-z]* \ - .text.star[_0-9A-Za-su-z]* \ - .text.start[_0-9A-Za-tv-z]* \ - .text.startu[_0-9A-Za-oq-z]* \ + .text.s[_0-9A-Za-su-z]* .text.s .text.s.* \ + .text.st[_0-9A-Zb-z]* .text.st .text.st.* \ + .text.sta[_0-9A-Za-qs-z]* .text.sta .text.sta.* \ + .text.star[_0-9A-Za-su-z]* .text.star .text.star.* \ + .text.start[_0-9A-Za-tv-z]* .text.start .text.start.* \ + .text.startu[_0-9A-Za-oq-z]* .text.startu .text.startu.* \ .text.startup[_0-9A-Za-z]* \ - .text.e[_0-9A-Za-wy-z]* \ - .text.ex[_0-9A-Za-hj-z]* \ - .text.exi[_0-9A-Za-su-z]* \ + .text.e[_0-9A-Za-wy-z]* .text.e .text.e.* \ + .text.ex[_0-9A-Za-hj-z]* .text.ex .text.ex.* \ + .text.exi[_0-9A-Za-su-z]* .text.exi .text.exi.* \ .text.exit[_0-9A-Za-z]* /* -- cgit v1.2.3 From 9c7dc1dd897a1cdcade9566ea4664b03fbabf4a4 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 12 Nov 2025 15:47:51 -0800 Subject: objtool: Warn on functions with ambiguous -ffunction-sections section names When compiled with -ffunction-sections, a function named startup() will be placed in .text.startup. However, .text.startup is also used by the compiler for functions with __attribute__((constructor)). That creates an ambiguity for the vmlinux linker script, which needs to differentiate those two cases. Similar naming conflicts exist for functions named exit(), split(), unlikely(), hot() and unknown(). One potential solution would be to use '#ifdef CC_USING_FUNCTION_SECTIONS' to create two distinct implementations of the TEXT_MAIN macro. However, -ffunction-sections can be (and is) enabled or disabled on a per-object basis (for example via ccflags-y or AUTOFDO_PROFILE). 
So the recently unified TEXT_MAIN macro (commit 1ba9f8979426 ("vmlinux.lds: Unify TEXT_MAIN, DATA_MAIN, and related macros")) is necessary. This means there's no way for the linker script to disambiguate things. Instead, use objtool to warn on any function names whose resulting section names might create ambiguity when the kernel is compiled (in whole or in part) with -ffunction-sections. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: live-patching@vger.kernel.org Cc: Linus Torvalds Link: https://patch.msgid.link/65fedea974fe14be487c8867a0b8d0e4a294ce1e.1762991150.git.jpoimboe@kernel.org --- include/asm-generic/vmlinux.lds.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 8f92d665cb0f..5efe1de2209b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -97,6 +97,21 @@ * Other .text.* sections that are typically grouped separately, such as * .text.unlikely or .text.hot, must be matched explicitly before using * TEXT_MAIN. + * + * NOTE: builds *with* and *without* -ffunction-sections are both supported by + * this single macro. Even with -ffunction-sections, there may be some objects + * NOT compiled with the flag due to the use of a specific Makefile override + * like cflags-y or AUTOFDO_PROFILE_foo.o. So this single catchall rule is + * needed to support mixed object builds. + * + * One implication is that functions named startup(), exit(), split(), + * unlikely(), hot(), and unknown() are not allowed in the kernel due to the + * ambiguity of their section names with -ffunction-sections. For example, + * .text.startup could be __attribute__((constructor)) code in a *non* + * ffunction-sections object, which should be placed in .init.text; or it could + * be an actual function named startup() in an ffunction-sections object, which + * should be placed in .text. 
Objtool will detect and complain about any such + * ambiguously named functions. */ #define TEXT_MAIN \ .text \ -- cgit v1.2.3 From fd3f646e1c9d783d1f4ef30e5376ccf315a8ae30 Mon Sep 17 00:00:00 2001 From: Isaac Scott Date: Wed, 29 Oct 2025 18:03:18 +0000 Subject: media: v4l: Add helper to get number of active lanes via a pad Sometimes, users will not use all of the MIPI CSI-2 lanes available when connecting to the MIPI CSI receiver of their device. Add a helper function that checks the mbus_config for the device driver to allow users to define the number of active data lanes through the get_mbus_config op. If the driver does not implement this op, fall back to using the maximum number of lanes available. Reviewed-by: Frank Li Reviewed-by: Laurent Pinchart Signed-off-by: Isaac Scott Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/v4l2-common.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h index 5c0a7f6b5bb6..f8b1faced79c 100644 --- a/include/media/v4l2-common.h +++ b/include/media/v4l2-common.h @@ -581,6 +581,26 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat, #ifdef CONFIG_MEDIA_CONTROLLER s64 v4l2_get_link_freq(const struct media_pad *pad, unsigned int mul, unsigned int div); + +/** + * v4l2_get_active_data_lanes - Get number of active data lanes from driver + * + * @pad: The transmitter's media pad. + * @max_data_lanes: The maximum number of active data lanes supported by + * the MIPI CSI link in hardware. + * + * This function is intended for obtaining the number of data lanes that are + * actively being used by the driver for a MIPI CSI-2 device on a given media pad. + * This information is derived from a mbus_config fetched from a device driver + * using the get_mbus_config v4l2_subdev pad op. 
+ * + * Return: + * * >0: Number of active data lanes + * * %-EINVAL: Number of active data lanes is invalid, as it exceeds the maximum + * supported data lanes. + */ +int v4l2_get_active_data_lanes(const struct media_pad *pad, + unsigned int max_data_lanes); #endif void v4l2_simplify_fraction(u32 *numerator, u32 *denominator, -- cgit v1.2.3 From 2bcd3800f2da1be13b972858f63c66d035b1ec6d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:15 +0000 Subject: slab: Reimplement page_slab() In order to separate slabs from folios, we need to convert from any page in a slab to the slab directly without going through a page to folio conversion first. Up to this point, page_slab() has followed the example of other memdesc converters (page_folio(), page_ptdesc() etc) and just cast the pointer to the requested type, regardless of whether the pointer is actually a pointer to the correct type or not. That changes with this commit; we check that the page actually belongs to a slab and return NULL if it does not. Other memdesc converters will adopt this convention in future. kfence was the only user of page_slab(), so adjust it to the new way of working. It will need to be touched again when we separate slab from page. 
Signed-off-by: Matthew Wilcox (Oracle) Cc: Alexander Potapenko Cc: Marco Elver Cc: kasan-dev@googlegroups.com Link: https://patch.msgid.link/20251113000932.1589073-2-willy@infradead.org Acked-by: David Hildenbrand (Red Hat) Tested-by: Marco Elver Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0091ad1986bf..6d5e44968eab 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1048,19 +1048,7 @@ PAGE_TYPE_OPS(Table, table, pgtable) */ PAGE_TYPE_OPS(Guard, guard, guard) -FOLIO_TYPE_OPS(slab, slab) - -/** - * PageSlab - Determine if the page belongs to the slab allocator - * @page: The page to test. - * - * Context: Any context. - * Return: True for slab pages, false for any other kind of page. - */ -static inline bool PageSlab(const struct page *page) -{ - return folio_test_slab(page_folio(page)); -} +PAGE_TYPE_OPS(Slab, slab, slab) #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) -- cgit v1.2.3 From ee1ee8abc4197e21594ca29348629ccbfff4daec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Nov 2025 00:09:16 +0000 Subject: slab: Remove folio references from __ksize() In the future, we will separate slab, folio and page from each other and calling virt_to_folio() on an address allocated from slab will return NULL. Delay the conversion from struct page to struct slab until we know we're not dealing with a large kmalloc allocation. There's a minor win for large kmalloc allocations as we avoid the compound_head() hidden in virt_to_folio(). This deprecates calling ksize() on memory allocated by alloc_pages(). Today it becomes a warning and support will be removed entirely in the future. Introduce large_kmalloc_size() to abstract how we represent the size of a large kmalloc allocation. 
For now, this is the same as page_size(), but it will change with separately allocated memdescs. Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251113000932.1589073-3-willy@infradead.org Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6d5e44968eab..f7a0e4af0c73 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -1064,7 +1064,7 @@ PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) -FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) +PAGE_TYPE_OPS(LargeKmalloc, large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs -- cgit v1.2.3 From 4f49088c162579a4ed049c555fe0cd188fd928c4 Mon Sep 17 00:00:00 2001 From: Khairul Anuar Romli Date: Wed, 8 Oct 2025 17:09:05 +0800 Subject: firmware: stratix10-svc: Add definition for voltage and temperature sensor Add entry in Stratix 10 Service Layer to support temperature and voltage sensor. 
Signed-off-by: Khairul Anuar Romli Signed-off-by: Dinh Nguyen --- include/linux/firmware/intel/stratix10-smc.h | 34 ++++++++++++++++++++++ .../linux/firmware/intel/stratix10-svc-client.h | 8 ++++- 2 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index ee80ca4bb0d0..7306dd243b2a 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -620,4 +620,38 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_FCS_GET_PROVISION_DATA \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FCS_GET_PROVISION_DATA) +/** + * Request INTEL_SIP_SMC_HWMON_READTEMP + * Sync call to request temperature + * + * Call register usage: + * a0 Temperature Channel + * a1-a7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1 Temperature Value + * a2-a3 not used + */ +#define INTEL_SIP_SMC_FUNCID_HWMON_READTEMP 32 +#define INTEL_SIP_SMC_HWMON_READTEMP \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READTEMP) + +/** + * Request INTEL_SIP_SMC_HWMON_READVOLT + * Sync call to request voltage + * + * Call register usage: + * a0 Voltage Channel + * a1-a7 not used + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1 Voltage Value + * a2-a3 not used + */ +#define INTEL_SIP_SMC_FUNCID_HWMON_READVOLT 33 +#define INTEL_SIP_SMC_HWMON_READVOLT \ + INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT) + #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 60ed82112680..520004a5f15d 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -11,12 +11,14 @@ * * fpga: for FPGA configuration * rsu: for remote status update + * hwmon: for hardware monitoring (voltage and temperature) */ #define 
SVC_CLIENT_FPGA "fpga" #define SVC_CLIENT_RSU "rsu" #define SVC_CLIENT_FCS "fcs" +#define SVC_CLIENT_HWMON "hwmon" -/* +/** * Status of the sent command, in bit number * * SVC_STATUS_OK: @@ -70,6 +72,7 @@ #define SVC_RSU_REQUEST_TIMEOUT_MS 300 #define SVC_FCS_REQUEST_TIMEOUT_MS 2000 #define SVC_COMPLETED_TIMEOUT_MS 30000 +#define SVC_HWMON_REQUEST_TIMEOUT_MS 300 struct stratix10_svc_chan; @@ -171,6 +174,9 @@ enum stratix10_svc_command_code { COMMAND_MBOX_SEND_CMD = 100, /* Non-mailbox SMC Call */ COMMAND_SMC_SVC_VERSION = 200, + /* for HWMON */ + COMMAND_HWMON_READTEMP, + COMMAND_HWMON_READVOLT }; /** -- cgit v1.2.3 From bcb9f4f0706147afc62c48533276a18fe7b8f354 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Mon, 27 Oct 2025 22:54:41 +0800 Subject: firmware: stratix10-svc: Add support for async communication Introduce support for asynchronous communication with the Stratix10 service channel. Define new structures to enable asynchronous messaging with the Secure Device Manager (SDM). Add and remove asynchronous support for existing channels. Implement initialization and cleanup routines for the asynchronous framework. Enable sending and polling of messages to the SDM asynchronously. The new public functions added are: - stratix10_svc_add_async_client: Adds a client to the service channel. - stratix10_svc_remove_async_client: Removes an asynchronous client from the service channel. - stratix10_svc_async_send: Sends an asynchronous message to the SDM mailbox in EL3 secure firmware. - stratix10_svc_async_poll: Polls the status of an asynchronous service request in EL3 secure firmware. - stratix10_svc_async_done: Marks an asynchronous transaction as complete and frees up the resources. These changes enhance the functionality of the Stratix10 service channel by allowing for more efficient and flexible communication with the firmware. 
Signed-off-by: Mahesh Rao Reviewed-by: Matthew Gerlach Signed-off-by: Dinh Nguyen --- include/linux/firmware/intel/stratix10-smc.h | 25 ++++++ .../linux/firmware/intel/stratix10-svc-client.h | 88 ++++++++++++++++++++++ 2 files changed, 113 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 7306dd243b2a..3995d5d70cce 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017-2018, Intel Corporation + * Copyright (C) 2025, Altera Corporation */ #ifndef __STRATIX10_SMC_H @@ -47,6 +48,10 @@ ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, ARM_SMCCC_SMC_64, \ ARM_SMCCC_OWNER_SIP, (func_num)) +#define INTEL_SIP_SMC_ASYNC_VAL(func_name) \ + ARM_SMCCC_CALL_VAL(ARM_SMCCC_STD_CALL, ARM_SMCCC_SMC_64, \ + ARM_SMCCC_OWNER_SIP, (func_name)) + /** * Return values in INTEL_SIP_SMC_* call * @@ -654,4 +659,24 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_HWMON_READVOLT \ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_HWMON_READVOLT) +/** + * Request INTEL_SIP_SMC_ASYNC_POLL + * Async call used by service driver at EL1 to query mailbox response from SDM. 
+ * + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_POLL + * a1 transaction job id + * a2-17 will be used to return the response data + * + * Return status + * a0 INTEL_SIP_SMC_STATUS_OK + * a1-17 will contain the response values from mailbox for the previous send + * transaction + * Or + * a0 INTEL_SIP_SMC_STATUS_NO_RESPONSE + * a1-17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8) +#define INTEL_SIP_SMC_ASYNC_POLL \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL) #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 520004a5f15d..532dd4bd76dd 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2017-2018, Intel Corporation + * Copyright (C) 2025, Altera Corporation */ #ifndef __STRATIX10_SVC_CLIENT_H @@ -290,5 +291,92 @@ int stratix10_svc_send(struct stratix10_svc_chan *chan, void *msg); * request process. */ void stratix10_svc_done(struct stratix10_svc_chan *chan); + +/** + * typedef async_callback_t - A type definition for an asynchronous callback function. + * + * This type defines a function pointer for an asynchronous callback. + * The callback function takes a single argument, which is a pointer to + * user-defined data. + * + * @param cb_arg A pointer to user-defined data passed to the callback function. + */ +typedef void (*async_callback_t)(void *cb_arg); + +/** + * stratix10_svc_add_async_client - Add an asynchronous client to a Stratix 10 + * service channel. + * @chan: Pointer to the Stratix 10 service channel structure. + * @use_unique_clientid: Boolean flag indicating whether to use a unique client ID. + * + * This function registers an asynchronous client with the specified Stratix 10 + * service channel. 
If the use_unique_clientid flag is set to true, a unique client + * ID will be assigned to the client. + * + * Return: 0 on success, or a negative error code on failure: + * -EINVAL if the channel is NULL or the async controller is not initialized. + * -EALREADY if the async channel is already allocated. + * -ENOMEM if memory allocation fails. + * Other negative values if ID allocation fails + */ +int stratix10_svc_add_async_client(struct stratix10_svc_chan *chan, bool use_unique_clientid); + +/** + * stratix10_svc_remove_async_client - Remove an asynchronous client from the Stratix 10 + * service channel. + * @chan: Pointer to the Stratix 10 service channel structure. + * + * This function removes an asynchronous client from the specified Stratix 10 service channel. + * It is typically used to clean up and release resources associated with the client. + * + * Return: 0 on success, -EINVAL if the channel or asynchronous channel is invalid. + */ +int stratix10_svc_remove_async_client(struct stratix10_svc_chan *chan); + +/** + * stratix10_svc_async_send - Send an asynchronous message to the SDM mailbox + * in EL3 secure firmware. + * @chan: Pointer to the service channel structure. + * @msg: Pointer to the message to be sent. + * @handler: Pointer to the handler object used by caller to track the transaction. + * @cb: Callback function to be called upon completion. + * @cb_arg: Argument to be passed to the callback function. + * + * This function sends a message asynchronously to the SDM mailbox in EL3 secure firmware. + * and registers a callback function to be invoked when the operation completes. + * + * Return: 0 on success,and negative error codes on failure. + */ +int stratix10_svc_async_send(struct stratix10_svc_chan *chan, void *msg, void **handler, + async_callback_t cb, void *cb_arg); + +/** + * stratix10_svc_async_poll - Polls the status of an asynchronous service request. + * @chan: Pointer to the service channel structure. 
+ * @tx_handle: Handle to the transaction being polled. + * @data: Pointer to the callback data structure to be filled with the result. + * + * This function checks the status of an asynchronous service request + * and fills the provided callback data structure with the result. + * + * Return: 0 on success, -EINVAL if any input parameter is invalid or if the + * async controller is not initialized, -EAGAIN if the transaction is + * still in progress, or other negative error codes on failure. + */ +int stratix10_svc_async_poll(struct stratix10_svc_chan *chan, void *tx_handle, + struct stratix10_svc_cb_data *data); + +/** + * stratix10_svc_async_done - Complete an asynchronous transaction + * @chan: Pointer to the service channel structure + * @tx_handle: Pointer to the transaction handle + * + * This function completes an asynchronous transaction by removing the + * transaction from the hash table and deallocating the associated resources. + * + * Return: 0 on success, -EINVAL on invalid input or errors. + */ +int stratix10_svc_async_done(struct stratix10_svc_chan *chan, void *tx_handle); + #endif -- cgit v1.2.3 From ec52379341a1209826c3e0ae53674393724d2071 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Mon, 27 Oct 2025 22:54:42 +0800 Subject: firmware: stratix10-svc: Add support for RSU commands in asynchronous framework Integrate Remote System Update(RSU) service commands into the asynchronous framework for communicating with SDM. This allows the RSU commands to be processed asynchronously, improving the responsiveness of the Stratix10 service channel. 
The asynchronous framework now supports the following RSU commands: * COMMAND_RSU_GET_SPT_TABLE * COMMAND_RSU_STATUS * COMMAND_RSU_NOTIFY Signed-off-by: Mahesh Rao Reviewed-by: Matthew Gerlach Signed-off-by: Dinh Nguyen --- include/linux/firmware/intel/stratix10-smc.h | 52 ++++++++++++++++++++++ .../linux/firmware/intel/stratix10-svc-client.h | 4 ++ 2 files changed, 56 insertions(+) (limited to 'include') diff --git a/include/linux/firmware/intel/stratix10-smc.h b/include/linux/firmware/intel/stratix10-smc.h index 3995d5d70cce..935dba3633b5 100644 --- a/include/linux/firmware/intel/stratix10-smc.h +++ b/include/linux/firmware/intel/stratix10-smc.h @@ -679,4 +679,56 @@ INTEL_SIP_SMC_FAST_CALL_VAL(INTEL_SIP_SMC_FUNCID_FPGA_CONFIG_COMPLETED_WRITE) #define INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL (0xC8) #define INTEL_SIP_SMC_ASYNC_POLL \ INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_POLL) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_GET_SPT + * Async call to get RSU SPT from SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_SPT + * a1 transaction job id + * a2-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT (0xEA) +#define INTEL_SIP_SMC_ASYNC_RSU_GET_SPT \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_SPT) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS + * Async call to get RSU error status from SDM. 
+ * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS + * a1 transaction job id + * a2-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS (0xEB) +#define INTEL_SIP_SMC_ASYNC_RSU_GET_ERROR_STATUS \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_GET_ERROR_STATUS) + +/** + * Request INTEL_SIP_SMC_ASYNC_RSU_NOTIFY + * Async call to send NOTIFY value to SDM. + * Call register usage: + * a0 INTEL_SIP_SMC_ASYNC_RSU_NOTIFY + * a1 transaction job id + * a2 notify value + * a3-a17 not used + * + * Return status: + * a0 INTEL_SIP_SMC_STATUS_OK ,INTEL_SIP_SMC_STATUS_REJECTED + * or INTEL_SIP_SMC_STATUS_BUSY + * a1-a17 not used + */ +#define INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY (0xEC) +#define INTEL_SIP_SMC_ASYNC_RSU_NOTIFY \ + INTEL_SIP_SMC_ASYNC_VAL(INTEL_SIP_SMC_ASYNC_FUNC_ID_RSU_NOTIFY) #endif diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 532dd4bd76dd..1bcc56d14080 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -128,6 +128,9 @@ struct stratix10_svc_chan; * @COMMAND_RSU_DCMF_STATUS: query firmware for the DCMF status * return status is SVC_STATUS_OK or SVC_STATUS_ERROR * + * @COMMAND_RSU_GET_SPT_TABLE: query firmware for SPT table + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR + * * @COMMAND_FCS_REQUEST_SERVICE: request validation of image from firmware, * return status is SVC_STATUS_OK, SVC_STATUS_INVALID_PARAM * @@ -162,6 +165,7 @@ enum stratix10_svc_command_code { COMMAND_RSU_DCMF_VERSION, COMMAND_RSU_DCMF_STATUS, COMMAND_FIRMWARE_VERSION, + COMMAND_RSU_GET_SPT_TABLE, /* for FCS */ COMMAND_FCS_REQUEST_SERVICE = 20, COMMAND_FCS_SEND_CERTIFICATE, -- cgit v1.2.3 From 524c3853831cf4f7e1db579e487c757c3065165c Mon Sep 17 
00:00:00 2001 From: Tetsuo Handa Date: Wed, 22 Oct 2025 20:11:37 +0900 Subject: jbd2: use a per-journal lock_class_key for jbd2_trans_commit_key syzbot is reporting possibility of deadlock due to sharing lock_class_key for jbd2_handle across ext4 and ocfs2. But this is a false positive, for one disk partition can't have two filesystems at the same time. Reported-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6e493c165d26d6fcbf72 Signed-off-by: Tetsuo Handa Tested-by: syzbot+6e493c165d26d6fcbf72@syzkaller.appspotmail.com Reviewed-by: Jan Kara Message-ID: <987110fc-5470-457a-a218-d286a09dd82f@I-love.SAKURA.ne.jp> Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- include/linux/jbd2.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 43b9297fe8a7..f5eaf76198f3 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1253,6 +1253,12 @@ struct journal_s */ struct lockdep_map j_trans_commit_map; #endif + /** + * @jbd2_trans_commit_key: + * + * "struct lock_class_key" for @j_trans_commit_map + */ + struct lock_class_key jbd2_trans_commit_key; /** * @j_fc_cleanup_callback: -- cgit v1.2.3 From f694d215d34035cc64b1d176fd82db0d1f2428d4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 11 Nov 2025 11:26:44 +0000 Subject: net: stmmac: always allocate mac_device_info The ->setup() method implemented by dwmac-loongson and dwmac-sun8i allocate the mac_device_info structure, as does stmmac_hwif_init(). This makes no sense. Have stmmac_hwif_init() always allocate this structure, and pass it to the ->setup() method to initialise when it is provided. Rename this method to "mac_setup" to more accurately describe what it is doing. 
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vImWK-0000000DrIx-28vO@rmk-PC.armlinux.org.uk Signed-off-by: Paolo Abeni --- include/linux/stmmac.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 48e9f1d4e17e..4f70a6551e68 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -192,6 +192,8 @@ enum dwmac_core_type { #define STMMAC_FLAG_EN_TX_LPI_CLK_PHY_CAP BIT(12) #define STMMAC_FLAG_HWTSTAMP_CORRECT_LATENCY BIT(13) +struct mac_device_info; + struct plat_stmmacenet_data { enum dwmac_core_type core_type; int bus_id; @@ -266,7 +268,7 @@ struct plat_stmmacenet_data { void (*exit)(struct platform_device *pdev, void *priv); int (*suspend)(struct device *dev, void *priv); int (*resume)(struct device *dev, void *priv); - struct mac_device_info *(*setup)(void *priv); + int (*mac_setup)(void *priv, struct mac_device_info *mac); int (*clks_config)(void *priv, bool enabled); int (*crosststamp)(ktime_t *device, struct system_counterval_t *system, void *ctx); -- cgit v1.2.3 From 0f2620ffc41d117cc28bc053efe2dc837cf748dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:42 +0100 Subject: fault-inject: make enum fault_flags available unconditionally This will allow using should_fail_ex from code without having to make it conditional on CONFIG_FAULT_INJECTION. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-2-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/fault-inject.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 8c829d28dcf3..58fd14c82270 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -8,6 +8,10 @@ struct dentry; struct kmem_cache; +enum fault_flags { + FAULT_NOWARN = 1 << 0, +}; + #ifdef CONFIG_FAULT_INJECTION #include @@ -36,10 +40,6 @@ struct fault_attr { struct dentry *dname; }; -enum fault_flags { - FAULT_NOWARN = 1 << 0, -}; - #define FAULT_ATTR_INITIALIZER { \ .interval = 1, \ .times = ATOMIC_INIT(1), \ -- cgit v1.2.3 From 2647e2ecc096d2330d6b6a34a3a1f0a99828c14c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:48:57 +0000 Subject: io_uring/query: introduce zcrx query Add a new query type IO_URING_QUERY_ZCRX returning the user some basic information about the interface, which includes allowed flags for areas and registration and supported IORING_REGISTER_ZCRX_CTRL subcodes. There is also a chicken-egg problem with user provided refill queue memory, where offsets and size information is returned after registration, but to properly allocate memory you need to know it beforehand, which is why the userspace currently has to guess the RQ headers size and severely overestimates it. Return the size information. It's split into "size" and "alignment" fields because for default placement modes the user is interested in the aligned size, however if it gets support for more flexible placement, it'll need to only know the actual header size. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index 3539ccbfd064..fc0cb1580e47 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -18,6 +18,7 @@ struct io_uring_query_hdr { enum { IO_URING_QUERY_OPCODES = 0, + IO_URING_QUERY_ZCRX = 1, __IO_URING_QUERY_MAX, }; @@ -41,4 +42,19 @@ struct io_uring_query_opcode { __u32 __pad; }; +struct io_uring_query_zcrx { + /* Bitmask of supported ZCRX_REG_* flags, */ + __u64 register_flags; + /* Bitmask of all supported IORING_ZCRX_AREA_* flags */ + __u64 area_flags; + /* The number of supported ZCRX_CTRL_* opcodes */ + __u32 nr_ctrl_opcodes; + __u32 __resv1; + /* The refill ring header size */ + __u32 rq_hdr_size; + /* The alignment for the header */ + __u32 rq_hdr_alignment; + __u64 __resv2; +}; + #endif -- cgit v1.2.3 From 4aaa9bc4d5921363490d95fe66c4db086a915799 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:48:58 +0000 Subject: io_uring/query: introduce rings info query Same problem as with zcrx in the previous patch, the user needs to know SQ/CQ header sizes to allocate memory before setup to use it for user provided rings, i.e. IORING_SETUP_NO_MMAP, however that information is only returned after registration, hence the user is guessing kernel implementation details. Return the header size and alignment, which is split with the same motivation, to allow the user to know the real structure size without alignment in case there will be more flexible placement schemes in the future. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring/query.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring/query.h b/include/uapi/linux/io_uring/query.h index fc0cb1580e47..2456e6c5ebb5 100644 --- a/include/uapi/linux/io_uring/query.h +++ b/include/uapi/linux/io_uring/query.h @@ -19,6 +19,7 @@ struct io_uring_query_hdr { enum { IO_URING_QUERY_OPCODES = 0, IO_URING_QUERY_ZCRX = 1, + IO_URING_QUERY_SCQ = 2, __IO_URING_QUERY_MAX, }; @@ -57,4 +58,11 @@ struct io_uring_query_zcrx { __u64 __resv2; }; +struct io_uring_query_scq { + /* The SQ/CQ rings header size */ + __u64 hdr_size; + /* The alignment for the header */ + __u64 hdr_alignment; +}; + #endif -- cgit v1.2.3 From d663976dad68de9b2e3df59cc31f0a24ee4c4511 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:12 +0000 Subject: io_uring/zcrx: introduce IORING_REGISTER_ZCRX_CTRL It'll be annoying and take enough of boilerplate code to implement new zcrx features as separate io_uring register opcode. Introduce IORING_REGISTER_ZCRX_CTRL that will multiplex such calls to zcrx. Note, there are no real users of the opcode in this patch. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index e96080db3e4d..0e1d353fab1d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -697,6 +697,9 @@ enum io_uring_register_op { /* query various aspects of io_uring, see linux/io_uring/query.h */ IORING_REGISTER_QUERY = 35, + /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */ + IORING_REGISTER_ZCRX_CTRL = 36, + /* this goes last */ IORING_REGISTER_LAST, @@ -1078,6 +1081,16 @@ struct io_uring_zcrx_ifq_reg { __u64 __resv[3]; }; +enum zcrx_ctrl_op { + __ZCRX_CTRL_LAST, +}; + +struct zcrx_ctrl { + __u32 zcrx_id; + __u32 op; /* see enum zcrx_ctrl_op */ + __u64 __resv[8]; +}; + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 475eb39b00478b1898bc9080344dcd8e86c53c7a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:13 +0000 Subject: io_uring/zcrx: add sync refill queue flushing Add a zcrx interface via IORING_REGISTER_ZCRX_CTRL that forces the kernel to flush / consume entries from the refill queue. Just as with the IORING_REGISTER_ZCRX_REFILL attempt, the motivation is to address cases where the refill queue becomes full, and the user can't return buffers and needs to stash them. It's still a slow path, and the user should size the refill queue appropriately, but it should be helpful for handling temporary traffic spikes and other unpredictable conditions. The interface is simpler compared to ZCRX_REFILL as it doesn't need temporary refill entry arrays and gives natural batching, whereas ZCRX_REFILL requires even more user logic to be somewhat efficient. Also, add a structure for the operation. It's not currently used but can serve for future improvements like limiting the number of buffers to process, etc. 
Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0e1d353fab1d..db47fced2cc6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1082,13 +1082,21 @@ struct io_uring_zcrx_ifq_reg { }; enum zcrx_ctrl_op { + ZCRX_CTRL_FLUSH_RQ, + __ZCRX_CTRL_LAST, }; +struct zcrx_ctrl_flush_rq { + __u64 __resv[6]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ - __u64 __resv[8]; + __u64 __resv[2]; + + struct zcrx_ctrl_flush_rq zc_flush; }; #ifdef __cplusplus -- cgit v1.2.3 From d7af80b213e5675664b14f12240cb282e81773d5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Nov 2025 10:46:16 +0000 Subject: io_uring/zcrx: export zcrx via a file Add an option to wrap a zcrx instance into a file and expose it to the user space. Currently, users can't do anything meaningful with the file, but it'll be used in a next patch to import it into another io_uring instance. It's implemented as a new op called ZCRX_CTRL_EXPORT for the IORING_REGISTER_ZCRX_CTRL registration opcode. 
Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index db47fced2cc6..4bedc0310a55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1083,6 +1083,7 @@ struct io_uring_zcrx_ifq_reg { enum zcrx_ctrl_op { ZCRX_CTRL_FLUSH_RQ, + ZCRX_CTRL_EXPORT, __ZCRX_CTRL_LAST, }; @@ -1091,12 +1092,20 @@ struct zcrx_ctrl_flush_rq { __u64 __resv[6]; }; +struct zcrx_ctrl_export { + __u32 zcrx_fd; + __u32 __resv1[11]; +}; + struct zcrx_ctrl { __u32 zcrx_id; __u32 op; /* see enum zcrx_ctrl_op */ __u64 __resv[2]; - struct zcrx_ctrl_flush_rq zc_flush; + union { + struct zcrx_ctrl_export zc_export; + struct zcrx_ctrl_flush_rq zc_flush; + }; }; #ifdef __cplusplus -- cgit v1.2.3 From 00d91481279fb2df8c46d19090578afd523ca630 Mon Sep 17 00:00:00 2001 From: David Wei Date: Thu, 13 Nov 2025 10:46:18 +0000 Subject: io_uring/zcrx: share an ifq between rings Add a way to share an ifq from a src ring that is real (i.e. bound to a HW RX queue) with other rings. This is done by passing a new flag IORING_ZCRX_IFQ_REG_IMPORT in the registration struct io_uring_zcrx_ifq_reg, alongside the fd of an exported zcrx ifq. 
Signed-off-by: David Wei Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4bedc0310a55..deb772222b6d 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1063,6 +1063,10 @@ struct io_uring_zcrx_area_reg { __u64 __resv2[2]; }; +enum zcrx_reg_flags { + ZCRX_REG_IMPORT = 1, +}; + /* * Argument for IORING_REGISTER_ZCRX_IFQ */ -- cgit v1.2.3 From 36640d21fdfe0152c96e6cb9b58e3336291dfbaa Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Wed, 29 Oct 2025 13:34:49 +0530 Subject: PCI: Export pci_get_host_bridge_device() for use by pci-keystone The pci-keystone.c driver uses the 'pci_get_host_bridge_device()' helper. Export it in preparation for enabling the pci-keystone.c driver to be built as a loadable module. Signed-off-by: Siddharth Vadapalli Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251029080547.1253757-2-s-vadapalli@ti.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..b253cbc27d36 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -646,6 +646,7 @@ struct pci_host_bridge *pci_alloc_host_bridge(size_t priv); struct pci_host_bridge *devm_pci_alloc_host_bridge(struct device *dev, size_t priv); void pci_free_host_bridge(struct pci_host_bridge *bridge); +struct device *pci_get_host_bridge_device(struct pci_dev *dev); struct pci_host_bridge *pci_find_host_bridge(struct pci_bus *bus); void pci_set_host_bridge_release(struct pci_host_bridge *bridge, -- cgit v1.2.3 From 8d63e85c5b50f1dbfa0ccb214bd91fe5d7e2e860 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Nov 2025 11:26:53 -0800 Subject: firmware: cs_dsp: fix kernel-doc warnings in a header file Use correct kernel-doc 
format to avoid kernel-doc warnings in include/linux/firmware/cirrus/cs_dsp_test_utils.h: - mark one struct member as private: since the comment says that it is private - add ending ':' to struct members where needed Warning: include/linux/firmware/cirrus/cs_dsp_test_utils.h:30 struct member 'saw_bus_write' not described in 'cs_dsp_test' Warning: include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'id' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'ver' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'xm_base_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'xm_size_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'ym_base_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'ym_size_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'zm_base_words' not described in 'cs_dsp_mock_alg_def' Warning: ../include/linux/firmware/cirrus/cs_dsp_test_utils.h:53 struct member 'zm_size_words' not described in 'cs_dsp_mock_alg_def' Signed-off-by: Randy Dunlap Reviewed-by: Richard Fitzgerald Link: https://patch.msgid.link/20251104192653.929157-1-rdunlap@infradead.org Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp_test_utils.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp_test_utils.h b/include/linux/firmware/cirrus/cs_dsp_test_utils.h index ecd821ed8064..1f97764fdfd7 100644 --- a/include/linux/firmware/cirrus/cs_dsp_test_utils.h +++ b/include/linux/firmware/cirrus/cs_dsp_test_utils.h @@ -26,21 +26,21 @@ struct 
cs_dsp_test { struct cs_dsp_test_local *local; - /* Following members are private */ + /* private: Following members are private */ bool saw_bus_write; }; /** * struct cs_dsp_mock_alg_def - Info for creating a mock algorithm entry. * - * @id Algorithm ID. - * @ver; Algorithm version. - * @xm_base_words XM base address in DSP words. - * @xm_size_words XM size in DSP words. - * @ym_base_words YM base address in DSP words. - * @ym_size_words YM size in DSP words. - * @zm_base_words ZM base address in DSP words. - * @zm_size_words ZM size in DSP words. + * @id: Algorithm ID. + * @ver: Algorithm version. + * @xm_base_words: XM base address in DSP words. + * @xm_size_words: XM size in DSP words. + * @ym_base_words: YM base address in DSP words. + * @ym_size_words: YM size in DSP words. + * @zm_base_words: ZM base address in DSP words. + * @zm_size_words: ZM size in DSP words. */ struct cs_dsp_mock_alg_def { unsigned int id; -- cgit v1.2.3 From 280b7cdddc3d96c4887fdb31b6766e4db1b2f2a3 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Sat, 1 Nov 2025 05:00:31 +0000 Subject: dt-bindings: clock: renesas,r9a09g057-cpg: Add USB3.0 core clocks Add definitions for USB3.0 core clocks in the R9A09G057 CPG DT bindings header file. 
Signed-off-by: Lad Prabhakar Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251101050034.738807-2-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g057-cpg.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g057-cpg.h b/include/dt-bindings/clock/renesas,r9a09g057-cpg.h index 5346a898ab60..f91d7f72922a 100644 --- a/include/dt-bindings/clock/renesas,r9a09g057-cpg.h +++ b/include/dt-bindings/clock/renesas,r9a09g057-cpg.h @@ -22,5 +22,9 @@ #define R9A09G057_GBETH_0_CLK_PTP_REF_I 11 #define R9A09G057_GBETH_1_CLK_PTP_REF_I 12 #define R9A09G057_SPI_CLK_SPI 13 +#define R9A09G057_USB3_0_REF_ALT_CLK_P 14 +#define R9A09G057_USB3_0_CLKCORE 15 +#define R9A09G057_USB3_1_REF_ALT_CLK_P 16 +#define R9A09G057_USB3_1_CLKCORE 17 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G057_CPG_H__ */ -- cgit v1.2.3 From a95ce05cd0cc8b53f1559390c4e690bb8f79562f Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Sat, 1 Nov 2025 05:00:32 +0000 Subject: dt-bindings: clock: renesas,r9a09g056-cpg: Add USB3.0 core clocks Add definitions for USB3.0 core clocks in the R9A09G056 CPG DT bindings header file. 
Signed-off-by: Lad Prabhakar Acked-by: Krzysztof Kozlowski Reviewed-by: Geert Uytterhoeven Link: https://patch.msgid.link/20251101050034.738807-3-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- include/dt-bindings/clock/renesas,r9a09g056-cpg.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/renesas,r9a09g056-cpg.h b/include/dt-bindings/clock/renesas,r9a09g056-cpg.h index a9af5af9e3a1..234dcf4f0f91 100644 --- a/include/dt-bindings/clock/renesas,r9a09g056-cpg.h +++ b/include/dt-bindings/clock/renesas,r9a09g056-cpg.h @@ -21,5 +21,7 @@ #define R9A09G056_GBETH_0_CLK_PTP_REF_I 10 #define R9A09G056_GBETH_1_CLK_PTP_REF_I 11 #define R9A09G056_SPI_CLK_SPI 12 +#define R9A09G056_USB3_0_REF_ALT_CLK_P 13 +#define R9A09G056_USB3_0_CLKCORE 14 #endif /* __DT_BINDINGS_CLOCK_RENESAS_R9A09G056_CPG_H__ */ -- cgit v1.2.3 From e5b5f8b7c26f72fe86b59979e51d8e6cf36ea903 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:40 -0800 Subject: PCI/TSM: Drop stub for pci_tsm_doe_transfer() Just like pci_tsm_pf0_{con,de}structor(), in the CONFIG_PCI_TSM=n case there should be no callers of pci_tsm_doe_transfer(). 
Reported-by: Xu Yilun Closes: http://lore.kernel.org/aRFfk14DJWEVhC/R@yilunxu-OptiPlex-7050 Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-3-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-tsm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index e921d30f9b6c..d7b078d5e272 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -147,11 +147,5 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev) static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) { } -static inline int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, - const void *req, size_t req_sz, - void *resp, size_t resp_sz) -{ - return -ENXIO; -} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From c16af019d9d6d23f211c82b5561f2ecd2a7dff54 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:41 -0800 Subject: resource: Introduce resource_assigned() for discerning active resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A PCI bridge resource lifecycle involves both a "request" and "assign" phase. At any point in time that resource may not yet be assigned, or may have failed to assign (because it does not fit). There are multiple conventions to determine when assignment has not completed: IORESOURCE_UNSET, IORESOURCE_DISABLED, and checking whether the resource is parented. In code paths that are known to not be racing assignment, e.g. post subsys_initcall(), the most reliable method to judge that a bridge resource is assigned is to check the resource is parented [1]. Introduce a resource_assigned() helper for this purpose. 
Link: http://lore.kernel.org/2b9f7f7b-d6a4-be59-14d4-7b4ffccfe373@linux.intel.com [1] Suggested-by: Ilpo Järvinen Cc: Bjorn Helgaas Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-4-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/ioport.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index e8b2d6aa4013..9afa30f9346f 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -334,6 +334,15 @@ static inline bool resource_union(const struct resource *r1, const struct resour return true; } +/* + * Check if this resource is added to a resource tree or detached. Caller is + * responsible for not racing assignment. + */ +static inline bool resource_assigned(struct resource *res) +{ + return res->parent; +} + int find_resource_space(struct resource *root, struct resource *new, resource_size_t size, struct resource_constraint *constraint); -- cgit v1.2.3 From 4aa73c6051cb65046e6fa601b7877b5c1e6edc85 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 12 Nov 2025 21:46:24 +0100 Subject: net: dsa: remove definition of struct dsa_switch_driver Since 93e86b3bc842 ("net: dsa: Remove legacy probing support") this struct has no user any longer. 
Signed-off-by: Heiner Kallweit Reviewed-by: Vladimir Oltean Link: https://patch.msgid.link/4053a98f-052f-4dc1-a3d4-ed9b3d3cc7cb@gmail.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index 2df2e2ead9a8..97d5f401cfcf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1314,11 +1314,6 @@ static inline int dsa_devlink_port_to_port(struct devlink_port *port) return port->index; } -struct dsa_switch_driver { - struct list_head list; - const struct dsa_switch_ops *ops; -}; - bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); -- cgit v1.2.3 From 947643509279a605a09959a06d332bf027e8be57 Mon Sep 17 00:00:00 2001 From: Mikhail Kshevetskiy Date: Mon, 10 Nov 2025 06:56:43 +0300 Subject: dt-bindings: clock: airoha: Add reset support to EN7523 clock binding Introduce reset capability to EN7523 device-tree clock binding documentation. Signed-off-by: Mikhail Kshevetskiy Reviewed-by: Rob Herring (Arm) Signed-off-by: Stephen Boyd --- include/dt-bindings/reset/airoha,en7523-reset.h | 61 +++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 include/dt-bindings/reset/airoha,en7523-reset.h (limited to 'include') diff --git a/include/dt-bindings/reset/airoha,en7523-reset.h b/include/dt-bindings/reset/airoha,en7523-reset.h new file mode 100644 index 000000000000..211e8a23a21c --- /dev/null +++ b/include/dt-bindings/reset/airoha,en7523-reset.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2024 iopsys Software Solutions AB. + * Copyright (C) 2025 Genexis AB. 
+ * + * Author: Mikhail Kshevetskiy + * + * based on + * include/dt-bindings/reset/airoha,en7581-reset.h + * by Lorenzo Bianconi + */ + +#ifndef __DT_BINDINGS_RESET_CONTROLLER_AIROHA_EN7523_H_ +#define __DT_BINDINGS_RESET_CONTROLLER_AIROHA_EN7523_H_ + +/* RST_CTRL2 */ +#define EN7523_XPON_PHY_RST 0 +#define EN7523_XSI_MAC_RST 1 +#define EN7523_XSI_PHY_RST 2 +#define EN7523_NPU_RST 3 +#define EN7523_I2S_RST 4 +#define EN7523_TRNG_RST 5 +#define EN7523_TRNG_MSTART_RST 6 +#define EN7523_DUAL_HSI0_RST 7 +#define EN7523_DUAL_HSI1_RST 8 +#define EN7523_HSI_RST 9 +#define EN7523_DUAL_HSI0_MAC_RST 10 +#define EN7523_DUAL_HSI1_MAC_RST 11 +#define EN7523_HSI_MAC_RST 12 +#define EN7523_WDMA_RST 13 +#define EN7523_WOE0_RST 14 +#define EN7523_WOE1_RST 15 +#define EN7523_HSDMA_RST 16 +#define EN7523_I2C2RBUS_RST 17 +#define EN7523_TDMA_RST 18 +/* RST_CTRL1 */ +#define EN7523_PCM1_ZSI_ISI_RST 19 +#define EN7523_FE_PDMA_RST 20 +#define EN7523_FE_QDMA_RST 21 +#define EN7523_PCM_SPIWP_RST 22 +#define EN7523_CRYPTO_RST 23 +#define EN7523_TIMER_RST 24 +#define EN7523_PCM1_RST 25 +#define EN7523_UART_RST 26 +#define EN7523_GPIO_RST 27 +#define EN7523_GDMA_RST 28 +#define EN7523_I2C_MASTER_RST 29 +#define EN7523_PCM2_ZSI_ISI_RST 30 +#define EN7523_SFC_RST 31 +#define EN7523_UART2_RST 32 +#define EN7523_GDMP_RST 33 +#define EN7523_FE_RST 34 +#define EN7523_USB_HOST_P0_RST 35 +#define EN7523_GSW_RST 36 +#define EN7523_SFC2_PCM_RST 37 +#define EN7523_PCIE0_RST 38 +#define EN7523_PCIE1_RST 39 +#define EN7523_PCIE_HB_RST 40 +#define EN7523_XPON_MAC_RST 41 + +#endif /* __DT_BINDINGS_RESET_CONTROLLER_AIROHA_EN7523_H_ */ -- cgit v1.2.3 From a97fbc3ee3e2a536fafaff04f21f45472db71769 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 29 Oct 2025 17:33:30 +0100 Subject: syscore: Pass context data to callbacks Several drivers can benefit from registering per-instance data along with the syscore operations. 
To achieve this, move the modifiable fields out of the syscore_ops structure and into a separate struct syscore that can be registered with the framework. Add a void * driver data field for drivers to store contextual data that will be passed to the syscore ops. Acked-by: Rafael J. Wysocki (Intel) Signed-off-by: Thierry Reding --- include/linux/syscore_ops.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/syscore_ops.h b/include/linux/syscore_ops.h index ae4d48e4c970..ac6d71be5c38 100644 --- a/include/linux/syscore_ops.h +++ b/include/linux/syscore_ops.h @@ -11,14 +11,19 @@ #include struct syscore_ops { + int (*suspend)(void *data); + void (*resume)(void *data); + void (*shutdown)(void *data); +}; + +struct syscore { struct list_head node; - int (*suspend)(void); - void (*resume)(void); - void (*shutdown)(void); + const struct syscore_ops *ops; + void *data; }; -extern void register_syscore_ops(struct syscore_ops *ops); -extern void unregister_syscore_ops(struct syscore_ops *ops); +extern void register_syscore(struct syscore *syscore); +extern void unregister_syscore(struct syscore *syscore); #ifdef CONFIG_PM_SLEEP extern int syscore_suspend(void); extern void syscore_resume(void); -- cgit v1.2.3 From 0559730b8570259ef948e9083653f8a87baba182 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Mon, 22 Sep 2025 11:43:28 +0200 Subject: pwm: Drop unused function pwm_apply_args() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function pwm_apply_args() was introduced with the concept of atomic PWM configuration and needed for drivers not using this concept yet. Now all drivers are converted accordingly and so no callers are left which allows to remove this function. 
Signed-off-by: Uwe Kleine-König Link: https://patch.msgid.link/20250922094327.1143944-2-u.kleine-koenig@baylibre.com Signed-off-by: Uwe Kleine-König --- include/linux/pwm.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/pwm.h b/include/linux/pwm.h index 549ac4aaad59..e59be4e382d1 100644 --- a/include/linux/pwm.h +++ b/include/linux/pwm.h @@ -611,39 +611,6 @@ devm_fwnode_pwm_get(struct device *dev, struct fwnode_handle *fwnode, } #endif -static inline void pwm_apply_args(struct pwm_device *pwm) -{ - struct pwm_state state = { }; - - /* - * PWM users calling pwm_apply_args() expect to have a fresh config - * where the polarity and period are set according to pwm_args info. - * The problem is, polarity can only be changed when the PWM is - * disabled. - * - * PWM drivers supporting hardware readout may declare the PWM device - * as enabled, and prevent polarity setting, which changes from the - * existing behavior, where all PWM devices are declared as disabled - * at startup (even if they are actually enabled), thus authorizing - * polarity setting. - * - * To fulfill this requirement, we apply a new state which disables - * the PWM device and set the reference period and polarity config. - * - * Note that PWM users requiring a smooth handover between the - * bootloader and the kernel (like critical regulators controlled by - * PWM devices) will have to switch to the atomic API and avoid calling - * pwm_apply_args(). 
- */ - - state.enabled = false; - state.polarity = pwm->args.polarity; - state.period = pwm->args.period; - state.usage_power = false; - - pwm_apply_might_sleep(pwm, &state); -} - struct pwm_lookup { struct list_head list; const char *provider; -- cgit v1.2.3 From 37f0c7a8df7ad719a68fa1c2dbf066cfebc391a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 14 Nov 2025 11:07:04 +0200 Subject: block-dma: properly take MMIO path In commit eadaa8b255f3 ("dma-mapping: introduce new DMA attribute to indicate MMIO memory"), DMA_ATTR_MMIO attribute was added to describe MMIO addresses, which require avoiding any memory cache flushing, as an outcome of the discussion pointed to in the Link tag below. In case of PCI_P2PDMA_MAP_THRU_HOST_BRIDGE transfer, blk-mq-dma logic treated this as regular page and relied on "struct page" DMA flow. That flow performs CPU cache flushing, which shouldn't be done here, and doesn't set IOMMU_MMIO flag in DMA-IOMMU case. As a solution, let's encode peer-to-peer transaction type in NVMe IOD flags variable and provide it to blk-mq-dma API.
Link: https://lore.kernel.org/all/f912c446-1ae9-4390-9c11-00dce7bf0fd3@arm.com/ Reviewed-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Signed-off-by: Leon Romanovsky Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 1 - include/linux/blk-integrity.h | 14 -------------- include/linux/blk-mq-dma.h | 28 +++++++++++++--------------- include/linux/blk_types.h | 2 -- 4 files changed, 13 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 3d05296a5afe..21e4652dcfd2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -13,7 +13,6 @@ enum bip_flags { BIP_CHECK_GUARD = 1 << 5, /* guard check */ BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ - BIP_P2P_DMA = 1 << 8, /* using P2P address */ BIP_MEMPOOL = 1 << 15, /* buffer backed by mempool */ }; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index c2030fd8ba0a..a6b84206eb94 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -33,14 +33,6 @@ static inline bool queue_limits_stack_integrity_bdev(struct queue_limits *t, #ifdef CONFIG_BLK_DEV_INTEGRITY int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - bio_integrity(req->bio)->bip_flags & BIP_P2P_DMA); -} - int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes); @@ -129,12 +121,6 @@ static inline int blk_rq_map_integrity_sg(struct request *q, { return 0; } -static inline bool blk_rq_integrity_dma_unmap(struct request *req, - struct device *dma_dev, struct dma_iova_state *state, - size_t mapped_len) -{ - return 
false; -} static inline int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) diff --git a/include/linux/blk-mq-dma.h b/include/linux/blk-mq-dma.h index 51829958d872..cb88fc791fbd 100644 --- a/include/linux/blk-mq-dma.h +++ b/include/linux/blk-mq-dma.h @@ -16,13 +16,13 @@ struct blk_dma_iter { /* Output address range for this iteration */ dma_addr_t addr; u32 len; + struct pci_p2pdma_map_state p2pdma; /* Status code. Only valid when blk_rq_dma_map_iter_* returned false */ blk_status_t status; /* Internal to blk_rq_dma_map_iter_* */ struct blk_map_iter iter; - struct pci_p2pdma_map_state p2pdma; }; bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, @@ -43,36 +43,34 @@ static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) } /** - * blk_dma_unmap - try to DMA unmap a request + * blk_rq_dma_unmap - try to DMA unmap a request * @req: request to unmap * @dma_dev: device to unmap from * @state: DMA IOVA state * @mapped_len: number of bytes to unmap - * @is_p2p: true if mapped with PCI_P2PDMA_MAP_BUS_ADDR + * @map: peer-to-peer mapping type * * Returns %false if the callers need to manually unmap every DMA segment * mapped using @iter or %true if no work is left to be done. 
*/ -static inline bool blk_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len, bool is_p2p) +static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, + struct dma_iova_state *state, size_t mapped_len, + enum pci_p2pdma_map_type map) { - if (is_p2p) + if (map == PCI_P2PDMA_MAP_BUS_ADDR) return true; if (dma_use_iova(state)) { + unsigned int attrs = 0; + + if (map == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE) + attrs |= DMA_ATTR_MMIO; + dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req), - 0); + attrs); return true; } return !dma_need_unmap(dma_dev); } - -static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, - struct dma_iova_state *state, size_t mapped_len) -{ - return blk_dma_unmap(req, dma_dev, state, mapped_len, - req->cmd_flags & REQ_P2PDMA); -} - #endif /* BLK_MQ_DMA_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 53501ebb0623..d884cc1256ec 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -393,7 +393,6 @@ enum req_flag_bits { __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ __REQ_ATOMIC, /* for atomic write operations */ - __REQ_P2PDMA, /* contains P2P DMA pages */ /* * Command specific flags, keep last: */ @@ -426,7 +425,6 @@ enum req_flag_bits { #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) #define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) -#define REQ_P2PDMA (__force blk_opf_t)(1ULL << __REQ_P2PDMA) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) -- cgit v1.2.3 From cefd55bd2159f427228d44864747243946296739 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 11 Nov 2025 22:29:44 +0100 Subject: nsproxy: fix free_nsproxy() and simplify create_new_namespaces() Make it possible to handle NULL being passed to the reference count helpers instead of forcing the 
caller to handle this. Afterwards we can nicely allow a cleanup guard to handle nsproxy freeing. Active reference count handling is not done in nsproxy_free() but rather in free_nsproxy() as nsproxy_free() is also called from setns() failure paths where a new nsproxy has been prepared but has not been marked as active via switch_task_namespaces(). Link: https://lore.kernel.org/690bfb9e.050a0220.2e3c35.0013.GAE@google.com Link: https://patch.msgid.link/20251111-sakralbau-guthaben-7dcc277d337f@brauner Fixes: 3c9820d5c64a ("ns: add active reference count") Reported-by: syzbot+0b2e79f91ff6579bfa5b@syzkaller.appspotmail.com Reported-by: syzbot+0a8655a80e189278487e@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/ns_common.h | 11 +++++++---- include/linux/nsproxy.h | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h index 136f6a322e53..825f5865bfc5 100644 --- a/include/linux/ns_common.h +++ b/include/linux/ns_common.h @@ -114,11 +114,14 @@ static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common } #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) -#define ns_ref_inc(__ns) __ns_ref_inc(to_ns_common((__ns))) -#define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) -#define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) +#define ns_ref_inc(__ns) \ + do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) +#define ns_ref_get(__ns) \ + ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) +#define ns_ref_put(__ns) \ + ((__ns) ? __ns_ref_put(to_ns_common((__ns))) : false) #define ns_ref_put_and_lock(__ns, __ns_lock) \ - __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) + ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) #define ns_ref_active_read(__ns) \ ((__ns) ? 
__ns_ref_active_read(to_ns_common(__ns)) : 0) diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index ac825eddec59..5a67648721c7 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -99,7 +99,7 @@ void get_cred_namespaces(struct task_struct *tsk); void exit_cred_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); -void free_nsproxy(struct nsproxy *ns); +void deactivate_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); @@ -107,7 +107,7 @@ int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) - free_nsproxy(ns); + deactivate_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) -- cgit v1.2.3 From 4037d966f034ba5da2872c413b2ec17eca867e68 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:25 +1100 Subject: VFS: introduce start_dirop() and end_dirop() The fact that directory operations (create,remove,rename) are protected by a lock on the parent is known widely throughout the kernel. In order to change this - to instead lock the target dentry - it is best to centralise this knowledge so it can be changed in one place. This patch introduces start_dirop() which is local to VFS code. It performs the required locking for create and remove. Rename will be handled separately. Various functions with names like start_creating() or start_removing_path(), some of which already exist, will export this functionality beyond the VFS. end_dirop() is the partner of start_dirop(). It drops the lock and releases the reference on the dentry. It *is* exported so that various end_creating etc functions can be inline. As vfs_mkdir() drops the dentry on error we cannot use end_dirop() as that won't unlock when the dentry IS_ERR(). 
For now we need an explicit unlock when dentry IS_ERR(). I hope to change vfs_mkdir() to unlock when it drops a dentry so that explicit unlock can go away. end_dirop() can always be called on the result of start_dirop(), but not after vfs_mkdir(). After a vfs_mkdir() we still may need the explicit unlock as seen in end_creating_path(). As well as adding start_dirop() and end_dirop() this patch uses them in: - simple_start_creating (which requires sharing lookup_noperm_common() with libfs.c) - start_removing_path / start_removing_user_path_at - filename_create / end_creating_path() - do_rmdir(), do_unlinkat() Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-3-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..f4543612ef1e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3609,6 +3609,8 @@ extern void iterate_supers_type(struct file_system_type *, void filesystems_freeze(void); void filesystems_thaw(void); +void end_dirop(struct dentry *de); + extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); -- cgit v1.2.3 From 7ab96df840e60eb933abfe65fc5fe44e72f16dc0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:27 +1100 Subject: VFS/nfsd/cachefiles/ovl: add start_creating() and end_creating() start_creating() is similar to simple_start_creating() but is not so simple. It takes a qstr for the name, includes permission checking, and does NOT report an error if the name already exists, returning a positive dentry instead. This is currently used by nfsd, cachefiles, and overlayfs. end_creating() is called after the dentry has been used. 
end_creating() drops the reference to the dentry as it is generally no longer needed. This is exactly the first section of end_creating_path() so that function is changed to call the new end_creating() These calls help encapsulate locking rules so that directory locking can be changed. Occasionally this change means that the parent lock is held for a shorter period of time, for example in cachefiles_commit_tmpfile(). As this function now unlocks after an unlink and before the following lookup, it is possible that the lookup could again find a positive dentry, so a while loop is introduced there. In overlayfs the ovl_lookup_temp() function has ovl_tempname() split out to be used in ovl_start_creating_temp(). The other use of ovl_lookup_temp() is preparing for a rename. When rename handling is updated, ovl_lookup_temp() will be removed. Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-5-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index fed86221c69c..3f92c1a16878 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -88,6 +88,39 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct qstr *name, struct dentry *base); +struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); + +/** + * end_creating - finish action started with start_creating + * @child: dentry returned by start_creating() or vfs_mkdir() + * @parent: dentry given to start_creating(), + * + * Unlock and release the child. + * + * Unlike end_dirop() this can only be called if start_creating() succeeded. 
+ * It handles @child being an error as vfs_mkdir() might have converted the + * dentry to an error - in that case the parent still needs to be unlocked. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. Even if vfs_mkdir() returns an error + * it must be given to end_creating(). + * + * If vfs_mkdir() was not called, then @child will be a valid dentry and + * @parent will be ignored. + */ +static inline void end_creating(struct dentry *child, struct dentry *parent) +{ + if (IS_ERR(child)) + /* The parent is still locked despite the error from + * vfs_mkdir() - must unlock it. + */ + inode_unlock(parent->d_inode); + else + end_dirop(child); +} + extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -- cgit v1.2.3 From bd6ede8a06e89ca5a94a8b51cea792705d1b8ca2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:28 +1100 Subject: VFS/nfsd/cachefiles/ovl: introduce start_removing() and end_removing() start_removing() is similar to start_creating() but will only return a positive dentry with the expectation that it will be removed. This is used by nfsd, cachefiles, and overlayfs. They are changed to also use end_removing() to terminate the action begun by start_removing(). This is a simple alias for end_dirop(). Apart from changes to the error paths, as we no longer need to unlock on a lookup error, an effect on callers is that they don't need to test if the found dentry is positive or negative - they can be sure it is positive.
Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-6-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 3f92c1a16878..9ee76e88f3dd 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -90,6 +90,8 @@ struct dentry *lookup_one_positive_killable(struct mnt_idmap *idmap, struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); +struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, + struct qstr *name); /** * end_creating - finish action started with start_creating @@ -121,6 +123,22 @@ static inline void end_creating(struct dentry *child, struct dentry *parent) end_dirop(child); } +/** + * end_removing - finish action started with start_removing + * @child: dentry returned by start_removing() + * @parent: dentry given to start_removing() + * + * Unlock and release the child. + * + * This is identical to end_dirop(). It can be passed the result of + * start_removing() whether that was successful or not, but it is not needed + * if start_removing() failed. + */ +static inline void end_removing(struct dentry *child) +{ + end_dirop(child); +} + extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); extern int follow_up(struct path *); -- cgit v1.2.3 From c9ba789dad15ba65662bba17595c0aeaa0cfcf1c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:29 +1100 Subject: VFS: introduce start_creating_noperm() and start_removing_noperm() xfs, fuse, ipc/mqueue need variants of start_creating or start_removing which do not check permissions. This patch adds _noperm versions of these functions.
Note that do_mq_open() was only calling mntget() so it could call path_put() - it didn't really need an extra reference on the mnt. Now it doesn't call mntget() and uses end_creating() which does the dput() half of path_put(). Also mq_unlink() previously passed d_inode(dentry->d_parent) as the dir inode to vfs_unlink(). This is after locking d_inode(mnt->mnt_root). These two inodes are the same, but normally calls use the textual parent. So I've changed the vfs_unlink() call to be given d_inode(mnt->mnt_root). Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown -- changes since v2: - dir arg passed to vfs_unlink() in mq_unlink() changed to match the dir passed to lookup_noperm() - restore assignment to path->mnt even though the mntget() is removed. Link: https://patch.msgid.link/20251113002050.676694-7-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 9ee76e88f3dd..688e157d6afc 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -92,6 +92,8 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); +struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); /** * end_creating - finish action started with start_creating -- cgit v1.2.3 From 7bb1eb45e43c4730cbc5a48b9e9295049fccdacb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:31 +1100 Subject: VFS: introduce start_removing_dentry() start_removing_dentry() is similar to start_removing() but instead of providing a name for lookup, the target dentry is given.
start_removing_dentry() checks that the dentry is still hashed and in the parent, and if so it locks and increases the refcount so that end_removing() can be used to finish the operation. This is used in cachefiles, overlayfs, smb/server, and apparmor. There will be other users including ecryptfs. As start_removing_dentry() takes an extra reference to the dentry (to be put by end_removing()), there is no need to explicitly take an extra reference to stop d_delete() from using dentry_unlink_inode() to negate the dentry - as in cachefiles_delete_object(), and ksmbd_vfs_unlink(). cachefiles_bury_object() now gets an extra ref to the victim, which it drops. As it includes the needed end_removing() calls, the caller doesn't need them. Reviewed-by: Amir Goldstein Reviewed-by: Namjae Jeon Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-9-neilb@ownmail.net Signed-off-by: Christian Brauner --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 688e157d6afc..7e916e9d7726 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -94,6 +94,8 @@ struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_removing_dentry(struct dentry *parent, + struct dentry *child); /** * end_creating - finish action started with start_creating -- cgit v1.2.3 From ff7c4ea11a05c886f018fff4a4d4f4d68d951e25 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:32 +1100 Subject: VFS: add start_creating_killable() and start_removing_killable() These are similar to start_creating() and start_removing(), but allow a fatal signal to abort waiting for the lock.
They are used in btrfs for subvol creation and removal. btrfs_may_create() no longer needs IS_DEADDIR() and start_creating_killable() includes that check. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-10-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 7e916e9d7726..e5cff89679df 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -92,6 +92,12 @@ struct dentry *start_creating(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); struct dentry *start_removing(struct mnt_idmap *idmap, struct dentry *parent, struct qstr *name); +struct dentry *start_creating_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); +struct dentry *start_removing_killable(struct mnt_idmap *idmap, + struct dentry *parent, + struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_dentry(struct dentry *parent, -- cgit v1.2.3 From 5c8752729970cc2323ba86817254749f7f21f163 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:33 +1100 Subject: VFS/nfsd/ovl: introduce start_renaming() and end_renaming() start_renaming() combines name lookup and locking to prepare for rename. It is used when two names need to be looked up as in nfsd and overlayfs - cases where one or both dentries are already available will be handled separately. __start_renaming() avoids the inode_permission check and hash calculation and is suitable after filename_parentat() in do_renameat2(). It subsumes quite a bit of code from that function. 
start_renaming() does calculate the hash and check X permission and is suitable elsewhere: - nfsd_rename() - ovl_rename() In ovl, ovl_do_rename_rd() is factored out of ovl_do_rename(), which itself will be gone by the end of the series. Acked-by: Chuck Lever (for nfsd parts) Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Signed-off-by: NeilBrown -- Changes since v3: - added missing dput() in ovl_rename when "whiteout" is not-NULL. Changes since v2: - in __start_renaming() some labels have been renamed, and err is always set before a "goto out_foo" rather than passing the error in a dentry*. - ovl_do_rename() changed to call the new ovl_do_rename_rd() rather than keeping duplicate code - code around ovl_cleanup() call in ovl_rename() restructured. Link: https://patch.msgid.link/20251113002050.676694-11-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Acked-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Christian Brauner --- include/linux/namei.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index e5cff89679df..19c3d8e336d5 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -156,6 +156,9 @@ extern int follow_up(struct path *); extern struct dentry *lock_rename(struct dentry *, struct dentry *); extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); +int start_renaming(struct renamedata *rd, int lookup_flags, + struct qstr *old_last, struct qstr *new_last); +void end_renaming(struct renamedata *rd); /** * mode_strip_umask - handle vfs umask stripping -- cgit v1.2.3 From ac50950ca143fd637dec4f7457a9162e1a4344e8 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:34 +1100 Subject: VFS/ovl/smb: introduce start_renaming_dentry() Several callers perform a rename on a dentry they already have, and only require lookup for the target name.
This includes smb/server and a few different places in overlayfs. start_renaming_dentry() performs the required lookup and takes the required lock using lock_rename_child(). It is used in three places in overlayfs and in ksmbd_vfs_rename(). In the ksmbd case, the parent of the source is not important - the source must be renamed from wherever it is. So start_renaming_dentry() allows rd->old_parent to be NULL and only checks it if it is non-NULL. On success rd->old_parent will be the parent of old_dentry with an extra reference taken. Other start_renaming functions also now take the extra reference and end_renaming() now drops this reference as well. ovl_lookup_temp(), ovl_parent_lock(), and ovl_parent_unlock() are all removed as they are no longer needed. OVL_TEMPNAME_SIZE and ovl_tempname() are now declared in overlayfs.h so that ovl_check_rename_whiteout() can access them. ovl_copy_up_workdir() now always cleans up on error. Reviewed-by: Namjae Jeon Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-12-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 19c3d8e336d5..f73001e3719a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -158,6 +158,8 @@ extern struct dentry *lock_rename_child(struct dentry *, struct dentry *); extern void unlock_rename(struct dentry *, struct dentry *); int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); +int start_renaming_dentry(struct renamedata *rd, int lookup_flags, + struct dentry *old_dentry, struct qstr *new_last); void end_renaming(struct renamedata *rd); /** -- cgit v1.2.3 From 833d2b3a072f7ff6005bf84c065c7cbda81a8aaa Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:35
+1100 Subject: Add start_renaming_two_dentries() A few callers want to lock for a rename and already have both dentries. Also debugfs does want to perform a lookup but doesn't want permission checking, so start_renaming_dentry() cannot be used. This patch introduces start_renaming_two_dentries() which is given both dentries. debugfs performs one lookup itself. As it will only continue with a negative dentry and as those cannot be renamed or unlinked, it is safe to do the lookup before getting the rename locks. overlayfs uses start_renaming_two_dentries() in three places and selinux uses it twice in sel_make_policy_nodes(). In sel_make_policy_nodes() we now lock for rename twice instead of just once so the combined operation is no longer atomic w.r.t the parent directory locks. As selinux_state.policy_mutex is held across the whole operation this does not open up any interesting races. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-13-neilb@ownmail.net Signed-off-by: Christian Brauner --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index f73001e3719a..a99ac8b7e24a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -160,6 +160,8 @@ int start_renaming(struct renamedata *rd, int lookup_flags, struct qstr *old_last, struct qstr *new_last); int start_renaming_dentry(struct renamedata *rd, int lookup_flags, struct dentry *old_dentry, struct qstr *new_last); +int start_renaming_two_dentries(struct renamedata *rd, + struct dentry *old_dentry, struct dentry *new_dentry); void end_renaming(struct renamedata *rd); /** -- cgit v1.2.3 From f046fbb4d81d1b0c4a169707411e3cd540c03354 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:36 +1100 Subject: ecryptfs: use new start_creating/start_removing APIs This requires the addition of start_creating_dentry() which is 
given the dentry which has already been found, and asks for it to be locked and its parent validated. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-14-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index a99ac8b7e24a..208aed1d6728 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -100,6 +100,8 @@ struct dentry *start_removing_killable(struct mnt_idmap *idmap, struct qstr *name); struct dentry *start_creating_noperm(struct dentry *parent, struct qstr *name); struct dentry *start_removing_noperm(struct dentry *parent, struct qstr *name); +struct dentry *start_creating_dentry(struct dentry *parent, + struct dentry *child); struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child); -- cgit v1.2.3 From fe497f0759e0efb949f9480911d00b6045c21f50 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:37 +1100 Subject: VFS: change vfs_mkdir() to unlock on failure. vfs_mkdir() already drops the reference to the dentry on failure but it leaves the parent locked. This complicates end_creating() which needs to unlock the parent even though the dentry is no longer available. If we change vfs_mkdir() to unlock on failure as well as releasing the dentry, we can remove the "parent" arg from end_creating() and simplify the rules for calling it. Note that cachefiles_get_directory() can choose to substitute an error instead of actually calling vfs_mkdir(), for fault injection. In that case it needs to call end_creating(), just as vfs_mkdir() now does on error. ovl_create_real() will now unlock on error. So the conditional end_creating() after the call is removed, and end_creating() is called internally on error. 
Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-15-neilb@ownmail.net Signed-off-by: Christian Brauner --- include/linux/namei.h | 28 +++++++++------------------- 1 file changed, 9 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 208aed1d6728..0ef73d739a31 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -105,34 +105,24 @@ struct dentry *start_creating_dentry(struct dentry *parent, struct dentry *start_removing_dentry(struct dentry *parent, struct dentry *child); -/** - * end_creating - finish action started with start_creating - * @child: dentry returned by start_creating() or vfs_mkdir() - * @parent: dentry given to start_creating(), - * - * Unlock and release the child. +/* end_creating - finish action started with start_creating + * @child: dentry returned by start_creating() or vfs_mkdir() * - * Unlike end_dirop() this can only be called if start_creating() succeeded. - * It handles @child being and error as vfs_mkdir() might have converted the - * dentry to an error - in that case the parent still needs to be unlocked. + * Unlock and release the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. * * If vfs_mkdir() was called then the value returned from that function * should be given for @child rather than the original dentry, as vfs_mkdir() - * may have provided a new dentry. Even if vfs_mkdir() returns an error - * it must be given to end_creating(). + * may have provided a new dentry. + * * * If vfs_mkdir() was not called, then @child will be a valid dentry and * @parent will be ignored. 
*/ -static inline void end_creating(struct dentry *child, struct dentry *parent) +static inline void end_creating(struct dentry *child) { - if (IS_ERR(child)) - /* The parent is still locked despite the error from - * vfs_mkdir() - must unlock it. - */ - inode_unlock(parent->d_inode); - else - end_dirop(child); + end_dirop(child); } /** -- cgit v1.2.3 From cf296b294c3bd8f7db229060efe677dfd49e46b6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Nov 2025 11:18:38 +1100 Subject: VFS: introduce end_creating_keep() Occasionally the caller of end_creating() wants to keep using the dentry. Rather than requiring them to dget() the dentry (when not an error) before calling end_creating(), provide end_creating_keep() which does this. cachefiles and overlayfs make use of this. Reviewed-by: Amir Goldstein Reviewed-by: Jeff Layton Signed-off-by: NeilBrown Link: https://patch.msgid.link/20251113002050.676694-16-neilb@ownmail.net Tested-by: syzbot@syzkaller.appspotmail.com Signed-off-by: Christian Brauner --- include/linux/namei.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/namei.h b/include/linux/namei.h index 0ef73d739a31..3d82c6a19197 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -125,6 +125,28 @@ static inline void end_creating(struct dentry *child) end_dirop(child); } +/* end_creating_keep - finish action started with start_creating() and return result + * @child: dentry returned by start_creating() or vfs_mkdir() + * + * Unlock and return the child. This can be called after + * start_creating() whether that function succeeded or not, + * but it is not needed on failure. + * + * If vfs_mkdir() was called then the value returned from that function + * should be given for @child rather than the original dentry, as vfs_mkdir() + * may have provided a new dentry. + * + * Returns: @child, which may be a dentry or an error.
+ * + */ +static inline struct dentry *end_creating_keep(struct dentry *child) +{ + if (!IS_ERR(child)) + dget(child); + end_dirop(child); + return child; +} + /** * end_removing - finish action started with start_removing * @child: dentry returned by start_removing() -- cgit v1.2.3 From e36dbd1cf3dfc4ce18e9f7a80183b53cae257e30 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Fri, 4 Jul 2025 17:43:19 +0200 Subject: media: uapi: Introduce V4L2 generic ISP types Introduce v4l2-isp.h in the Linux kernel uAPI. The header includes types for generic ISP configuration parameters and will be extended in the future with support for generic ISP statistics formats. Generic ISP parameters support is provided by introducing two new types that represent an extensible and versioned buffer of ISP configuration parameters. The v4l2_params_buffer represents the container for the ISP configuration data block. The generic type is defined with a 0-sized data member that the ISP driver implementations shall properly size according to their capabilities. The v4l2_params_block_header structure represents the header to be prepended to each ISP configuration block.
Signed-off-by: Daniel Scally Reviewed-by: Daniel Scally Reviewed-by: Laurent Pinchart Reviewed-by: Michael Riesch Acked-by: Sakari Ailus Tested-by: Lad Prabhakar Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil --- include/uapi/linux/media/v4l2-isp.h | 102 ++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 include/uapi/linux/media/v4l2-isp.h (limited to 'include') diff --git a/include/uapi/linux/media/v4l2-isp.h b/include/uapi/linux/media/v4l2-isp.h new file mode 100644 index 000000000000..779168f9058e --- /dev/null +++ b/include/uapi/linux/media/v4l2-isp.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Video4Linux2 generic ISP parameters and statistics support + * + * Copyright (C) 2025 Ideas On Board Oy + * Author: Jacopo Mondi + */ + +#ifndef _UAPI_V4L2_ISP_H_ +#define _UAPI_V4L2_ISP_H_ + +#include +#include + +/** + * enum v4l2_isp_params_version - V4L2 ISP parameters versioning + * + * @V4L2_ISP_PARAMS_VERSION_V0: First version of the V4L2 ISP parameters format + * (for compatibility) + * @V4L2_ISP_PARAMS_VERSION_V1: First version of the V4L2 ISP parameters format + * + * V0 and V1 are identical in order to support drivers compatible with the V4L2 + * ISP parameters format already upstreamed which use either 0 or 1 as their + * versioning identifier. Both V0 and V1 refer to the first version of the + * V4L2 ISP parameters format. + * + * Future revisions of the V4L2 ISP parameters format should start from the + * value of 2. + */ +enum v4l2_isp_params_version { + V4L2_ISP_PARAMS_VERSION_V0 = 0, + V4L2_ISP_PARAMS_VERSION_V1 +}; + +#define V4L2_ISP_PARAMS_FL_BLOCK_DISABLE (1U << 0) +#define V4L2_ISP_PARAMS_FL_BLOCK_ENABLE (1U << 1) + +/* + * Reserve the first 8 bits for V4L2_ISP_PARAMS_FL_* flags.
+ * + * Driver-specific flags should be defined as: + * #define DRIVER_SPECIFIC_FLAG0 (1U << V4L2_ISP_PARAMS_FL_DRIVER_FLAGS(0)) + * #define DRIVER_SPECIFIC_FLAG1 (1U << V4L2_ISP_PARAMS_FL_DRIVER_FLAGS(1)) + */ +#define V4L2_ISP_PARAMS_FL_DRIVER_FLAGS(n) ((n) + 8) + +/** + * struct v4l2_isp_params_block_header - V4L2 extensible parameters block header + * @type: The parameters block type (driver-specific) + * @flags: A bitmask of block flags (driver-specific) + * @size: Size (in bytes) of the parameters block, including this header + * + * This structure represents the common part of all the ISP configuration + * blocks. Each parameters block shall embed an instance of this structure type + * as its first member, followed by the block-specific configuration data. + * + * The @type field is an ISP driver-specific value that identifies the block + * type. The @size field specifies the size of the parameters block. + * + * The @flags field is a bitmask of per-block flags V4L2_ISP_PARAMS_FL_* and + * driver-specific flags specified by the driver header. + */ +struct v4l2_isp_params_block_header { + __u16 type; + __u16 flags; + __u32 size; +} __attribute__((aligned(8))); + +/** + * struct v4l2_isp_params_buffer - V4L2 extensible parameters configuration + * @version: The parameters buffer version (driver-specific) + * @data_size: The configuration data effective size, excluding this header + * @data: The configuration data + * + * This structure contains the configuration parameters of the ISP algorithms, + * serialized by userspace into a data buffer. Each configuration parameter + * block is represented by a block-specific structure which contains a + * :c:type:`v4l2_isp_params_block_header` entry as first member. Userspace + * populates the @data buffer with configuration parameters for the blocks that + * it intends to configure.
As a consequence, the data buffer effective size + * changes according to the number of ISP blocks that userspace intends to + * configure and is set by userspace in the @data_size field. + * + * The parameters buffer is versioned by the @version field to allow modifying + * and extending its definition. Userspace shall populate the @version field to + * inform the driver about the version it intends to use. The driver will parse + * and handle the @data buffer according to the data layout specific to the + * indicated version and return an error if the desired version is not + * supported. + * + * For each ISP block that userspace wants to configure, a block-specific + * structure is appended to the @data buffer, one after the other without gaps + * in between. Userspace shall populate the @data_size field with the effective + * size, in bytes, of the @data buffer. + */ +struct v4l2_isp_params_buffer { + __u32 version; + __u32 data_size; + __u8 data[] __counted_by(data_size); +}; + +#endif /* _UAPI_V4L2_ISP_H_ */ -- cgit v1.2.3 From 1e8152db64bdee9f13e84e516c2b8a9bb10f025e Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Mon, 7 Jul 2025 14:13:53 +0200 Subject: media: uapi: Convert RkISP1 to V4L2 extensible params With the introduction of common types for extensible parameters format, convert the rkisp1-config.h header to use the new types. Factor out the documentation that is now part of the common header and only keep the driver-specific one in place. The conversion to use common types doesn't impact userspace as the new types are either identical to the ones already existing in the RkISP1 uAPI or are 1-to-1 type convertible.
Reviewed-by: Daniel Scally Reviewed-by: Laurent Pinchart Reviewed-by: Michael Riesch Acked-by: Sakari Ailus Tested-by: Lad Prabhakar Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil --- include/uapi/linux/rkisp1-config.h | 107 +++++++++---------------------------- 1 file changed, 24 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/rkisp1-config.h b/include/uapi/linux/rkisp1-config.h index 3b060ea6eed7..b2d2a71f7baf 100644 --- a/include/uapi/linux/rkisp1-config.h +++ b/include/uapi/linux/rkisp1-config.h @@ -7,8 +7,13 @@ #ifndef _UAPI_RKISP1_CONFIG_H #define _UAPI_RKISP1_CONFIG_H +#ifdef __KERNEL__ +#include +#endif /* __KERNEL__ */ #include +#include + /* Defect Pixel Cluster Detection */ #define RKISP1_CIF_ISP_MODULE_DPCC (1U << 0) /* Black Level Subtraction */ @@ -1158,79 +1163,26 @@ enum rkisp1_ext_params_block_type { RKISP1_EXT_PARAMS_BLOCK_TYPE_WDR, }; -#define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE (1U << 0) -#define RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE (1U << 1) +/* For backward compatibility */ +#define RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE V4L2_ISP_PARAMS_FL_BLOCK_DISABLE +#define RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE V4L2_ISP_PARAMS_FL_BLOCK_ENABLE /* A bitmask of parameters blocks supported on the current hardware. */ #define RKISP1_CID_SUPPORTED_PARAMS_BLOCKS (V4L2_CID_USER_RKISP1_BASE + 0x01) /** - * struct rkisp1_ext_params_block_header - RkISP1 extensible parameters block - * header + * rkisp1_ext_params_block_header - RkISP1 extensible parameters block header * * This structure represents the common part of all the ISP configuration - * blocks. Each parameters block shall embed an instance of this structure type - * as its first member, followed by the block-specific configuration data. The - * driver inspects this common header to discern the block type and its size and - * properly handle the block content by casting it to the correct block-specific - * type. 
+ * blocks and is identical to :c:type:`v4l2_isp_params_block_header`. * - * The @type field is one of the values enumerated by + * The type field is one of the values enumerated by * :c:type:`rkisp1_ext_params_block_type` and specifies how the data should be - * interpreted by the driver. The @size field specifies the size of the - * parameters block and is used by the driver for validation purposes. - * - * The @flags field is a bitmask of per-block flags RKISP1_EXT_PARAMS_FL_*. - * - * When userspace wants to configure and enable an ISP block it shall fully - * populate the block configuration and set the - * RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE bit in the @flags field. - * - * When userspace simply wants to disable an ISP block the - * RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE bit should be set in @flags field. The - * driver ignores the rest of the block configuration structure in this case. - * - * If a new configuration of an ISP block has to be applied userspace shall - * fully populate the ISP block configuration and omit setting the - * RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE and RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE bits - * in the @flags field. - * - * Setting both the RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE and - * RKISP1_EXT_PARAMS_FL_BLOCK_DISABLE bits in the @flags field is not allowed - * and not accepted by the driver. - * - * Userspace is responsible for correctly populating the parameters block header - * fields (@type, @flags and @size) and the block-specific parameters. - * - * For example: + * interpreted by the driver. * - * .. 
code-block:: c - * - * void populate_bls(struct rkisp1_ext_params_block_header *block) { - * struct rkisp1_ext_params_bls_config *bls = - * (struct rkisp1_ext_params_bls_config *)block; - * - * bls->header.type = RKISP1_EXT_PARAMS_BLOCK_ID_BLS; - * bls->header.flags = RKISP1_EXT_PARAMS_FL_BLOCK_ENABLE; - * bls->header.size = sizeof(*bls); - * - * bls->config.enable_auto = 0; - * bls->config.fixed_val.r = blackLevelRed_; - * bls->config.fixed_val.gr = blackLevelGreenR_; - * bls->config.fixed_val.gb = blackLevelGreenB_; - * bls->config.fixed_val.b = blackLevelBlue_; - * } - * - * @type: The parameters block type, see - * :c:type:`rkisp1_ext_params_block_type` - * @flags: A bitmask of block flags - * @size: Size (in bytes) of the parameters block, including this header + * The flags field is a bitmask of per-block flags RKISP1_EXT_PARAMS_FL_*. */ -struct rkisp1_ext_params_block_header { - __u16 type; - __u16 flags; - __u32 size; -}; +#define rkisp1_ext_params_block_header v4l2_isp_params_block_header /** * struct rkisp1_ext_params_bls_config - RkISP1 extensible params BLS config @@ -1588,27 +1540,14 @@ struct rkisp1_ext_params_wdr_config { * @RKISP1_EXT_PARAM_BUFFER_V1: First version of RkISP1 extensible parameters */ enum rksip1_ext_param_buffer_version { - RKISP1_EXT_PARAM_BUFFER_V1 = 1, + RKISP1_EXT_PARAM_BUFFER_V1 = V4L2_ISP_PARAMS_VERSION_V1, }; /** * struct rkisp1_ext_params_cfg - RkISP1 extensible parameters configuration * - * This struct contains the configuration parameters of the RkISP1 ISP - * algorithms, serialized by userspace into a data buffer. Each configuration - * parameter block is represented by a block-specific structure which contains a - * :c:type:`rkisp1_ext_params_block_header` entry as first member. Userspace - * populates the @data buffer with configuration parameters for the blocks that - * it intends to configure. 
As a consequence, the data buffer effective size - * changes according to the number of ISP blocks that userspace intends to - * configure and is set by userspace in the @data_size field. - * - * The parameters buffer is versioned by the @version field to allow modifying - * and extending its definition. Userspace shall populate the @version field to - * inform the driver about the version it intends to use. The driver will parse - * and handle the @data buffer according to the data layout specific to the - * indicated version and return an error if the desired version is not - * supported. + * This is the driver-specific implementation of + * :c:type:`v4l2_isp_params_buffer`. * * Currently the single RKISP1_EXT_PARAM_BUFFER_V1 version is supported. * When a new format version will be added, a mechanism for userspace to query @@ -1624,11 +1563,6 @@ enum rksip1_ext_param_buffer_version { * the maximum value represents the blocks supported by the kernel driver, * independently of the device instance. * - * For each ISP block that userspace wants to configure, a block-specific - * structure is appended to the @data buffer, one after the other without gaps - * in between nor overlaps. Userspace shall populate the @data_size field with - * the effective size, in bytes, of the @data buffer. 
+ * * The expected memory layout of the parameters buffer is:: * * +-------------------- struct rkisp1_ext_params_cfg -------------------+ @@ -1678,4 +1612,11 @@ struct rkisp1_ext_params_cfg { __u8 data[RKISP1_EXT_PARAMS_MAX_SIZE]; }; +#ifdef __KERNEL__ +/* Make sure the header is type-convertible to the generic v4l2 params one */ +static_assert((sizeof(struct rkisp1_ext_params_cfg) - + RKISP1_EXT_PARAMS_MAX_SIZE) == + sizeof(struct v4l2_isp_params_buffer)); +#endif /* __KERNEL__ */ + #endif /* _UAPI_RKISP1_CONFIG_H */ -- cgit v1.2.3 From 45662082855c6acd1719c11e077388cbccf3baf2 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Mon, 7 Jul 2025 14:18:52 +0200 Subject: media: uapi: Convert Amlogic C3 to V4L2 extensible params With the introduction of common types for extensible parameters format, convert the c3-isp-config.h header to use the new types. Factor-out the documentation that is now part of the common header and only keep the driver-specific one in place. The conversion to use common types doesn't impact userspace as the new types are either identical to the ones already existing in the C3 ISP uAPI or are 1-to-1 type convertible.
Reviewed-by: Daniel Scally Reviewed-by: Keke Li Reviewed-by: Laurent Pinchart Acked-by: Sakari Ailus Tested-by: Lad Prabhakar Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil --- include/uapi/linux/media/amlogic/c3-isp-config.h | 92 +++++++----------------- 1 file changed, 24 insertions(+), 68 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/media/amlogic/c3-isp-config.h b/include/uapi/linux/media/amlogic/c3-isp-config.h index ed085ea62a57..0a3c1cc55ccb 100644 --- a/include/uapi/linux/media/amlogic/c3-isp-config.h +++ b/include/uapi/linux/media/amlogic/c3-isp-config.h @@ -6,8 +6,13 @@ #ifndef _UAPI_C3_ISP_CONFIG_H_ #define _UAPI_C3_ISP_CONFIG_H_ +#ifdef __KERNEL__ +#include +#endif /* __KERNEL__ */ #include +#include + /* * Frames are split into zones of almost equal width and height - a zone is a * rectangular tile of a frame. The metering blocks within the ISP collect @@ -141,7 +146,7 @@ struct c3_isp_stats_info { * @C3_ISP_PARAMS_BUFFER_V0: First version of C3 ISP parameters block */ enum c3_isp_params_buffer_version { - C3_ISP_PARAMS_BUFFER_V0, + C3_ISP_PARAMS_BUFFER_V0 = V4L2_ISP_PARAMS_VERSION_V0, }; /** @@ -176,62 +181,23 @@ enum c3_isp_params_block_type { C3_ISP_PARAMS_BLOCK_SENTINEL }; -#define C3_ISP_PARAMS_BLOCK_FL_DISABLE (1U << 0) -#define C3_ISP_PARAMS_BLOCK_FL_ENABLE (1U << 1) +/* For backward compatibility */ +#define C3_ISP_PARAMS_BLOCK_FL_DISABLE V4L2_ISP_PARAMS_FL_BLOCK_DISABLE +#define C3_ISP_PARAMS_BLOCK_FL_ENABLE V4L2_ISP_PARAMS_FL_BLOCK_ENABLE /** * struct c3_isp_params_block_header - C3 ISP parameter block header * * This structure represents the common part of all the ISP configuration - * blocks. Each parameters block shall embed an instance of this structure type - * as its first member, followed by the block-specific configuration data. The - * driver inspects this common header to discern the block type and its size and - * properly handle the block content by casting it to the correct block-specific - * type. 
+ * blocks and is identical to :c:type:`v4l2_isp_params_block_header`. * - * The @type field is one of the values enumerated by + * The type field is one of the values enumerated by * :c:type:`c3_isp_params_block_type` and specifies how the data should be - * interpreted by the driver. The @size field specifies the size of the - * parameters block and is used by the driver for validation purposes. The - * @flags field is a bitmask of per-block flags C3_ISP_PARAMS_FL*. - * - * When userspace wants to disable an ISP block the - * C3_ISP_PARAMS_BLOCK_FL_DISABLED bit should be set in the @flags field. In - * this case userspace may optionally omit the remainder of the configuration - * block, which will be ignored by the driver. - * - * When a new configuration of an ISP block needs to be applied userspace - * shall fully populate the ISP block and omit setting the - * C3_ISP_PARAMS_BLOCK_FL_DISABLED bit in the @flags field. - * - * Userspace is responsible for correctly populating the parameters block header - * fields (@type, @flags and @size) and the block-specific parameters. - * - * For example: - * - * .. code-block:: c + * interpreted by the driver. * - * void populate_pst_gamma(struct c3_isp_params_block_header *block) { - * struct c3_isp_params_pst_gamma *gamma = - * (struct c3_isp_params_pst_gamma *)block; - * - * gamma->header.type = C3_ISP_PARAMS_BLOCK_PST_GAMMA; - * gamma->header.flags = C3_ISP_PARAMS_BLOCK_FL_ENABLE; - * gamma->header.size = sizeof(*gamma); - * - * for (unsigned int i = 0; i < 129; i++) - * gamma->pst_gamma_lut[i] = i; - * } - * - * @type: The parameters block type from :c:type:`c3_isp_params_block_type` - * @flags: A bitmask of block flags - * @size: Size (in bytes) of the parameters block, including this header + * The flags field is a bitmask of per-block flags C3_ISP_PARAMS_FL_*. 
*/ -struct c3_isp_params_block_header { - __u16 type; - __u16 flags; - __u32 size; -}; +#define c3_isp_params_block_header v4l2_isp_params_block_header /** * struct c3_isp_params_awb_gains - Gains for auto-white balance @@ -498,26 +464,10 @@ struct c3_isp_params_blc { /** * struct c3_isp_params_cfg - C3 ISP configuration parameters * - * This struct contains the configuration parameters of the C3 ISP - * algorithms, serialized by userspace into an opaque data buffer. Each - * configuration parameter block is represented by a block-specific structure - * which contains a :c:type:`c3_isp_param_block_header` entry as first - * member. Userspace populates the @data buffer with configuration parameters - * for the blocks that it intends to configure. As a consequence, the data - * buffer effective size changes according to the number of ISP blocks that - * userspace intends to configure. - * - * The parameters buffer is versioned by the @version field to allow modifying - * and extending its definition. Userspace should populate the @version field to - * inform the driver about the version it intends to use. The driver will parse - * and handle the @data buffer according to the data layout specific to the - * indicated revision and return an error if the desired revision is not - * supported. - * - * For each ISP block that userspace wants to configure, a block-specific - * structure is appended to the @data buffer, one after the other without gaps - * in between nor overlaps. Userspace shall populate the @total_size field with - * the effective size, in bytes, of the @data buffer. + * This is the driver-specific implementation of + * :c:type:`v4l2_isp_params_buffer`. + * + * Currently only C3_ISP_PARAMS_BUFFER_V0 is supported.
* * The expected memory layout of the parameters buffer is:: * @@ -561,4 +511,10 @@ struct c3_isp_params_cfg { __u8 data[C3_ISP_PARAMS_MAX_SIZE]; }; +#ifdef __KERNEL__ +/* Make sure the header is type-convertible to the generic v4l2 params one */ +static_assert((sizeof(struct c3_isp_params_cfg) - C3_ISP_PARAMS_MAX_SIZE) == + sizeof(struct v4l2_isp_params_buffer)); +#endif /* __KERNEL__ */ + #endif -- cgit v1.2.3 From 3cb6de6fafb8fca55b14313e63f13ce10ecc6fc4 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Tue, 1 Jul 2025 18:57:17 +0200 Subject: media: v4l2-core: Introduce v4l2-isp.c Add to the V4L2 framework helper functions to support drivers when validating a buffer of V4L2 ISP parameters. Drivers shall use v4l2_isp_params_validate_buffer_size() to verify the size correctness of the data received from userspace, and after having copied the data to a kernel-only memory location, complete the validation by calling v4l2_isp_params_validate_buffer(). Reviewed-by: Daniel Scally Reviewed-by: Laurent Pinchart Reviewed-by: Michael Riesch Acked-by: Sakari Ailus Tested-by: Lad Prabhakar Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil --- include/media/v4l2-isp.h | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 include/media/v4l2-isp.h (limited to 'include') diff --git a/include/media/v4l2-isp.h b/include/media/v4l2-isp.h new file mode 100644 index 000000000000..8b4695663699 --- /dev/null +++ b/include/media/v4l2-isp.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Video4Linux2 generic ISP parameters and statistics support + * + * Copyright (C) 2025 Ideas On Board Oy + * Author: Jacopo Mondi + */ + +#ifndef _V4L2_ISP_H_ +#define _V4L2_ISP_H_ + +#include + +struct device; +struct vb2_buffer; + +/** + * v4l2_isp_params_buffer_size - Calculate size of v4l2_isp_params_buffer + * @max_params_size: The total size of the ISP configuration blocks + * + * Users of the v4l2 extensible
parameters will have differing sized data arrays + * depending on their specific parameter buffers. Drivers and userspace will + * need to be able to calculate the appropriate size of the struct to + * accommodate all ISP configuration blocks provided by the platform. + * This macro provides a convenient tool for the calculation. + */ +#define v4l2_isp_params_buffer_size(max_params_size) \ + (offsetof(struct v4l2_isp_params_buffer, data) + (max_params_size)) + +/** + * v4l2_isp_params_validate_buffer_size - Validate V4L2 ISP buffer sizes + * @dev: the driver's device pointer + * @vb: the videobuf2 buffer + * @max_size: the maximum allowed buffer size + * + * This function performs validation of the size of a V4L2 ISP parameters buffer + * before the driver can access the actual data buffer content. + * + * After the size validation, drivers should copy the buffer content to a + * kernel-only memory area to prevent userspace from modifying it, + * before completing validation using v4l2_isp_params_validate_buffer(). + * + * The @vb buffer as received from the vb2 .buf_prepare() operation is checked + * against @max_size and it's validated to be large enough to accommodate at + * least one ISP configuration block. + */ +int v4l2_isp_params_validate_buffer_size(struct device *dev, + struct vb2_buffer *vb, + size_t max_size); + +/** + * struct v4l2_isp_params_block_info - V4L2 ISP per-block info + * @size: the block expected size + * + * The v4l2_isp_params_block_info collects information of the ISP configuration + * blocks for validation purposes. It currently only contains the expected + * block size. + * + * Drivers shall prepare a list of block info, indexed by block type, one for + * each supported ISP block and correctly populate them with the expected block + * size.
+ */ +struct v4l2_isp_params_block_info { + size_t size; +}; + +/** + * v4l2_isp_params_validate_buffer - Validate a V4L2 ISP parameters buffer + * @dev: the driver's device pointer + * @vb: the videobuf2 buffer + * @buffer: the V4L2 ISP parameters buffer + * @info: the list of per-block validation info + * @num_blocks: the number of blocks + * + * This function completes the validation of a V4L2 ISP parameters buffer, + * verifying each configuration block correctness before the driver can use + * them to program the hardware. + * + * Drivers should use this function after having validated the correctness of + * the vb2 buffer sizes by using the v4l2_isp_params_validate_buffer_size() + * helper first. Once the buffer size has been validated, drivers should + * perform a copy of the user provided buffer into a kernel-only memory buffer + * to prevent userspace from modifying its content after it has been submitted + * to the driver, and then call this function to complete validation. + */ +int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb, + const struct v4l2_isp_params_buffer *buffer, + const struct v4l2_isp_params_block_info *info, + size_t num_blocks); + +#endif /* _V4L2_ISP_H_ */ -- cgit v1.2.3 From ec4ac3cb7198070611987a6e91829fce0f4ce6d0 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Tue, 11 Nov 2025 16:15:45 +0000 Subject: media: uapi: Add MEDIA_BUS_FMT_RGB202020_1X60 format code The Mali-C55 ISP by ARM requires 20-bits per colour channel input on the bus. Add a new media bus format code to represent it. 
Reviewed-by: Lad Prabhakar Tested-by: Lad Prabhakar Reviewed-by: Laurent Pinchart Acked-by: Nayden Kanchev Co-developed-by: Jacopo Mondi Signed-off-by: Jacopo Mondi Signed-off-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/media-bus-format.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index ff62056feed5..62ad82fd285a 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -34,7 +34,7 @@ #define MEDIA_BUS_FMT_FIXED 0x0001 -/* RGB - next is 0x1028 */ +/* RGB - next is 0x1029 */ #define MEDIA_BUS_FMT_RGB444_1X12 0x1016 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_BE 0x1001 #define MEDIA_BUS_FMT_RGB444_2X8_PADHI_LE 0x1002 @@ -74,6 +74,7 @@ #define MEDIA_BUS_FMT_RGB888_1X36_CPADLO 0x1021 #define MEDIA_BUS_FMT_RGB121212_1X36 0x1019 #define MEDIA_BUS_FMT_RGB161616_1X48 0x101a +#define MEDIA_BUS_FMT_RGB202020_1X60 0x1028 /* YUV (including grey) - next is 0x202f */ #define MEDIA_BUS_FMT_Y8_1X8 0x2001 -- cgit v1.2.3 From 2477ab037621632c3ec167187dc9e7afac2ba7f2 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Tue, 11 Nov 2025 16:15:46 +0000 Subject: media: uapi: Add 20-bit bayer formats The Mali-C55 requires input data be in 20-bit format, MSB aligned. Add some new media bus format macros to represent that input format. 
Reviewed-by: Lad Prabhakar Tested-by: Lad Prabhakar Reviewed-by: Laurent Pinchart Co-developed-by: Jacopo Mondi Signed-off-by: Jacopo Mondi Signed-off-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/media-bus-format.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/media-bus-format.h b/include/uapi/linux/media-bus-format.h index 62ad82fd285a..6005f033e62c 100644 --- a/include/uapi/linux/media-bus-format.h +++ b/include/uapi/linux/media-bus-format.h @@ -124,7 +124,7 @@ #define MEDIA_BUS_FMT_YUV16_1X48 0x202a #define MEDIA_BUS_FMT_UYYVYY16_0_5X48 0x202b -/* Bayer - next is 0x3021 */ +/* Bayer - next is 0x3025 */ #define MEDIA_BUS_FMT_SBGGR8_1X8 0x3001 #define MEDIA_BUS_FMT_SGBRG8_1X8 0x3013 #define MEDIA_BUS_FMT_SGRBG8_1X8 0x3002 @@ -157,6 +157,10 @@ #define MEDIA_BUS_FMT_SGBRG16_1X16 0x301e #define MEDIA_BUS_FMT_SGRBG16_1X16 0x301f #define MEDIA_BUS_FMT_SRGGB16_1X16 0x3020 +#define MEDIA_BUS_FMT_SBGGR20_1X20 0x3021 +#define MEDIA_BUS_FMT_SGBRG20_1X20 0x3022 +#define MEDIA_BUS_FMT_SGRBG20_1X20 0x3023 +#define MEDIA_BUS_FMT_SRGGB20_1X20 0x3024 /* JPEG compressed formats - next is 0x4002 */ #define MEDIA_BUS_FMT_JPEG_1X8 0x4001 -- cgit v1.2.3 From 8d0bbed21ef737195277c0af8c30511fb72e608b Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Tue, 11 Nov 2025 16:15:48 +0000 Subject: media: uapi: Add controls for Mali-C55 ISP Add definitions and documentation for the custom control that will be needed by the Mali-C55 ISP driver. This will be a read only bitmask of the driver's capabilities, informing userspace of which blocks are fitted and which are absent. 
Tested-by: Lad Prabhakar Reviewed-by: Lad Prabhakar Reviewed-by: Jacopo Mondi Signed-off-by: Daniel Scally Signed-off-by: Jacopo Mondi Signed-off-by: Hans Verkuil --- include/uapi/linux/media/arm/mali-c55-config.h | 26 ++++++++++++++++++++++++++ include/uapi/linux/v4l2-controls.h | 6 ++++++ 2 files changed, 32 insertions(+) create mode 100644 include/uapi/linux/media/arm/mali-c55-config.h (limited to 'include') diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h new file mode 100644 index 000000000000..7fddece54ada --- /dev/null +++ b/include/uapi/linux/media/arm/mali-c55-config.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ARM Mali-C55 ISP Driver - Userspace API + * + * Copyright (C) 2023 Ideas on Board Oy + */ + +#ifndef __UAPI_MALI_C55_CONFIG_H +#define __UAPI_MALI_C55_CONFIG_H + +#include + +#define V4L2_CID_MALI_C55_CAPABILITIES (V4L2_CID_USER_MALI_C55_BASE + 0x0) +#define MALI_C55_GPS_PONG (1U << 0) +#define MALI_C55_GPS_WDR (1U << 1) +#define MALI_C55_GPS_COMPRESSION (1U << 2) +#define MALI_C55_GPS_TEMPER (1U << 3) +#define MALI_C55_GPS_SINTER_LITE (1U << 4) +#define MALI_C55_GPS_SINTER (1U << 5) +#define MALI_C55_GPS_IRIDIX_LTM (1U << 6) +#define MALI_C55_GPS_IRIDIX_GTM (1U << 7) +#define MALI_C55_GPS_CNR (1U << 8) +#define MALI_C55_GPS_FRSCALER (1U << 9) +#define MALI_C55_GPS_DS_PIPE (1U << 10) + +#endif /* __UAPI_MALI_C55_CONFIG_H */ diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 2d30107e047e..f84ed133a6c9 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -228,6 +228,12 @@ enum v4l2_colorfx { */ #define V4L2_CID_USER_RKISP1_BASE (V4L2_CID_USER_BASE + 0x1220) +/* + * The base for the Arm Mali-C55 ISP driver controls. 
+ * We reserve 16 controls for this driver + */ +#define V4L2_CID_USER_MALI_C55_BASE (V4L2_CID_USER_BASE + 0x1230) + /* MPEG-class control IDs */ /* The MPEG controls are applicable to all codec controls * and the 'MPEG' part of the define is historical */ -- cgit v1.2.3 From 4d36f732366aeb32bf3486545e597500a3bf0994 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Tue, 11 Nov 2025 16:15:52 +0000 Subject: media: Add MALI_C55_3A_STATS meta format Add a new meta format for the Mali-C55 ISP's 3A Statistics along with a new descriptor entry. Tested-by: Lad Prabhakar Reviewed-by: Laurent Pinchart Acked-by: Nayden Kanchev Co-developed-by: Jacopo Mondi Signed-off-by: Jacopo Mondi Signed-off-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/videodev2.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index becd08fdbddb..cba4b1311667 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -884,6 +884,9 @@ struct v4l2_pix_format { #define V4L2_META_FMT_RPI_FE_CFG v4l2_fourcc('R', 'P', 'F', 'C') /* PiSP FE configuration */ #define V4L2_META_FMT_RPI_FE_STATS v4l2_fourcc('R', 'P', 'F', 'S') /* PiSP FE stats */ +/* Vendor specific - used for Arm Mali-C55 ISP */ +#define V4L2_META_FMT_MALI_C55_STATS v4l2_fourcc('C', '5', '5', 'S') /* ARM Mali-C55 3A Statistics */ + #ifdef __KERNEL__ /* * Line-based metadata formats. Remember to update v4l_fill_fmtdesc() when -- cgit v1.2.3 From c7f832f6f8129bb666346cb4805805ad056059b7 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Tue, 11 Nov 2025 16:15:53 +0000 Subject: media: uapi: Add 3a stats buffer for mali-c55 Describe the format of the 3A statistics buffers in the userspace API header for the mali-c55 ISP. 
Tested-by: Lad Prabhakar Reviewed-by: Laurent Pinchart Acked-by: Nayden Kanchev Co-developed-by: Jacopo Mondi Signed-off-by: Jacopo Mondi Signed-off-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/media/arm/mali-c55-config.h | 170 +++++++++++++++++++++++++ 1 file changed, 170 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h index 7fddece54ada..e31fb8ffa10a 100644 --- a/include/uapi/linux/media/arm/mali-c55-config.h +++ b/include/uapi/linux/media/arm/mali-c55-config.h @@ -8,6 +8,7 @@ #ifndef __UAPI_MALI_C55_CONFIG_H #define __UAPI_MALI_C55_CONFIG_H +#include #include #define V4L2_CID_MALI_C55_CAPABILITIES (V4L2_CID_USER_MALI_C55_BASE + 0x0) @@ -23,4 +24,173 @@ #define MALI_C55_GPS_FRSCALER (1U << 9) #define MALI_C55_GPS_DS_PIPE (1U << 10) +/* + * Frames are split into zones of almost equal width and height - a zone is a + * rectangular tile of a frame. The metering blocks within the ISP collect + * aggregated statistics per zone. A maximum of 15x15 zones can be configured, + * and so the statistics buffer within the hardware is sized to accommodate + * that. + * + * The utilised number of zones is runtime configurable. + */ +#define MALI_C55_MAX_ZONES (15 * 15) + +/** + * struct mali_c55_ae_1024bin_hist - Auto Exposure 1024-bin histogram statistics + * + * @bins: 1024 element array of 16-bit pixel counts. + * + * The 1024-bin histogram module collects image-global but zone-weighted + * intensity distributions of pixels in fixed-width bins. The modules can be + * configured into different "plane modes" which affect the contents of the + * collected statistics. In plane mode 0, pixel intensities are taken regardless + * of colour plane into a single 1024-bin histogram with a bin width of 4. In + * plane mode 1, four 256-bin histograms with a bin width of 16 are collected - + * one for each CFA colour plane. 
In plane modes 4, 5, 6 and 7 two 512-bin + * histograms with a bin width of 8 are collected - in each mode one of the + * colour planes is collected into the first histogram and all the others are + * combined into the second. The histograms are stored consecutively in the bins + * array. + * + * The 16-bit pixel counts are stored as a 4-bit exponent in the most + * significant bits followed by a 12-bit mantissa. Conversion to a usable + * format can be done according to the following pseudo-code:: + * + * if (e == 0) { + * bin = m * 2; + * } else { + * bin = (m + 4096) * 2^e + * } + * + * where + * e is the exponent value in range 0..15 + * m is the mantissa value in range 0..4095 + * + * The pixels used in calculating the statistics can be masked using three + * methods: + * + * 1. Pixels can be skipped in X and Y directions independently. + * 2. Minimum/Maximum intensities can be configured + * 3. Zones can be differentially weighted, including 0 weighted to mask them + * + * The data for this histogram can be collected from different tap points in the + * ISP depending on configuration - after the white balance or digital gain + * blocks, or immediately after the input crossbar. + */ +struct mali_c55_ae_1024bin_hist { + __u16 bins[1024]; +} __attribute__((packed)); + +/** + * struct mali_c55_ae_5bin_hist - Auto Exposure 5-bin histogram statistics + * + * @hist0: 16-bit normalised pixel count for the 0th intensity bin + * @hist1: 16-bit normalised pixel count for the 1st intensity bin + * @hist3: 16-bit normalised pixel count for the 3rd intensity bin + * @hist4: 16-bit normalised pixel count for the 4th intensity bin + * + * The ISP generates a 5-bin histogram of normalised pixel counts within bins of + * pixel intensity for each of 225 possible zones within a frame. 
The centre bin + * of the histogram for each zone is not available from the hardware and must be + * calculated by subtracting the values of hist0, hist1, hist3 and hist4 from + * 0xffff as in the following equation: + * + * hist2 = 0xffff - (hist0 + hist1 + hist3 + hist4) + */ +struct mali_c55_ae_5bin_hist { + __u16 hist0; + __u16 hist1; + __u16 hist3; + __u16 hist4; +} __attribute__((packed)); + +/** + * struct mali_c55_awb_average_ratios - Auto White Balance colour ratios + * + * @avg_rg_gr: Average R/G or G/R ratio in Q4.8 format. + * @avg_bg_br: Average B/G or B/R ratio in Q4.8 format. + * @num_pixels: The number of pixels used in the AWB calculation + * + * The ISP calculates and collects average colour ratios for each zone in an + * image and stores them in Q4.8 format (the lowest 8 bits are fractional, with + * bits [11:8] representing the integer). The exact ratios collected (either + * R/G, B/G or G/R, B/R) are configurable through the parameters buffer. The + * value of the 4 high bits is undefined. + */ +struct mali_c55_awb_average_ratios { + __u16 avg_rg_gr; + __u16 avg_bg_br; + __u32 num_pixels; +} __attribute__((packed)); + +/** + * struct mali_c55_af_statistics - Auto Focus edge and intensity statistics + * + * @intensity_stats: Packed mantissa and exponent value for pixel intensity + * @edge_stats: Packed mantissa and exponent values for edge intensity + * + * The ISP collects the squared sum of pixel intensities for each zone within a + * configurable Region of Interest on the frame. Additionally, the same data are + * collected after being passed through a bandpass filter which removes high and + * low frequency components - these are referred to as the edge statistics. + * + * The intensity and edge statistics for a zone can be used to calculate the + * contrast information for a zone + * + * C = E2 / I2 + * + * Where I2 is the intensity statistic for a zone and E2 is the edge statistic + * for that zone. 
Optimum focus is reached when C is at its maximum. + * + * The intensity and edge statistics are stored packed into a non-standard 16 + * bit floating point format, where the 7 most significant bits represent the + * exponent and the 9 least significant bits the mantissa. This format can be + * unpacked with the following pseudocode:: + * + * if (e == 0) { + * x = m; + * } else { + * x = 2^(e-1) * (m + 2^9) + * } + * + * where + * e is the exponent value in range 0..127 + * m is the mantissa value in range 0..511 + */ +struct mali_c55_af_statistics { + __u16 intensity_stats; + __u16 edge_stats; +} __attribute__((packed)); + +/** + * struct mali_c55_stats_buffer - 3A statistics for the mali-c55 ISP + * + * @ae_1024bin_hist: 1024-bin frame-global pixel intensity histogram + * @iridix_1024bin_hist: Post-Iridix block 1024-bin histogram + * @ae_5bin_hists: 5-bin pixel intensity histograms for AEC + * @reserved1: Undefined buffer space + * @awb_ratios: Color balance ratios for Auto White Balance + * @reserved2: Undefined buffer space + * @af_statistics: Pixel intensity statistics for Auto Focus + * @reserved3: Undefined buffer space + * + * This struct describes the metering statistics space in the Mali-C55 ISP's + * hardware in its entirety. The space between each defined area is marked as + * "reserved" and may not be 0, but should not be used. The @ae_5bin_hists, + * @awb_ratios and @af_statistics members are arrays of statistics per-zone. + * The zones are arranged in the array in raster order starting from the top + * left corner of the image.
+ */ + +struct mali_c55_stats_buffer { + struct mali_c55_ae_1024bin_hist ae_1024bin_hist; + struct mali_c55_ae_1024bin_hist iridix_1024bin_hist; + struct mali_c55_ae_5bin_hist ae_5bin_hists[MALI_C55_MAX_ZONES]; + __u32 reserved1[14]; + struct mali_c55_awb_average_ratios awb_ratios[MALI_C55_MAX_ZONES]; + __u32 reserved2[14]; + struct mali_c55_af_statistics af_statistics[MALI_C55_MAX_ZONES]; + __u32 reserved3[15]; +} __attribute__((packed)); + #endif /* __UAPI_MALI_C55_CONFIG_H */ -- cgit v1.2.3 From 1ab3cb233d61131b2d02650f8ed9e4e077fd4508 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Tue, 11 Nov 2025 16:15:56 +0000 Subject: media: mali-c55: Add image formats for Mali-C55 parameters buffer Add a new V4L2 meta format code for the Mali-C55 parameters. Tested-by: Lad Prabhakar Reviewed-by: Laurent Pinchart Acked-by: Nayden Kanchev Signed-off-by: Jacopo Mondi Signed-off-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index cba4b1311667..add08188f068 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -885,6 +885,7 @@ struct v4l2_pix_format { #define V4L2_META_FMT_RPI_FE_STATS v4l2_fourcc('R', 'P', 'F', 'S') /* PiSP FE stats */ /* Vendor specific - used for Arm Mali-C55 ISP */ +#define V4L2_META_FMT_MALI_C55_PARAMS v4l2_fourcc('C', '5', '5', 'P') /* ARM Mali-C55 Parameters */ #define V4L2_META_FMT_MALI_C55_STATS v4l2_fourcc('C', '5', '5', 'S') /* ARM Mali-C55 3A Statistics */ #ifdef __KERNEL__ -- cgit v1.2.3 From 08a99369f44eeb63eacc56fe42f4c67a6c7dbc37 Mon Sep 17 00:00:00 2001 From: Daniel Scally Date: Tue, 11 Nov 2025 16:15:57 +0000 Subject: media: uapi: Add parameters structs to mali-c55-config.h Add structures describing the ISP parameters to mali-c55-config.h Tested-by: Lad Prabhakar Acked-by: Nayden Kanchev Co-developed-by: Jacopo Mondi Signed-off-by: Jacopo 
Mondi Signed-off-by: Daniel Scally Signed-off-by: Hans Verkuil --- include/uapi/linux/media/arm/mali-c55-config.h | 598 +++++++++++++++++++++++++ 1 file changed, 598 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/media/arm/mali-c55-config.h b/include/uapi/linux/media/arm/mali-c55-config.h index e31fb8ffa10a..109082c5694f 100644 --- a/include/uapi/linux/media/arm/mali-c55-config.h +++ b/include/uapi/linux/media/arm/mali-c55-config.h @@ -10,6 +10,7 @@ #include #include +#include #define V4L2_CID_MALI_C55_CAPABILITIES (V4L2_CID_USER_MALI_C55_BASE + 0x0) #define MALI_C55_GPS_PONG (1U << 0) @@ -193,4 +194,601 @@ struct mali_c55_stats_buffer { __u32 reserved3[15]; } __attribute__((packed)); +/** + * enum mali_c55_param_buffer_version - Mali-C55 parameters block versioning + * + * @MALI_C55_PARAM_BUFFER_V1: First version of Mali-C55 parameters block + */ +enum mali_c55_param_buffer_version { + MALI_C55_PARAM_BUFFER_V1, +}; + +/** + * enum mali_c55_param_block_type - Enumeration of Mali-C55 parameter blocks + * + * This enumeration defines the types of Mali-C55 parameters block. Each block + * configures a specific processing block of the Mali-C55 ISP. The block + * type allows the driver to correctly interpret the parameters block data. + * + * It is the responsibility of userspace to correctly set the type of each + * parameters block. 
+ * + * @MALI_C55_PARAM_BLOCK_SENSOR_OFFS: Sensor pre-shading black level offset + * @MALI_C55_PARAM_BLOCK_AEXP_HIST: Auto-exposure 1024-bin histogram + * configuration + * @MALI_C55_PARAM_BLOCK_AEXP_IHIST: Post-Iridix auto-exposure 1024-bin + * histogram configuration + * @MALI_C55_PARAM_BLOCK_AEXP_HIST_WEIGHTS: Auto-exposure 1024-bin histogram + * weighting + * @MALI_C55_PARAM_BLOCK_AEXP_IHIST_WEIGHTS: Post-Iridix auto-exposure 1024-bin + * histogram weighting + * @MALI_C55_PARAM_BLOCK_DIGITAL_GAIN: Digital gain + * @MALI_C55_PARAM_BLOCK_AWB_GAINS: Auto-white balance gains + * @MALI_C55_PARAM_BLOCK_AWB_CONFIG: Auto-white balance statistics config + * @MALI_C55_PARAM_BLOCK_AWB_GAINS_AEXP: Auto-white balance gains for AEXP-0 tap + * @MALI_C55_PARAM_MESH_SHADING_CONFIG : Mesh shading tables configuration + * @MALI_C55_PARAM_MESH_SHADING_SELECTION: Mesh shading table selection + */ +enum mali_c55_param_block_type { + MALI_C55_PARAM_BLOCK_SENSOR_OFFS, + MALI_C55_PARAM_BLOCK_AEXP_HIST, + MALI_C55_PARAM_BLOCK_AEXP_IHIST, + MALI_C55_PARAM_BLOCK_AEXP_HIST_WEIGHTS, + MALI_C55_PARAM_BLOCK_AEXP_IHIST_WEIGHTS, + MALI_C55_PARAM_BLOCK_DIGITAL_GAIN, + MALI_C55_PARAM_BLOCK_AWB_GAINS, + MALI_C55_PARAM_BLOCK_AWB_CONFIG, + MALI_C55_PARAM_BLOCK_AWB_GAINS_AEXP, + MALI_C55_PARAM_MESH_SHADING_CONFIG, + MALI_C55_PARAM_MESH_SHADING_SELECTION, +}; + +/** + * struct mali_c55_params_sensor_off_preshading - offset subtraction for each + * color channel + * + * Provides removal of the sensor black level from the sensor data. Separate + * offsets are provided for each of the four Bayer component color channels + * which are defaulted to R, Gr, Gb, B. + * + * header.type should be set to MALI_C55_PARAM_BLOCK_SENSOR_OFFS from + * :c:type:`mali_c55_param_block_type` for this block. 
+ * + * @header: The Mali-C55 parameters block header + * @chan00: Offset for color channel 00 (default: R) + * @chan01: Offset for color channel 01 (default: Gr) + * @chan10: Offset for color channel 10 (default: Gb) + * @chan11: Offset for color channel 11 (default: B) + */ +struct mali_c55_params_sensor_off_preshading { + struct v4l2_isp_params_block_header header; + __u32 chan00; + __u32 chan01; + __u32 chan10; + __u32 chan11; +}; + +/** + * enum mali_c55_aexp_hist_tap_points - Tap points for the AEXP histogram + * @MALI_C55_AEXP_HIST_TAP_WB: After static white balance + * @MALI_C55_AEXP_HIST_TAP_FS: After WDR Frame Stitch + * @MALI_C55_AEXP_HIST_TAP_TPG: After the test pattern generator + */ +enum mali_c55_aexp_hist_tap_points { + MALI_C55_AEXP_HIST_TAP_WB = 0, + MALI_C55_AEXP_HIST_TAP_FS, + MALI_C55_AEXP_HIST_TAP_TPG, +}; + +/** + * enum mali_c55_aexp_skip_x - Horizontal pixel skipping + * @MALI_C55_AEXP_SKIP_X_EVERY_2ND: Collect every 2nd pixel horizontally + * @MALI_C55_AEXP_SKIP_X_EVERY_3RD: Collect every 3rd pixel horizontally + * @MALI_C55_AEXP_SKIP_X_EVERY_4TH: Collect every 4th pixel horizontally + * @MALI_C55_AEXP_SKIP_X_EVERY_5TH: Collect every 5th pixel horizontally + * @MALI_C55_AEXP_SKIP_X_EVERY_8TH: Collect every 8th pixel horizontally + * @MALI_C55_AEXP_SKIP_X_EVERY_9TH: Collect every 9th pixel horizontally + */ +enum mali_c55_aexp_skip_x { + MALI_C55_AEXP_SKIP_X_EVERY_2ND, + MALI_C55_AEXP_SKIP_X_EVERY_3RD, + MALI_C55_AEXP_SKIP_X_EVERY_4TH, + MALI_C55_AEXP_SKIP_X_EVERY_5TH, + MALI_C55_AEXP_SKIP_X_EVERY_8TH, + MALI_C55_AEXP_SKIP_X_EVERY_9TH +}; + +/** + * enum mali_c55_aexp_skip_y - Vertical pixel skipping + * @MALI_C55_AEXP_SKIP_Y_ALL: Collect every single pixel vertically + * @MALI_C55_AEXP_SKIP_Y_EVERY_2ND: Collect every 2nd pixel vertically + * @MALI_C55_AEXP_SKIP_Y_EVERY_3RD: Collect every 3rd pixel vertically + * @MALI_C55_AEXP_SKIP_Y_EVERY_4TH: Collect every 4th pixel vertically + * @MALI_C55_AEXP_SKIP_Y_EVERY_5TH: Collect every 5th pixel 
vertically + * @MALI_C55_AEXP_SKIP_Y_EVERY_8TH: Collect every 8th pixel vertically + * @MALI_C55_AEXP_SKIP_Y_EVERY_9TH: Collect every 9th pixel vertically + */ +enum mali_c55_aexp_skip_y { + MALI_C55_AEXP_SKIP_Y_ALL, + MALI_C55_AEXP_SKIP_Y_EVERY_2ND, + MALI_C55_AEXP_SKIP_Y_EVERY_3RD, + MALI_C55_AEXP_SKIP_Y_EVERY_4TH, + MALI_C55_AEXP_SKIP_Y_EVERY_5TH, + MALI_C55_AEXP_SKIP_Y_EVERY_8TH, + MALI_C55_AEXP_SKIP_Y_EVERY_9TH +}; + +/** + * enum mali_c55_aexp_row_column_offset - Start from the first or second row or + * column + * @MALI_C55_AEXP_FIRST_ROW_OR_COL: Start from the first row / column + * @MALI_C55_AEXP_SECOND_ROW_OR_COL: Start from the second row / column + */ +enum mali_c55_aexp_row_column_offset { + MALI_C55_AEXP_FIRST_ROW_OR_COL = 1, + MALI_C55_AEXP_SECOND_ROW_OR_COL = 2, +}; + +/** + * enum mali_c55_aexp_hist_plane_mode - Mode for the AEXP Histograms + * @MALI_C55_AEXP_HIST_COMBINED: All color planes in one 1024-bin histogram + * @MALI_C55_AEXP_HIST_SEPARATE: Each color plane in one 256-bin histogram with a bin width of 16 + * @MALI_C55_AEXP_HIST_FOCUS_00: Top left plane in the first bank, rest in second bank + * @MALI_C55_AEXP_HIST_FOCUS_01: Top right plane in the first bank, rest in second bank + * @MALI_C55_AEXP_HIST_FOCUS_10: Bottom left plane in the first bank, rest in second bank + * @MALI_C55_AEXP_HIST_FOCUS_11: Bottom right plane in the first bank, rest in second bank + * + * In the "focus" modes statistics are collected into two 512-bin histograms + * with a bin width of 8. One colour plane is in the first histogram with the + * remainder combined into the second. The four options represent which of the + * four positions in a bayer pattern are the focused plane. 
+ */ +enum mali_c55_aexp_hist_plane_mode { + MALI_C55_AEXP_HIST_COMBINED = 0, + MALI_C55_AEXP_HIST_SEPARATE = 1, + MALI_C55_AEXP_HIST_FOCUS_00 = 4, + MALI_C55_AEXP_HIST_FOCUS_01 = 5, + MALI_C55_AEXP_HIST_FOCUS_10 = 6, + MALI_C55_AEXP_HIST_FOCUS_11 = 7, +}; + +/** + * struct mali_c55_params_aexp_hist - configuration for AEXP metering hists + * + * This struct allows users to configure the 1024-bin AEXP histograms. Broadly + * speaking the parameters allow you to mask particular regions of the image and + * to select different kinds of histogram. + * + * The skip_x, offset_x, skip_y and offset_y fields allow users to ignore or + * mask pixels in the frame by their position relative to the top left pixel. + * First, the skip_y, offset_x and offset_y fields define which of the pixels + * within each 2x2 region will be counted in the statistics. + * + * If skip_y == 0 then two pixels from each covered region will be counted. If + * both offset_x and offset_y are zero, then the two left-most pixels in each + * 2x2 pixel region will be counted. Setting offset_x = 1 will discount the top + * left pixel and count the top right pixel. Setting offset_y = 1 will discount + * the bottom left pixel and count the bottom right pixel. + * + * If skip_y != 0 then only a single pixel from each region covered by the + * pattern will be counted. In this case offset_x controls whether the pixel + * that's counted is in the left (if offset_x == 0) or right (if offset_x == 1) + * column and offset_y controls whether the pixel that's counted is in the top + * (if offset_y == 0) or bottom (if offset_y == 1) row. + * + * The skip_x and skip_y fields control how the 2x2 pixel region is repeated + * across the image data. The first instance of the region is always in the top + * left of the image data. The skip_x field controls how many pixels are ignored + * in the x direction before the pixel masking region is repeated. 
The skip_y + * field controls how many pixels are ignored in the y direction before the + * pixel masking region is repeated. + * + * These fields can be used to reduce the number of pixels counted for the + * statistics, but it's important to be careful to configure them correctly. + * Some combinations of values will result in colour components from the input + * data being ignored entirely, for example in the following configuration: + * + * skip_x = 0 + * offset_x = 0 + * skip_y = 0 + * offset_y = 0 + * + * Only the R and Gb components of RGGB data that was input would be collected. + * Similarly in the following configuration: + * + * skip_x = 0 + * offset_x = 0 + * skip_y = 1 + * offset_y = 1 + * + * Only the Gb component of RGGB data that was input would be collected. To + * correct things such that all 4 colour components were included it would be + * necessary to set the skip_x and skip_y fields in a way that resulted in all + * four colour components being collected: + * + * skip_x = 1 + * offset_x = 0 + * skip_y = 1 + * offset_y = 1 + * + * header.type should be set to one of either MALI_C55_PARAM_BLOCK_AEXP_HIST or + * MALI_C55_PARAM_BLOCK_AEXP_IHIST from :c:type:`mali_c55_param_block_type`. + * + * @header: The Mali-C55 parameters block header + * @skip_x: Horizontal decimation. See enum mali_c55_aexp_skip_x + * @offset_x: Skip the first column, or not. See enum mali_c55_aexp_row_column_offset + * @skip_y: Vertical decimation. See enum mali_c55_aexp_skip_y + * @offset_y: Skip the first row, or not. See enum mali_c55_aexp_row_column_offset + * @scale_bottom: Scale pixels in bottom half of intensity range: 0=1x, 1=2x, 2=4x, 3=8x, 4=16x + * @scale_top: Scale pixels in top half of intensity range: 0=1x, 1=2x, 2=4x, 3=8x, 4=16x + * @plane_mode: Plane separation mode. See enum mali_c55_aexp_hist_plane_mode + * @tap_point: Tap point for histogram from enum mali_c55_aexp_hist_tap_points.
+ * This parameter is unused for the post-Iridix Histogram + */ +struct mali_c55_params_aexp_hist { + struct v4l2_isp_params_block_header header; + __u8 skip_x; + __u8 offset_x; + __u8 skip_y; + __u8 offset_y; + __u8 scale_bottom; + __u8 scale_top; + __u8 plane_mode; + __u8 tap_point; +}; + +/** + * struct mali_c55_params_aexp_weights - Array of weights for AEXP metering + * + * This struct allows users to configure the weighting for both of the 1024-bin + * AEXP histograms. The pixel data collected for each zone is multiplied by the + * corresponding weight from this array, which may be zero if the intention is + * to mask off the zone entirely. + * + * header.type should be set to one of either MALI_C55_PARAM_BLOCK_AEXP_HIST_WEIGHTS + * or MALI_C55_PARAM_BLOCK_AEXP_IHIST_WEIGHTS from :c:type:`mali_c55_param_block_type`. + * + * @header: The Mali-C55 parameters block header + * @nodes_used_horiz: Number of active zones horizontally [0..15] + * @nodes_used_vert: Number of active zones vertically [0..15] + * @zone_weights: Zone weighting. Index is row*col where 0,0 is the top + * left zone continuing in raster order. Each zone can be + * weighted in the range [0..15]. The number of rows and + * columns is defined by @nodes_used_vert and + * @nodes_used_horiz + */ +struct mali_c55_params_aexp_weights { + struct v4l2_isp_params_block_header header; + __u8 nodes_used_horiz; + __u8 nodes_used_vert; + __u8 zone_weights[MALI_C55_MAX_ZONES]; +}; + +/** + * struct mali_c55_params_digital_gain - Digital gain value + * + * This struct carries a digital gain value to set in the ISP. + * + * header.type should be set to MALI_C55_PARAM_BLOCK_DIGITAL_GAIN from + * :c:type:`mali_c55_param_block_type` for this block. + * + * @header: The Mali-C55 parameters block header + * @gain: The digital gain value to apply, in Q5.8 format. 
+ */ +struct mali_c55_params_digital_gain { + struct v4l2_isp_params_block_header header; + __u16 gain; +}; + +/** + * enum mali_c55_awb_stats_mode - Statistics mode for AWB + * @MALI_C55_AWB_MODE_GRBR: Statistics collected as Green/Red and Blue/Red ratios + * @MALI_C55_AWB_MODE_RGBG: Statistics collected as Red/Green and Blue/Green ratios + */ +enum mali_c55_awb_stats_mode { + MALI_C55_AWB_MODE_GRBR = 0, + MALI_C55_AWB_MODE_RGBG, +}; + +/** + * struct mali_c55_params_awb_gains - Gain settings for auto white balance + * + * This struct allows users to configure the gains for auto-white balance. There + * are four gain settings corresponding to each colour channel in the bayer + * domain. Although named generically, the association between the gain applied + * and the colour channel is done automatically within the ISP depending on the + * input format, and so the following mapping always holds true:: + * + * gain00 = R + * gain01 = Gr + * gain10 = Gb + * gain11 = B + * + * All of the gains are stored in Q4.8 format. + * + * header.type should be set to one of either MALI_C55_PARAM_BLOCK_AWB_GAINS or + * MALI_C55_PARAM_BLOCK_AWB_GAINS_AEXP from :c:type:`mali_c55_param_block_type`. 
+ * + * @header: The Mali-C55 parameters block header + * @gain00: Multiplier for colour channel 00 + * @gain01: Multiplier for colour channel 01 + * @gain10: Multiplier for colour channel 10 + * @gain11: Multiplier for colour channel 11 + */ +struct mali_c55_params_awb_gains { + struct v4l2_isp_params_block_header header; + __u16 gain00; + __u16 gain01; + __u16 gain10; + __u16 gain11; +}; + +/** + * enum mali_c55_params_awb_tap_points - Tap points for the AWB statistics + * @MALI_C55_AWB_STATS_TAP_PF: Immediately after the Purple Fringe block + * @MALI_C55_AWB_STATS_TAP_CNR: Immediately after the CNR block + */ +enum mali_c55_params_awb_tap_points { + MALI_C55_AWB_STATS_TAP_PF = 0, + MALI_C55_AWB_STATS_TAP_CNR, +}; + +/** + * struct mali_c55_params_awb_config - Stats settings for auto-white balance + * + * This struct allows the configuration of the statistics generated for auto + * white balance. Pixel intensity limits can be set to exclude overly bright or + * dark regions of an image from the statistics entirely. Colour ratio minima + * and maxima can be set to discount pixels whose ratios fall outside the + * defined boundaries; there are two sets of registers to do this - the + * "min/max" ratios which bound a region and the "high/low" ratios which further + * trim the upper and lower ratios.
For example with the boundaries configured + * as follows, only pixels whose colour ratios fall into the region marked "A" + * would be counted:: + * + * cr_high + * 2.0 | | + * | cb_max --> _________________________v_____ + * 1.8 | | \ | + * | | \ | + * 1.6 | | \ | + * | | \ | + * c 1.4 | cb_low -->|\ A \|<-- cb_high + * b | | \ | + * 1.2 | | \ | + * r | | \ | + * a 1.0 | cb_min --> |____\_________________________| + * t | ^ ^ ^ + * i 0.8 | | | | + * o | cr_min | cr_max + * s 0.6 | | + * | cr_low + * 0.4 | + * | + * 0.2 | + * | + * 0.0 |_______________________________________________________________ + * 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 + * cr ratios + * + * header.type should be set to MALI_C55_PARAM_BLOCK_AWB_CONFIG from + * :c:type:`mali_c55_param_block_type` for this block. + * + * @header: The Mali-C55 parameters block header + * @tap_point: The tap point from enum mali_c55_params_awb_tap_points + * @stats_mode: AWB statistics collection mode, see :c:type:`mali_c55_awb_stats_mode` + * @white_level: Upper pixel intensity (I.E. raw pixel values) limit + * @black_level: Lower pixel intensity (I.E.
raw pixel values) limit + * @cr_max: Maximum R/G ratio (Q4.8 format) + * @cr_min: Minimum R/G ratio (Q4.8 format) + * @cb_max: Maximum B/G ratio (Q4.8 format) + * @cb_min: Minimum B/G ratio (Q4.8 format) + * @nodes_used_horiz: Number of active zones horizontally [0..15] + * @nodes_used_vert: Number of active zones vertically [0..15] + * @cr_high: R/G ratio trim high (Q4.8 format) + * @cr_low: R/G ratio trim low (Q4.8 format) + * @cb_high: B/G ratio trim high (Q4.8 format) + * @cb_low: B/G ratio trim low (Q4.8 format) + */ +struct mali_c55_params_awb_config { + struct v4l2_isp_params_block_header header; + __u8 tap_point; + __u8 stats_mode; + __u16 white_level; + __u16 black_level; + __u16 cr_max; + __u16 cr_min; + __u16 cb_max; + __u16 cb_min; + __u8 nodes_used_horiz; + __u8 nodes_used_vert; + __u16 cr_high; + __u16 cr_low; + __u16 cb_high; + __u16 cb_low; +}; + +#define MALI_C55_NUM_MESH_SHADING_ELEMENTS 3072 + +/** + * struct mali_c55_params_mesh_shading_config - Mesh shading configuration + * + * The mesh shading correction module allows programming a separate table of + * either 16x16 or 32x32 node coefficients for 3 different light sources. The + * final correction coefficients applied are computed by blending the + * coefficients from two tables together. + * + * A page of 1024 32-bit integers is associated to each colour channel, with + * pages stored consecutively in memory. Each 32-bit integer packs 3 8-bit + * correction coefficients for a single node, one for each of the three light + * sources. The 8 most significant bits are unused. 
The following table + * describes the layout:: + * + * +----------- Page (Colour Plane) 0 -------------+ + * | @mesh[i] | Mesh Point | Bits | Light Source | + * +-----------+------------+-------+--------------+ + * | 0 | 0,0 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * | 1 | 0,1 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * | ... | ... | ... | ... | + * +-----------+------------+-------+--------------+ + * | 1023 | 31,31 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +----------- Page (Colour Plane) 1 -------------+ + * | @mesh[i] | Mesh Point | Bits | Light Source | + * +-----------+------------+-------+--------------+ + * | 1024 | 0,0 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * | 1025 | 0,1 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * | ... | ... | ... | ... | + * +-----------+------------+-------+--------------+ + * | 2047 | 31,31 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +----------- Page (Colour Plane) 2 -------------+ + * | @mesh[i] | Mesh Point | Bits | Light Source | + * +-----------+------------+-------+--------------+ + * | 2048 | 0,0 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * | 2049 | 0,1 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * | ... | ... | ... | ... | + * +-----------+------------+-------+--------------+ + * | 3071 | 31,31 | 16,23 | LS2 | + * | | | 08-15 | LS1 | + * | | | 00-07 | LS0 | + * +-----------+------------+-------+--------------+ + * + * The @mesh_scale member determines the precision and minimum and maximum gain. 
+ * For example if @mesh_scale is 0 and therefore selects 0 - 2x gain, a value of + * 0 in a coefficient means 0.0 gain, a value of 128 means 1.0 gain and 255 + * means 2.0 gain. + * + * header.type should be set to MALI_C55_PARAM_MESH_SHADING_CONFIG from + * :c:type:`mali_c55_param_block_type` for this block. + * + * @header: The Mali-C55 parameters block header + * @mesh_show: Output the mesh data rather than image data + * @mesh_scale: Set the precision and maximum gain range of mesh shading + * - 0 = 0-2x gain + * - 1 = 0-4x gain + * - 2 = 0-8x gain + * - 3 = 0-16x gain + * - 4 = 1-2x gain + * - 5 = 1-3x gain + * - 6 = 1-5x gain + * - 7 = 1-9x gain + * @mesh_page_r: Mesh page select for red colour plane [0..2] + * @mesh_page_g: Mesh page select for green colour plane [0..2] + * @mesh_page_b: Mesh page select for blue colour plane [0..2] + * @mesh_width: Number of horizontal nodes minus 1 [15,31] + * @mesh_height: Number of vertical nodes minus 1 [15,31] + * @mesh: Mesh shading correction tables + */ +struct mali_c55_params_mesh_shading_config { + struct v4l2_isp_params_block_header header; + __u8 mesh_show; + __u8 mesh_scale; + __u8 mesh_page_r; + __u8 mesh_page_g; + __u8 mesh_page_b; + __u8 mesh_width; + __u8 mesh_height; + __u32 mesh[MALI_C55_NUM_MESH_SHADING_ELEMENTS]; +}; + +/** enum mali_c55_params_mesh_alpha_bank - Mesh shading table bank selection + * @MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS1 - Select Light Sources 0 and 1 + * @MALI_C55_MESH_ALPHA_BANK_LS1_AND_LS2 - Select Light Sources 1 and 2 + * @MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS2 - Select Light Sources 0 and 2 + */ +enum mali_c55_params_mesh_alpha_bank { + MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS1 = 0, + MALI_C55_MESH_ALPHA_BANK_LS1_AND_LS2 = 1, + MALI_C55_MESH_ALPHA_BANK_LS0_AND_LS2 = 4 +}; + +/** + * struct mali_c55_params_mesh_shading_selection - Mesh table selection + * + * The module computes the final correction coefficients by blending the ones + * from two light source tables, which are selected 
(independently for each + * colour channel) by the @mesh_alpha_bank_r/g/b fields. + * + * The final blended coefficients for each node are calculated using the + * following equation: + * + * Final coefficient = (a * LS\ :sub:`b`\ + (256 - a) * LS\ :sub:`a`\) / 256 + * + * Where a is the @mesh_alpha_r/g/b value, and LS\ :sub:`a`\ and LS\ :sub:`b`\ + * are the node coefficients for the two tables selected by the + * @mesh_alpha_bank_r/g/b value. + * + * The scale of the applied correction may also be controlled by tuning the + * @mesh_strength member. This is a modifier to the final coefficients which can + * be used to globally reduce the gains applied. + * + * header.type should be set to MALI_C55_PARAM_MESH_SHADING_SELECTION from + * :c:type:`mali_c55_param_block_type` for this block. + * + * @header: The Mali-C55 parameters block header + * @mesh_alpha_bank_r: Red mesh table select (:c:type:`mali_c55_params_mesh_alpha_bank`) + * @mesh_alpha_bank_g: Green mesh table select (:c:type:`mali_c55_params_mesh_alpha_bank`) + * @mesh_alpha_bank_b: Blue mesh table select (:c:type:`mali_c55_params_mesh_alpha_bank`) + * @mesh_alpha_r: Blend coefficient for R [0..255] + * @mesh_alpha_g: Blend coefficient for G [0..255] + * @mesh_alpha_b: Blend coefficient for B [0..255] + * @mesh_strength: Mesh strength in Q4.12 format [0..4096] + */ +struct mali_c55_params_mesh_shading_selection { + struct v4l2_isp_params_block_header header; + __u8 mesh_alpha_bank_r; + __u8 mesh_alpha_bank_g; + __u8 mesh_alpha_bank_b; + __u8 mesh_alpha_r; + __u8 mesh_alpha_g; + __u8 mesh_alpha_b; + __u16 mesh_strength; +}; + +/** + * define MALI_C55_PARAMS_MAX_SIZE - Maximum size of all Mali C55 Parameters + * + * Though the parameters for the Mali-C55 are passed as optional blocks, the + * driver still needs to know the absolute maximum size so that it can allocate + * a buffer sized appropriately to accommodate userspace attempting to set all + * possible parameters in a single frame.
+ * + * Some structs are in this list multiple times. Where that's the case, it just + * reflects the fact that the same struct can be used with multiple different + * header types from :c:type:`mali_c55_param_block_type`. + */ +#define MALI_C55_PARAMS_MAX_SIZE \ + (sizeof(struct mali_c55_params_sensor_off_preshading) + \ + sizeof(struct mali_c55_params_aexp_hist) + \ + sizeof(struct mali_c55_params_aexp_weights) + \ + sizeof(struct mali_c55_params_aexp_hist) + \ + sizeof(struct mali_c55_params_aexp_weights) + \ + sizeof(struct mali_c55_params_digital_gain) + \ + sizeof(struct mali_c55_params_awb_gains) + \ + sizeof(struct mali_c55_params_awb_config) + \ + sizeof(struct mali_c55_params_awb_gains) + \ + sizeof(struct mali_c55_params_mesh_shading_config) + \ + sizeof(struct mali_c55_params_mesh_shading_selection)) + #endif /* __UAPI_MALI_C55_CONFIG_H */ -- cgit v1.2.3 From d619dd9a3d401063cc6d31cada98c99db449d381 Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Fri, 14 Nov 2025 12:02:11 +0100 Subject: media: v4l2-isp: Rename block_info to block_type_info The v4l2_isp_params_block_info structure contains validation information that apply to a block -type- and not only to a specific ISP block implementation. Clarify this by renaming v4l2_isp_params_block_info in v4l2_isp_params_block_type_info and update the documentation and the users of v4l2-isp accordingly. 
Signed-off-by: Jacopo Mondi Reviewed-by: Laurent Pinchart Reviewed-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/media/v4l2-isp.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/media/v4l2-isp.h b/include/media/v4l2-isp.h index 8b4695663699..f3a6d0edcb24 100644 --- a/include/media/v4l2-isp.h +++ b/include/media/v4l2-isp.h @@ -49,18 +49,18 @@ int v4l2_isp_params_validate_buffer_size(struct device *dev, size_t max_size); /** - * struct v4l2_isp_params_block_info - V4L2 ISP per-block info - * @size: the block expected size + * struct v4l2_isp_params_block_type_info - V4L2 ISP per-block-type info + * @size: the block type expected size * - * The v4l2_isp_params_block_info collects information of the ISP configuration - * blocks for validation purposes. It currently only contains the expected - * block size. + * The v4l2_isp_params_block_type_info collects information of the ISP + * configuration block types for validation purposes. It currently only contains + * the expected block type size. * - * Drivers shall prepare a list of block info, indexed by block type, one for - * each supported ISP block and correctly populate them with the expected block - * size. + * Drivers shall prepare a list of block type info, indexed by block type, one + * for each supported ISP block type and correctly populate them with the + * expected block type size. 
*/ -struct v4l2_isp_params_block_info { +struct v4l2_isp_params_block_type_info { size_t size; }; @@ -69,8 +69,8 @@ struct v4l2_isp_params_block_info { * @dev: the driver's device pointer * @vb: the videobuf2 buffer * @buffer: the V4L2 ISP parameters buffer - * @info: the list of per-block validation info - * @num_blocks: the number of blocks + * @type_info: the array of per-block-type validation info + * @num_block_types: the number of block types in the type_info array * * This function completes the validation of a V4L2 ISP parameters buffer, * verifying each configuration block correctness before the driver can use @@ -85,7 +85,7 @@ struct v4l2_isp_params_block_info { */ int v4l2_isp_params_validate_buffer(struct device *dev, struct vb2_buffer *vb, const struct v4l2_isp_params_buffer *buffer, - const struct v4l2_isp_params_block_info *info, - size_t num_blocks); + const struct v4l2_isp_params_block_type_info *type_info, + size_t num_block_types); #endif /* _V4L2_ISP_H_ */ -- cgit v1.2.3 From c42ba5a87bdccbca11403b7ca8bad1a57b833732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 10 Nov 2025 10:38:52 +0100 Subject: futex: Store time as ktime_t in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The futex core uses ktime_t to represent times, use that also for the restart block. This allows the simplification of the accessors. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Reviewed-by: Jan Kara Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-2-5d39cc93df4f@linutronix.de --- include/linux/restart_block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..4f9316e7590d 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -32,7 +32,7 @@ struct restart_block { u32 val; u32 flags; u32 bitset; - u64 time; + ktime_t time; u32 __user *uaddr2; } futex; /* For nanosleep */ -- cgit v1.2.3 From 4702f4eceb639b6af199151e352e570943619d98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 10 Nov 2025 10:38:53 +0100 Subject: hrtimer: Store time as ktime_t in restart block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hrtimer core uses ktime_t to represent times, use that also for the restart block. CPU timers internally use nanoseconds instead of ktime_t but use the same restart block, so use the correct accessors for those. 
Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251110-restart-block-expiration-v1-3-5d39cc93df4f@linutronix.de --- include/linux/restart_block.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/restart_block.h b/include/linux/restart_block.h index 7e50bbc94e47..36ddfa1ec301 100644 --- a/include/linux/restart_block.h +++ b/include/linux/restart_block.h @@ -43,7 +43,7 @@ struct restart_block { struct __kernel_timespec __user *rmtp; struct old_timespec32 __user *compat_rmtp; }; - u64 expires; + ktime_t expires; } nanosleep; /* For poll */ struct { -- cgit v1.2.3 From 0ca04993dac9b0d21ffbfd22bf54cc43ec2c49f2 Mon Sep 17 00:00:00 2001 From: "Mario Limonciello (AMD)" Date: Wed, 12 Nov 2025 16:40:23 -0600 Subject: PM: Introduce new PMSG_POWEROFF event PMSG_POWEROFF will be used for the PM core to allow differentiating between a hibernation or shutdown sequence when re-using callbacks for common code. Hibernation is started by writing a hibernation method (such as 'platform' 'shutdown', or 'reboot') to use into /sys/power/disk and writing 'disk' to /sys/power/state. Shutdown is initiated with the reboot() syscall with arguments on whether to halt the system or power it off. Tested-by: Eric Naim Signed-off-by: Mario Limonciello (AMD) Link: https://patch.msgid.link/20251112224025.2051702-2-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- include/linux/pm.h | 3 +++ include/trace/events/power.h | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index a72e42eec130..7f69f739f613 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -508,6 +508,7 @@ const struct dev_pm_ops name = { \ * RECOVER Creation of a hibernation image or restoration of the main * memory contents from a hibernation image has failed, call * ->thaw() and ->complete() for all devices. 
+ * POWEROFF System will poweroff, call ->poweroff() for all devices. * * The following PM_EVENT_ messages are defined for internal use by * kernel subsystems. They are never issued by the PM core. @@ -538,6 +539,7 @@ const struct dev_pm_ops name = { \ #define PM_EVENT_USER 0x0100 #define PM_EVENT_REMOTE 0x0200 #define PM_EVENT_AUTO 0x0400 +#define PM_EVENT_POWEROFF 0x0800 #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) @@ -552,6 +554,7 @@ const struct dev_pm_ops name = { \ #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) +#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 82904291c2b8..370f8df2fdb4 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -179,7 +179,8 @@ TRACE_EVENT(pstate_sample, { PM_EVENT_HIBERNATE, "hibernate" }, \ { PM_EVENT_THAW, "thaw" }, \ { PM_EVENT_RESTORE, "restore" }, \ - { PM_EVENT_RECOVER, "recover" }) + { PM_EVENT_RECOVER, "recover" }, \ + { PM_EVENT_POWEROFF, "poweroff" }) DEFINE_EVENT(cpu, cpu_frequency, -- cgit v1.2.3 From ce62118a2e4838bcef1050fff55001a0bf87f0cb Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:49 -0500 Subject: KVM: SEV: Consolidate the SEV policy bits in a single header file Consolidate SEV policy bit definitions into a single file. Use include/linux/psp-sev.h to hold the definitions and remove the current definitions from the arch/x86/kvm/svm/sev.c and arch/x86/include/svm.h files. 
No functional change intended. Signed-off-by: Tom Lendacky Link: https://patch.msgid.link/d9639f88a0b521a1a67aeac77cc609fdea1f90bd.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- include/linux/psp-sev.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..27c92543bf38 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -14,6 +14,25 @@ #include +/* As defined by SEV API, under "Guest Policy". */ +#define SEV_POLICY_MASK_NODBG BIT(0) +#define SEV_POLICY_MASK_NOKS BIT(1) +#define SEV_POLICY_MASK_ES BIT(2) +#define SEV_POLICY_MASK_NOSEND BIT(3) +#define SEV_POLICY_MASK_DOMAIN BIT(4) +#define SEV_POLICY_MASK_SEV BIT(5) +#define SEV_POLICY_MASK_API_MAJOR GENMASK(23, 16) +#define SEV_POLICY_MASK_API_MINOR GENMASK(31, 24) + +/* As defined by SEV-SNP Firmware ABI, under "Guest Policy". */ +#define SNP_POLICY_MASK_API_MINOR GENMASK_ULL(7, 0) +#define SNP_POLICY_MASK_API_MAJOR GENMASK_ULL(15, 8) +#define SNP_POLICY_MASK_SMT BIT_ULL(16) +#define SNP_POLICY_MASK_RSVD_MBO BIT_ULL(17) +#define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) +#define SNP_POLICY_MASK_DEBUG BIT_ULL(19) +#define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) + #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ /** -- cgit v1.2.3 From c9434e64e8b4d17511f514f7495008f573595e3e Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 27 Oct 2025 14:33:50 -0500 Subject: crypto: ccp - Add an API to return the supported SEV-SNP policy bits Supported policy bits are dependent on the level of SEV firmware that is currently running. Create an API to return the supported policy bits for the current level of firmware. 
Signed-off-by: Tom Lendacky Acked-by: Herbert Xu Link: https://patch.msgid.link/e3f711366ddc22e3dd215c987fd2e28dc1c07f54.1761593632.git.thomas.lendacky@amd.com Signed-off-by: Sean Christopherson --- include/linux/psp-sev.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 27c92543bf38..abcdee256c65 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -32,6 +32,20 @@ #define SNP_POLICY_MASK_MIGRATE_MA BIT_ULL(18) #define SNP_POLICY_MASK_DEBUG BIT_ULL(19) #define SNP_POLICY_MASK_SINGLE_SOCKET BIT_ULL(20) +#define SNP_POLICY_MASK_CXL_ALLOW BIT_ULL(21) +#define SNP_POLICY_MASK_MEM_AES_256_XTS BIT_ULL(22) +#define SNP_POLICY_MASK_RAPL_DIS BIT_ULL(23) +#define SNP_POLICY_MASK_CIPHERTEXT_HIDING_DRAM BIT_ULL(24) +#define SNP_POLICY_MASK_PAGE_SWAP_DISABLE BIT_ULL(25) + +/* Base SEV-SNP policy bitmask for minimum supported SEV firmware version */ +#define SNP_POLICY_MASK_BASE (SNP_POLICY_MASK_API_MINOR | \ + SNP_POLICY_MASK_API_MAJOR | \ + SNP_POLICY_MASK_SMT | \ + SNP_POLICY_MASK_RSVD_MBO | \ + SNP_POLICY_MASK_MIGRATE_MA | \ + SNP_POLICY_MASK_DEBUG | \ + SNP_POLICY_MASK_SINGLE_SOCKET) #define SEV_FW_BLOB_MAX_SIZE 0x4000 /* 16KB */ @@ -868,7 +882,10 @@ struct snp_feature_info { u32 edx; } __packed; +#define SNP_RAPL_DISABLE_SUPPORTED BIT(2) #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_AES_256_XTS_POLICY_SUPPORTED BIT(4) +#define SNP_CXL_ALLOW_POLICY_SUPPORTED BIT(5) #ifdef CONFIG_CRYPTO_DEV_SP_PSP @@ -1014,6 +1031,7 @@ void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); +u64 sev_get_snp_policy_bits(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ -- cgit v1.2.3 From 337b1b566db087347194e4543ddfdfa5645275cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 13 Nov 2025 18:26:23 +0200 Subject: PCI: Fix restoring 
BARs on BAR resize rollback path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BAR resize operation is implemented in the pci_resize_resource() and pbus_reassign_bridge_resources() functions. pci_resize_resource() can be called either from __resource_resize_store() from sysfs or directly by the driver for the Endpoint Device. The pci_resize_resource() requires that caller has released the device resources that share the bridge window with the BAR to be resized as otherwise the bridge window is pinned in place and cannot be changed. pbus_reassign_bridge_resources() rolls back resources if the resize operation fails, but rollback is performed only for the bridge windows. Because releasing the device resources are done by the caller of the BAR resize interface, these functions performing the BAR resize do not have access to the device resources as they were before the resize. pbus_reassign_bridge_resources() could try __pci_bridge_assign_resources() after rolling back the bridge windows as they were, however, it will not guarantee the resource are assigned due to differences in how FW and the kernel assign the resources (alignment of the start address and tail). To perform rollback robustly, the BAR resize interface has to be altered to also release the device resources that share the bridge window with the BAR to be resized. Also, remove restoring from the entries failed list as saved list should now contain both the bridge windows and device resources so the extra restore is duplicated work. Some drivers (currently only amdgpu) want to prevent releasing some resources. Add exclude_bars param to pci_resize_resource() and make amdgpu pass its register BAR (BAR 2 or 5), which should never be released during resize operation. 
Normally 64-bit prefetchable resources do not share a bridge window with the 32-bit only register BAR, but there are various fallbacks in the resource assignment logic which may make the resources share the bridge window in rare cases. This change (together with the driver side changes) is to counter the resource releases that had to be done to prevent resource tree corruption in the ("PCI: Release assigned resource before restoring them") change. As such, it likely restores functionality in cases where device resources were released to avoid resource tree conflicts which appeared to be "working" when such conflicts were not correctly detected by the kernel. Reported-by: Simon Richter Link: https://lore.kernel.org/linux-pci/f9a8c975-f5d3-4dd2-988e-4371a1433a60@hogyros.de/ Reported-by: Alex Bennée Link: https://lore.kernel.org/linux-pci/874irqop6b.fsf@draig.linaro.org/ Signed-off-by: Ilpo Järvinen [bhelgaas: squash amdgpu BAR selection from https://lore.kernel.org/r/20251114103053.13778-1-ilpo.jarvinen@linux.intel.com] Signed-off-by: Bjorn Helgaas Tested-by: Alex Bennée # AVA, AMD GPU Reviewed-by: Christian König Link: https://patch.msgid.link/20251113162628.5946-7-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..34ff295cd2e3 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1428,7 +1428,8 @@ static inline int pci_rebar_bytes_to_size(u64 bytes) } u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); -int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size); +int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, + int exclude_bars); int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); -- cgit v1.2.3 From 876e15943e9205096441cbe520dc9ccf82df8344 
Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 13 Nov 2025 20:00:44 +0200 Subject: PCI: Move pci_rebar_bytes_to_size() and clean it up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move pci_rebar_bytes_to_size() from include/linux/pci.h to rebar.c as it does not look very trivial and is not expected to be performance critical. Convert literals to use a newly added PCI_REBAR_MIN_SIZE define. Also add kernel doc for the function as the function is exported. Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Michael J. Ruhl Link: https://patch.msgid.link/20251113180053.27944-3-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 34ff295cd2e3..628dda63b9e0 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1419,17 +1419,13 @@ void pcibios_reset_secondary_bus(struct pci_dev *dev); void pci_update_resource(struct pci_dev *dev, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); int pci_release_resource(struct pci_dev *dev, int resno); -static inline int pci_rebar_bytes_to_size(u64 bytes) -{ - bytes = roundup_pow_of_two(bytes); - - /* Return BAR size as defined in the resizable BAR specification */ - return max(ilog2(bytes), 20) - 20; -} +/* Resizable BAR related routines */ +int pci_rebar_bytes_to_size(u64 bytes); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); + int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); -- cgit v1.2.3 From a337869885083131e575c6367c679f4da4b68bb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 13 Nov 
2025 20:00:45 +0200 Subject: PCI: Move pci_rebar_size_to_bytes() and export it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pci_rebar_size_to_bytes() is in drivers/pci/pci.h but would be useful for endpoint drivers as well. Move the function to rebar.c and export it. In addition, convert the literal to where the number comes from (PCI_REBAR_MIN_SIZE). Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-4-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 628dda63b9e0..33b27e0c4f3e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1422,6 +1422,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); /* Resizable BAR related routines */ int pci_rebar_bytes_to_size(u64 bytes); +resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From bb1fabd0d94efc29f88f86fb996c40ac06db3669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 13 Nov 2025 20:00:47 +0200 Subject: PCI: Add pci_rebar_size_supported() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many callers of pci_rebar_get_possible_sizes() are interested in finding out if a particular encoded BAR Size (PCIe r7.0, sec 7.8.6.3) is supported by the particular BAR. Add pci_rebar_size_supported() into PCI core to make it easy for the drivers to determine if the BAR size is supported or not. Use the new function in pci_resize_resource() and in pci_iov_vf_bar_set_size(). 
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Reviewed-by: Andi Shyti Link: https://patch.msgid.link/20251113180053.27944-6-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 33b27e0c4f3e..0ef827cfaf0c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1424,6 +1424,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); +bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From 1c680f2acdbb3b64965962ca060a6daa6379575d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 13 Nov 2025 20:00:50 +0200 Subject: PCI: Add pci_rebar_get_max_size() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pci_rebar_get_max_size() to allow simplifying code that wants to know the maximum possible size for a Resizable BAR. 
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-9-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 0ef827cfaf0c..898bc3a4e8e7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1425,6 +1425,7 @@ int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); +int pci_rebar_get_max_size(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, int exclude_bars); -- cgit v1.2.3 From bf0a90fc907e47344f88e5b9b241082184dbac27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 13 Nov 2025 20:00:53 +0200 Subject: PCI: Convert BAR sizes bitmasks to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCIe r7.0, sec 7.8.6, defines resizable BAR sizes beyond the currently supported maximum of 128TB, which will require more than u32 to store the entire bitmask. Convert Resizable BAR related functions to use u64 bitmask for BAR sizes to make the typing more future-proof. The support for the larger BAR sizes themselves is not added at this point. 
Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Reviewed-by: Christian König Link: https://patch.msgid.link/20251113180053.27944-12-ilpo.jarvinen@linux.intel.com --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index 898bc3a4e8e7..4b7f4c08b5c7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1423,7 +1423,7 @@ int pci_release_resource(struct pci_dev *dev, int resno); /* Resizable BAR related routines */ int pci_rebar_bytes_to_size(u64 bytes); resource_size_t pci_rebar_size_to_bytes(int size); -u32 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); +u64 pci_rebar_get_possible_sizes(struct pci_dev *pdev, int bar); bool pci_rebar_size_supported(struct pci_dev *pdev, int bar, int size); int pci_rebar_get_max_size(struct pci_dev *pdev, int bar); int __must_check pci_resize_resource(struct pci_dev *dev, int i, int size, -- cgit v1.2.3 From 4518767be9089ea4f54754ad27364d6134fc46e2 Mon Sep 17 00:00:00 2001 From: Jianyun Gao Date: Sat, 27 Sep 2025 17:34:10 +0800 Subject: time: Fix a few typos in time[r] related code comments Signed-off-by: Jianyun Gao Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20250927093411.1509275-1-jianyungao89@gmail.com --- include/linux/delay.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/delay.h b/include/linux/delay.h index 89866bab100d..46412c00033a 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -68,7 +68,7 @@ void usleep_range_state(unsigned long min, unsigned long max, * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The task will be in the state TASK_UNINTERRUPTIBLE during the sleep. 
*/ @@ -82,10 +82,10 @@ static inline void usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in microseconds to sleep * @max: Maximum time in microseconds to sleep * - * For basic information please refere to usleep_range_state(). + * For basic information please refer to usleep_range_state(). * * The sleeping task has the state TASK_IDLE during the sleep to prevent - * contribution to the load avarage. + * contribution to the load average. */ static inline void usleep_range_idle(unsigned long min, unsigned long max) { @@ -96,7 +96,7 @@ static inline void usleep_range_idle(unsigned long min, unsigned long max) * ssleep - wrapper for seconds around msleep * @seconds: Requested sleep duration in seconds * - * Please refere to msleep() for detailed information. + * Please refer to msleep() for detailed information. */ static inline void ssleep(unsigned int seconds) { -- cgit v1.2.3 From ef8057b07c72a817537856b98d6e7493b9404eaf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 13 Nov 2025 20:33:33 +0100 Subject: PM: runtime: Wrapper macros for ACQUIRE()/ACQUIRE_ERR() Add wrapper macros for ACQUIRE()/ACQUIRE_ERR() and runtime PM usage counter guards introduced recently: pm_runtime_active_try, pm_runtime_active_auto_try, pm_runtime_active_try_enabled, and pm_runtime_active_auto_try_enabled. The new macros should be more straightforward to use. 
For example, they can be used for rewriting a piece of code like below: ACQUIRE(pm_runtime_active_try, pm)(dev); if ((ret = ACQUIRE_ERR(pm_runtime_active_try, &pm))) return ret; in the following way: PM_RUNTIME_ACQUIRE(dev, pm); if ((ret = PM_RUNTIME_ACQUIRE_ERR(&pm))) return ret; If the original code does not care about the specific error code returned when attempting to resume the device: ACQUIRE(pm_runtime_active_try, pm)(dev); if (ACQUIRE_ERR(pm_runtime_active_try, &pm)) return -ENXIO; it may be changed like this: PM_RUNTIME_ACQUIRE(dev, pm); if (PM_RUNTIME_ACQUIRE_ERR(&pm)) return -ENXIO; Link: https://lore.kernel.org/linux-pm/5068916.31r3eYUQgx@rafael.j.wysocki/ Signed-off-by: Rafael J. Wysocki Reviewed-by: Dan Williams Reviewed-by: Dhruva Gole Reviewed-by: Jonathan Cameron Reviewed-by: Frank Li Link: https://patch.msgid.link/3400866.aeNJFYEL58@rafael.j.wysocki --- include/linux/pm_runtime.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 0b436e15f4cd..911d7a4d32c1 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -637,6 +637,30 @@ DEFINE_GUARD_COND(pm_runtime_active_auto, _try, DEFINE_GUARD_COND(pm_runtime_active_auto, _try_enabled, pm_runtime_resume_and_get(_T), _RET == 0) +/* ACQUIRE() wrapper macros for the guards defined above. */ + +#define PM_RUNTIME_ACQUIRE(_dev, _var) \ + ACQUIRE(pm_runtime_active_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED(_dev, _var) \ + ACQUIRE(pm_runtime_active_try_enabled, _var)(_dev) + +#define PM_RUNTIME_ACQUIRE_IF_ENABLED_AUTOSUSPEND(_dev, _var) \ + ACQUIRE(pm_runtime_active_auto_try_enabled, _var)(_dev) + +/* + * ACQUIRE_ERR() wrapper macro for guard pm_runtime_active. 
+ * + * Always check PM_RUNTIME_ACQUIRE_ERR() after using one of the + * PM_RUNTIME_ACQUIRE*() macros defined above (yes, it can be used with + * any of them) and if it is nonzero, avoid accessing the given device. + */ +#define PM_RUNTIME_ACQUIRE_ERR(_var_ptr) \ + ACQUIRE_ERR(pm_runtime_active, _var_ptr) + /** * pm_runtime_put_sync - Drop device usage counter and run "idle check" if 0. * @dev: Target device. -- cgit v1.2.3 From 1dcb98bbb7538d4b9015d47c934acdf5ea86045c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Nov 2025 15:33:41 -1000 Subject: sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs With the buddy lockup detector, smp_processor_id() returns the detecting CPU, not the locked CPU, making scx_hardlockup()'s printouts confusing. Pass the locked CPU number from watchdog_hardlockup_check() as a parameter instead. Also add kerneldoc comments to handle_lockup(), scx_hardlockup(), and scx_rcu_cpu_stall() documenting their return value semantics. Suggested-by: Doug Anderson Reviewed-by: Douglas Anderson Acked-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- include/linux/sched/ext.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 70ee5c28a74d..bcb962d5ee7d 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -230,7 +230,7 @@ struct sched_ext_entity { void sched_ext_dead(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); void scx_softlockup(u32 dur_s); -bool scx_hardlockup(void); +bool scx_hardlockup(int cpu); bool scx_rcu_cpu_stall(void); #else /* !CONFIG_SCHED_CLASS_EXT */ @@ -238,7 +238,7 @@ bool scx_rcu_cpu_stall(void); static inline void sched_ext_dead(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void scx_softlockup(u32 dur_s) {} -static inline bool 
scx_hardlockup(void) { return false; } +static inline bool scx_hardlockup(int cpu) { return false; } static inline bool scx_rcu_cpu_stall(void) { return false; } #endif /* CONFIG_SCHED_CLASS_EXT */ -- cgit v1.2.3 From f86e51399c2a911a5b01d441de513f17bf773856 Mon Sep 17 00:00:00 2001 From: Xu Yilun Date: Thu, 13 Nov 2025 17:02:27 -0800 Subject: PCI/IDE: Add Address Association Register setup for downstream MMIO The address ranges for downstream Address Association Registers need to cover memory addresses for all functions (PFs/VFs/downstream devices) managed by a Device Security Manager (DSM). The proposed solution is to get the memory (32-bit only) range and prefetchable-memory (64-bit capable) range from the immediate ancestor downstream port (either the direct-attach RP or deepest switch port when switch attached). Similar to RID association, address associations will be set by default if hardware sets 'Number of Address Association Register Blocks' in the 'Selective IDE Stream Capability Register' to a non-zero value. TSM drivers can opt-out of the settings by zero'ing out unwanted / unsupported address ranges. E.g. TDX Connect only supports prefetchable (64-bit capable) memory ranges for the Address Association setting. If the immediate downstream port provides both a memory range and prefetchable-memory range, but the IDE partner port only provides 1 Address Association Register block then the TSM driver can pick which range to associate, or let the PCI core prioritize memory. Note, the Address Association Register setup for upstream requests is still uncertain so is not included. 
Co-developed-by: Aneesh Kumar K.V Signed-off-by: Aneesh Kumar K.V Co-developed-by: Arto Merilainen Signed-off-by: Arto Merilainen Signed-off-by: Xu Yilun Co-developed-by: Dan Williams Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251114010227.567693-1-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 32 ++++++++++++++++++++++++++++++++ include/linux/pci.h | 5 +++++ 2 files changed, 37 insertions(+) (limited to 'include') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index d0f10f3c89fc..93194338e4d0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -28,21 +28,53 @@ enum pci_ide_partner_select { * @rid_start: Partner Port Requester ID range start * @rid_end: Partner Port Requester ID range end * @stream_index: Selective IDE Stream Register Block selection + * @mem_assoc: PCI bus memory address association for targeting peer partner + * @pref_assoc: PCI bus prefetchable memory address association for + * targeting peer partner * @default_stream: Endpoint uses this stream for all upstream TLPs regardless of * address and RID association registers * @setup: flag to track whether to run pci_ide_stream_teardown() for this * partner slot * @enable: flag whether to run pci_ide_stream_disable() for this partner slot + * + * By default, pci_ide_stream_alloc() initializes @mem_assoc and @pref_assoc + * with the immediate ancestor downstream port memory ranges (i.e. Type 1 + * Configuration Space Header values). Caller may zero size ({0, -1}) the range + * to drop it from consideration at pci_ide_stream_setup() time. 
*/ struct pci_ide_partner { u16 rid_start; u16 rid_end; u8 stream_index; + struct pci_bus_region mem_assoc; + struct pci_bus_region pref_assoc; unsigned int default_stream:1; unsigned int setup:1; unsigned int enable:1; }; +/** + * struct pci_ide_regs - Hardware register association settings for Selective + * IDE Streams + * @rid1: IDE RID Association Register 1 + * @rid2: IDE RID Association Register 2 + * @addr: Up to two address association blocks (IDE Address Association Register + * 1 through 3) for MMIO and prefetchable MMIO + * @nr_addr: Number of address association blocks initialized + * + * See pci_ide_stream_to_regs() + */ +struct pci_ide_regs { + u32 rid1; + u32 rid2; + struct { + u32 assoc1; + u32 assoc2; + u32 assoc3; + } addr[2]; + int nr_addr; +}; + /** * struct pci_ide - PCIe Selective IDE Stream descriptor * @pdev: PCIe Endpoint in the pci_ide_partner pair diff --git a/include/linux/pci.h b/include/linux/pci.h index 2c8dbae4916c..ba39ca78b382 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -870,6 +870,11 @@ struct pci_bus_region { pci_bus_addr_t end; }; +static inline pci_bus_addr_t pci_bus_region_size(const struct pci_bus_region *region) +{ + return region->end - region->start + 1; +} + struct pci_dynids { spinlock_t lock; /* Protects list, index */ struct list_head list; /* For IDs added at runtime */ -- cgit v1.2.3 From 079115370d00c78ef69b31dd15def90adf2aa579 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:43 -0800 Subject: PCI/IDE: Initialize an ID for all IDE streams The PCIe spec defines two types of streams - selective and link. Each stream has an ID from the same bucket so a stream ID does not tell the type. The spec defines an "enable" bit for every stream and required stream IDs to be unique among all enabled stream but there is no such requirement for disabled streams. 
However, when IDE_KM is programming keys, an IDE-capable device needs to know the type of stream being programmed to write it directly to the hardware as keys are relatively large, possibly many of them and devices often struggle with keeping around rather big data not being used. Walk through all streams on a device and initialise the IDs to some unique number, both link and selective. The weakest part of this proposal is the host bridge ide_stream_ids_ida. Technically, a Stream ID only needs to be unique within a given partner pair. However, with "anonymous" / unassigned streams there is no convenient place to track the available ids. Proceed with an ida in the host bridge for now, but consider moving this tracking to be an ide_stream_ids_ida per device. Co-developed-by: Alexey Kardashevskiy Signed-off-by: Alexey Kardashevskiy Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-6-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-ide.h | 6 ++++++ include/linux/pci.h | 1 + 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/pci-ide.h b/include/linux/pci-ide.h index 93194338e4d0..37a1ad9501b0 100644 --- a/include/linux/pci-ide.h +++ b/include/linux/pci-ide.h @@ -97,6 +97,12 @@ struct pci_ide { struct tsm_dev *tsm_dev; }; +/* + * Some devices need help with aliased stream-ids even for idle streams. Use + * this id as the "never enabled" place holder. 
+ */ +#define PCI_IDE_RESERVED_STREAM_ID 255 + void pci_ide_set_nr_streams(struct pci_host_bridge *hb, u16 nr); struct pci_ide_partner *pci_ide_to_settings(struct pci_dev *pdev, struct pci_ide *ide); diff --git a/include/linux/pci.h b/include/linux/pci.h index ba39ca78b382..52a235c61023 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -619,6 +619,7 @@ struct pci_host_bridge { #ifdef CONFIG_PCI_IDE u16 nr_ide_streams; /* Max streams possibly active in @ide_stream_ida */ struct ida ide_stream_ida; + struct ida ide_stream_ids_ida; /* track unique ids per domain */ #endif u8 (*swizzle_irq)(struct pci_dev *, u8 *); /* Platform IRQ swizzler */ int (*map_irq)(const struct pci_dev *, u8, u8); -- cgit v1.2.3 From 50cbec192f5317e29be993e2a634bbbdfcf0230e Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:44 -0800 Subject: PCI/TSM: Add pci_tsm_bind() helper for instantiating TDIs After a PCIe device has established a secure link and session between a TEE Security Manager (TSM) and its local Device Security Manager (DSM), the device or its subfunctions are candidates to be bound to a private memory context, a TVM. A PCIe device function interface assigned to a TVM is a TEE Device Interface (TDI). The pci_tsm_bind() requests the low-level TSM driver to associate the device with private MMIO and private IOMMU context resources of a given TVM represented by a @kvm argument. A device in the bound state corresponds to the TDISP protocol LOCKED state and awaits validation by the TVM. It is a 'struct pci_tsm_link_ops' operation because, similar to IDE establishment, it involves host side resource establishment and context setup on behalf of the guest. It is also expected to be performed lazily to allow for operation of the device in non-confidential "shared" context for pre-lock configuration. 
Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-7-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-tsm.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index d7b078d5e272..a5e297677917 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -6,6 +6,8 @@ struct pci_tsm; struct tsm_dev; +struct kvm; +enum pci_tsm_req_scope; /* * struct pci_tsm_ops - manage confidential links and security state @@ -29,12 +31,16 @@ struct pci_tsm_ops { * @connect: establish / validate a secure connection (e.g. IDE) * with the device * @disconnect: teardown the secure link + * @bind: bind a TDI in preparation for it to be accepted by a TVM + * @unbind: remove a TDI from secure operation with a TVM * * Context: @probe, @remove, @connect, and @disconnect run under * pci_tsm_rwsem held for write to sync with TSM unregistration and * mutual exclusion of @connect and @disconnect. @connect and * @disconnect additionally run under the DSM lock (struct * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. + * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM + * lock. 
*/ struct_group_tagged(pci_tsm_link_ops, link_ops, struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, @@ -42,6 +48,9 @@ struct pci_tsm_ops { void (*remove)(struct pci_tsm *tsm); int (*connect)(struct pci_dev *pdev); void (*disconnect)(struct pci_dev *pdev); + struct pci_tdi *(*bind)(struct pci_dev *pdev, + struct kvm *kvm, u32 tdi_id); + void (*unbind)(struct pci_tdi *tdi); ); /* @@ -61,12 +70,25 @@ struct pci_tsm_ops { ); }; +/** + * struct pci_tdi - Core TEE I/O Device Interface (TDI) context + * @pdev: host side representation of guest-side TDI + * @kvm: TEE VM context of bound TDI + * @tdi_id: Identifier (virtual BDF) for the TDI as referenced by the TSM and DSM + */ +struct pci_tdi { + struct pci_dev *pdev; + struct kvm *kvm; + u32 tdi_id; +}; + /** * struct pci_tsm - Core TSM context for a given PCIe endpoint * @pdev: Back ref to device function, distinguishes type of pci_tsm context * @dsm_dev: PCI Device Security Manager for link operations on @pdev * @tsm_dev: PCI TEE Security Manager device for Link Confidentiality or Device * Function Security operations + * @tdi: TDI context established by the @bind link operation * * This structure is wrapped by low level TSM driver data and returned by * probe()/lock(), it is freed by the corresponding remove()/unlock(). 
@@ -82,6 +104,7 @@ struct pci_tsm { struct pci_dev *pdev; struct pci_dev *dsm_dev; struct tsm_dev *tsm_dev; + struct pci_tdi *tdi; }; /** @@ -139,6 +162,10 @@ int pci_tsm_pf0_constructor(struct pci_dev *pdev, struct pci_tsm_pf0 *tsm, void pci_tsm_pf0_destructor(struct pci_tsm_pf0 *tsm); int pci_tsm_doe_transfer(struct pci_dev *pdev, u8 type, const void *req, size_t req_sz, void *resp, size_t resp_sz); +int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); +void pci_tsm_unbind(struct pci_dev *pdev); +void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, + struct kvm *kvm, u32 tdi_id); #else static inline int pci_tsm_register(struct tsm_dev *tsm_dev) { @@ -147,5 +174,12 @@ static inline int pci_tsm_register(struct tsm_dev *tsm_dev) static inline void pci_tsm_unregister(struct tsm_dev *tsm_dev) { } +static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id) +{ + return -ENXIO; +} +static inline void pci_tsm_unbind(struct pci_dev *pdev) +{ +} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From c316c75d57fbb34e2305690813f4dbec9311f2b0 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 12 Nov 2025 18:14:45 -0800 Subject: PCI/TSM: Add pci_tsm_guest_req() for managing TDIs A PCIe device function interface assigned to a TVM is a TEE Device Interface (TDI). A TDI instantiated by pci_tsm_bind() needs additional steps taken by the TVM to be accepted into the TVM's Trusted Compute Boundary (TCB) and transitioned to the RUN state. pci_tsm_guest_req() is a channel for the guest to request TDISP collateral, like Device Interface Reports, and effect TDISP state changes, like LOCKED->RUN transitions. Similar to IDE establishment and pci_tsm_bind(), these are long running operations involving SPDM message passing via the DOE mailbox. 
The path for a TVM to invoke pci_tsm_guest_req() is: * TSM triggers exit via guest-to-host-interface ABI (implementation specific) * VMM invokes handler (KVM handle_exit() -> userspace io) * handler issues request (userspace io handler -> ioctl() -> pci_tsm_guest_req()) * handler supplies response * VMM posts response, notifies/re-enters TVM This path is purely a transport for messages from TVM to platform TSM. By design the host kernel does not and must not care about the content of these messages. I.e. the host kernel is not in the TCB of the TVM. As this is an opaque passthrough interface, similar to fwctl, the kernel requires that implementations stay within the bounds defined by 'enum pci_tsm_req_scope'. Violation of those expectations likely has market and regulatory consequences. Out of scope requests are blocked by default. Co-developed-by: Xu Yilun Signed-off-by: Xu Yilun Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20251113021446.436830-8-dan.j.williams@intel.com Signed-off-by: Dan Williams --- include/linux/pci-tsm.h | 62 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pci-tsm.h b/include/linux/pci-tsm.h index a5e297677917..a6435aba03f9 100644 --- a/include/linux/pci-tsm.h +++ b/include/linux/pci-tsm.h @@ -3,6 +3,7 @@ #define __PCI_TSM_H #include #include +#include struct pci_tsm; struct tsm_dev; @@ -33,14 +34,15 @@ struct pci_tsm_ops { * @disconnect: teardown the secure link * @bind: bind a TDI in preparation for it to be accepted by a TVM * @unbind: remove a TDI from secure operation with a TVM + * @guest_req: marshal TVM information and state change requests * * Context: @probe, @remove, @connect, and @disconnect run under * pci_tsm_rwsem held for write to sync with TSM unregistration and * mutual exclusion of @connect and @disconnect. 
@connect and * @disconnect additionally run under the DSM lock (struct * pci_tsm_pf0::lock) as well as @probe and @remove of the subfunctions. - * @bind and @unbind run under pci_tsm_rwsem held for read and the DSM - * lock. + * @bind, @unbind, and @guest_req run under pci_tsm_rwsem held for read + * and the DSM lock. */ struct_group_tagged(pci_tsm_link_ops, link_ops, struct pci_tsm *(*probe)(struct tsm_dev *tsm_dev, @@ -51,6 +53,11 @@ struct pci_tsm_ops { struct pci_tdi *(*bind)(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); void (*unbind)(struct pci_tdi *tdi); + ssize_t (*guest_req)(struct pci_tdi *tdi, + enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, + sockptr_t req_out, size_t out_len, + u64 *tsm_code); ); /* @@ -152,6 +159,46 @@ static inline bool is_pci_tsm_pf0(struct pci_dev *pdev) return PCI_FUNC(pdev->devfn) == 0; } +/** + * enum pci_tsm_req_scope - Scope of guest requests to be validated by TSM + * + * Guest requests are a transport for a TVM to communicate with a TSM + DSM for + * a given TDI. A TSM driver is responsible for maintaining the kernel security + * model and limit commands that may affect the host, or are otherwise outside + * the typical TDISP operational model. + */ +enum pci_tsm_req_scope { + /** + * @PCI_TSM_REQ_INFO: Read-only, without side effects, request for + * typical TDISP collateral information like Device Interface Reports. + * No device secrets are permitted, and no device state is changed. + */ + PCI_TSM_REQ_INFO = 0, + /** + * @PCI_TSM_REQ_STATE_CHANGE: Request to change the TDISP state from + * UNLOCKED->LOCKED, LOCKED->RUN, or other architecture specific state + * changes to support those transitions for a TDI. No other (unrelated + * to TDISP) device / host state, configuration, or data change is + * permitted. 
+ */ + PCI_TSM_REQ_STATE_CHANGE = 1, + /** + * @PCI_TSM_REQ_DEBUG_READ: Read-only request for debug information + * + * A method to facilitate TVM information retrieval outside of typical + * TDISP operational requirements. No device secrets are permitted. + */ + PCI_TSM_REQ_DEBUG_READ = 2, + /** + * @PCI_TSM_REQ_DEBUG_WRITE: Device state changes for debug purposes + * + * The request may affect the operational state of the device outside of + * the TDISP operational model. If allowed, requires CAP_SYS_RAW_IO, and + * will taint the kernel. + */ + PCI_TSM_REQ_DEBUG_WRITE = 3, +}; + #ifdef CONFIG_PCI_TSM int pci_tsm_register(struct tsm_dev *tsm_dev); void pci_tsm_unregister(struct tsm_dev *tsm_dev); @@ -166,6 +213,9 @@ int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u32 tdi_id); void pci_tsm_unbind(struct pci_dev *pdev); void pci_tsm_tdi_constructor(struct pci_dev *pdev, struct pci_tdi *tdi, struct kvm *kvm, u32 tdi_id); +ssize_t pci_tsm_guest_req(struct pci_dev *pdev, enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, sockptr_t req_out, + size_t out_len, u64 *tsm_code); #else static inline int pci_tsm_register(struct tsm_dev *tsm_dev) { @@ -181,5 +231,13 @@ static inline int pci_tsm_bind(struct pci_dev *pdev, struct kvm *kvm, u64 tdi_id static inline void pci_tsm_unbind(struct pci_dev *pdev) { } +static inline ssize_t pci_tsm_guest_req(struct pci_dev *pdev, + enum pci_tsm_req_scope scope, + sockptr_t req_in, size_t in_len, + sockptr_t req_out, size_t out_len, + u64 *tsm_code) +{ + return -ENXIO; +} #endif #endif /*__PCI_TSM_H */ -- cgit v1.2.3 From 6d650ae9282bcec1e76205b44cb8f17e2265052e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Nov 2025 14:03:57 +0000 Subject: tcp: gro: inline tcp_gro_pull_header() tcp_gro_pull_header() is used in GRO fast path, inline it. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251113140358.58242-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/gro.h | 27 +++++++++++++++++++++++++++ include/net/tcp.h | 1 - 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/gro.h b/include/net/gro.h index e3affb2e2ca8..b65f631c521d 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -593,4 +593,31 @@ static inline void inet6_get_iif_sdif(const struct sk_buff *skb, int *iif, int * struct packet_offload *gro_find_receive_by_type(__be16 type); struct packet_offload *gro_find_complete_by_type(__be16 type); +static inline struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) +{ + unsigned int thlen, hlen, off; + struct tcphdr *th; + + off = skb_gro_offset(skb); + hlen = off + sizeof(*th); + th = skb_gro_header(skb, hlen, off); + if (unlikely(!th)) + return NULL; + + thlen = th->doff * 4; + if (unlikely(thlen < sizeof(*th))) + return NULL; + + hlen = off + thlen; + if (!skb_gro_may_pull(skb, hlen)) { + th = skb_gro_header_slow(skb, hlen, off); + if (unlikely(!th)) + return NULL; + } + + skb_gro_pull(skb, thlen); + + return th; +} + #endif /* _NET_GRO_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 4833ec7903ec..0deb5e9dd911 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2313,7 +2313,6 @@ void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); -struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th); -- cgit v1.2.3 From 06ac470658190e97518f131df01c9c530c293320 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Thu, 13 Nov 2025 19:45:01 +0800 Subject: sctp: Remove unused declaration sctp_auth_init_hmacs() Commit bf40785fa437 ("sctp: Use HMAC-SHA1 and HMAC-SHA256 library 
for chunk authentication") removed the implementation but left the declaration. Signed-off-by: Yue Haibing Reviewed-by: Eric Biggers Link: https://patch.msgid.link/20251113114501.32905-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- include/net/sctp/auth.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/sctp/auth.h b/include/net/sctp/auth.h index 3d5879e08e78..6f2cd562b1de 100644 --- a/include/net/sctp/auth.h +++ b/include/net/sctp/auth.h @@ -72,7 +72,6 @@ struct sctp_shared_key *sctp_auth_get_shkey( int sctp_auth_asoc_copy_shkeys(const struct sctp_endpoint *ep, struct sctp_association *asoc, gfp_t gfp); -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp); const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id); const struct sctp_hmac * sctp_auth_asoc_get_hmac(const struct sctp_association *asoc); -- cgit v1.2.3 From 4cc1aa469cd6b714adc958547a4866247bfd60a9 Mon Sep 17 00:00:00 2001 From: Nuno Das Neves Date: Fri, 17 Oct 2025 11:58:17 -0700 Subject: mshv: Fix deposit memory in MSHV_ROOT_HVCALL When the MSHV_ROOT_HVCALL ioctl is executing a hypercall, and gets HV_STATUS_INSUFFICIENT_MEMORY, it deposits memory and then returns -EAGAIN to userspace. The expectation is that the VMM will retry. However, some VMM code in the wild doesn't do this and simply fails. Rather than force the VMM to retry, change the ioctl to deposit memory on demand and immediately retry the hypercall as is done with all the other hypercall helper functions. In addition to making the ioctl easier to use, removing the need for multiple syscalls improves performance. There is a complication: unlike the other hypercall helper functions, in MSHV_ROOT_HVCALL the input is opaque to the kernel. This is problematic for rep hypercalls, because the next part of the input list can't be copied on each loop after depositing pages (this was the original reason for returning -EAGAIN in this case). 
Introduce hv_do_rep_hypercall_ex(), which adds a 'rep_start' parameter. This solves the issue, allowing the deposit loop in MSHV_ROOT_HVCALL to restart a rep hypercall after depositing pages partway through. Fixes: 621191d709b1 ("Drivers: hv: Introduce mshv_root module to expose /dev/mshv to VMMs") Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 64ba6bc807d9..b89c7e3a2047 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -124,10 +124,12 @@ static inline unsigned int hv_repcomp(u64 status) /* * Rep hypercalls. Callers of this functions are supposed to ensure that - * rep_count and varhead_size comply with Hyper-V hypercall definition. + * rep_count, varhead_size, and rep_start comply with Hyper-V hypercall + * definition. 
*/ -static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, - void *input, void *output) +static inline u64 hv_do_rep_hypercall_ex(u16 code, u16 rep_count, + u16 varhead_size, u16 rep_start, + void *input, void *output) { u64 control = code; u64 status; @@ -135,6 +137,7 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET; control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET; + control |= (u64)rep_start << HV_HYPERCALL_REP_START_OFFSET; do { status = hv_do_hypercall(control, input, output); @@ -152,6 +155,14 @@ static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, return status; } +/* For the typical case where rep_start is 0 */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + return hv_do_rep_hypercall_ex(code, rep_count, varhead_size, 0, + input, output); +} + /* Generate the guest OS identifier as described in the Hyper-V TLFS */ static inline u64 hv_generate_guest_id(u64 kernel_version) { -- cgit v1.2.3 From 3e1b611515d286c6725028e17170f7143e5e51fc Mon Sep 17 00:00:00 2001 From: Tianyu Lan Date: Thu, 18 Sep 2025 11:00:20 -0400 Subject: drivers: hv: Allow vmbus message synic interrupt injected from Hyper-V When Secure AVIC is enabled, VMBus driver should call x2apic Secure AVIC interface to allow Hyper-V to inject VMBus message interrupt. 
Reviewed-by: Michael Kelley Reviewed-by: Neeraj Upadhyay Signed-off-by: Tianyu Lan Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index b89c7e3a2047..db84aced1658 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -347,6 +347,7 @@ bool hv_is_isolation_supported(void); bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); +void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); -- cgit v1.2.3 From 6802d8af47d1dccd9a74a1f708fb9129244ef843 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:04 -0700 Subject: Drivers: hv: VMBus protocol version 6.0 Confidential VMBus is supported from protocol version 6.0 onwards. Provide the required definitions. No functional changes.
Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 1 + include/linux/hyperv.h | 69 ++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 77abddfc750e..7f730a0e54e6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -260,6 +260,7 @@ union hv_hypervisor_version_info { #define HYPERV_CPUID_VIRT_STACK_PROPERTIES 0x40000082 /* Support for the extended IOAPIC RTE format */ #define HYPERV_VS_PROPERTIES_EAX_EXTENDED_IOAPIC_RTE BIT(2) +#define HYPERV_VS_PROPERTIES_EAX_CONFIDENTIAL_VMBUS_AVAILABLE BIT(3) #define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 #define HYPERV_CPUID_MIN 0x40000005 diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 59826c89171c..dfc516c1c719 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -265,16 +265,18 @@ static inline u32 hv_get_avail_to_write_percent( * Linux kernel. 
*/ -#define VERSION_WS2008 ((0 << 16) | (13)) -#define VERSION_WIN7 ((1 << 16) | (1)) -#define VERSION_WIN8 ((2 << 16) | (4)) -#define VERSION_WIN8_1 ((3 << 16) | (0)) -#define VERSION_WIN10 ((4 << 16) | (0)) -#define VERSION_WIN10_V4_1 ((4 << 16) | (1)) -#define VERSION_WIN10_V5 ((5 << 16) | (0)) -#define VERSION_WIN10_V5_1 ((5 << 16) | (1)) -#define VERSION_WIN10_V5_2 ((5 << 16) | (2)) -#define VERSION_WIN10_V5_3 ((5 << 16) | (3)) +#define VMBUS_MAKE_VERSION(MAJ, MIN) ((((u32)MAJ) << 16) | (MIN)) +#define VERSION_WS2008 VMBUS_MAKE_VERSION(0, 13) +#define VERSION_WIN7 VMBUS_MAKE_VERSION(1, 1) +#define VERSION_WIN8 VMBUS_MAKE_VERSION(2, 4) +#define VERSION_WIN8_1 VMBUS_MAKE_VERSION(3, 0) +#define VERSION_WIN10 VMBUS_MAKE_VERSION(4, 0) +#define VERSION_WIN10_V4_1 VMBUS_MAKE_VERSION(4, 1) +#define VERSION_WIN10_V5 VMBUS_MAKE_VERSION(5, 0) +#define VERSION_WIN10_V5_1 VMBUS_MAKE_VERSION(5, 1) +#define VERSION_WIN10_V5_2 VMBUS_MAKE_VERSION(5, 2) +#define VERSION_WIN10_V5_3 VMBUS_MAKE_VERSION(5, 3) +#define VERSION_WIN10_V6_0 VMBUS_MAKE_VERSION(6, 0) /* Make maximum size of pipe payload of 16K */ #define MAX_PIPE_DATA_PAYLOAD (sizeof(u8) * 16384) @@ -335,14 +337,22 @@ struct vmbus_channel_offer { } __packed; /* Server Flags */ -#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 1 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_TRANSFER_PAGES 2 -#define VMBUS_CHANNEL_SERVER_SUPPORTS_GPADLS 4 -#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x10 -#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x100 -#define VMBUS_CHANNEL_PARENT_OFFER 0x200 -#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x400 -#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 +#define VMBUS_CHANNEL_ENUMERATE_DEVICE_INTERFACE 0x0001 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for the channel ring buffer. 
+ */ +#define VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER 0x0002 +/* + * This flag indicates that the channel is offered by the paravisor, and must + * use encrypted memory for GPA direct packets and additional GPADLs. + */ +#define VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY 0x0004 +#define VMBUS_CHANNEL_NAMED_PIPE_MODE 0x0010 +#define VMBUS_CHANNEL_LOOPBACK_OFFER 0x0100 +#define VMBUS_CHANNEL_PARENT_OFFER 0x0200 +#define VMBUS_CHANNEL_REQUEST_MONITORED_NOTIFICATION 0x0400 +#define VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER 0x2000 struct vmpacket_descriptor { u16 type; @@ -621,6 +631,12 @@ struct vmbus_channel_relid_released { u32 child_relid; } __packed; +/* + * Used by the paravisor only, means that the encrypted ring buffers and + * the encrypted external memory are supported + */ +#define VMBUS_FEATURE_FLAG_CONFIDENTIAL_CHANNELS 0x10 + struct vmbus_channel_initiate_contact { struct vmbus_channel_message_header header; u32 vmbus_version_requested; @@ -630,7 +646,8 @@ struct vmbus_channel_initiate_contact { struct { u8 msg_sint; u8 msg_vtl; - u8 reserved[6]; + u8 reserved[2]; + u32 feature_flags; /* VMBus version 6.0 */ }; }; u64 monitor_page1; @@ -1003,6 +1020,10 @@ struct vmbus_channel { /* boolean to control visibility of sysfs for ring buffer */ bool ring_sysfs_visible; + /* The ring buffer is encrypted */ + bool co_ring_buffer; + /* The external memory is encrypted */ + bool co_external_memory; }; #define lock_requestor(channel, flags) \ @@ -1027,6 +1048,16 @@ u64 vmbus_request_addr_match(struct vmbus_channel *channel, u64 trans_id, u64 rqst_addr); u64 vmbus_request_addr(struct vmbus_channel *channel, u64 trans_id); +static inline bool is_co_ring_buffer(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_RING_BUFFER); +} + +static inline bool is_co_external_memory(const struct vmbus_channel_offer_channel *o) +{ + return !!(o->offer.chn_flags & VMBUS_CHANNEL_CONFIDENTIAL_EXTERNAL_MEMORY); +} + static inline bool 
is_hvsock_offer(const struct vmbus_channel_offer_channel *o) { return !!(o->offer.chn_flags & VMBUS_CHANNEL_TLNPI_PROVIDER_OFFER); -- cgit v1.2.3 From 7c8b6c326d830ca5c6b95f390c703966e14167e6 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:05 -0700 Subject: arch/x86: mshyperv: Discover Confidential VMBus availability Confidential VMBus requires enabling paravisor SynIC, and the x86_64 guest has to inspect the Virtualization Stack (VS) CPUID leaf to see if Confidential VMBus is available. If it is, the guest shall enable the paravisor SynIC. Read the relevant data from the VS CPUID leaf. Refactor the code to avoid repeating CPUID and add flags to the struct ms_hyperv_info. For ARM64, the flag for Confidential VMBus is not set which provides the desired behaviour for now as it is not available on ARM64 just yet. Once ARM64 CCA guests are supported, this flag will be set unconditionally when running such a guest. Signed-off-by: Roman Kisel Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index db84aced1658..8da1893365f0 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -62,6 +62,8 @@ struct ms_hyperv_info { }; }; u64 shared_gpa_boundary; + bool msi_ext_dest_id; + bool confidential_vmbus_available; }; extern struct ms_hyperv_info ms_hyperv; extern bool hv_nested; -- cgit v1.2.3 From e6eeb3c782739cd1613a8da856b878b99f741943 Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:06 -0700 Subject: arch: hyperv: Get/set SynIC synth.registers via paravisor The existing Hyper-V wrappers for getting and setting MSRs are hv_get/set_msr(). 
Via hv_get/set_non_nested_msr(), they detect when running in a CoCo VM with a paravisor, and use the TDX or SNP guest-host communication protocol to bypass the paravisor and go directly to the host hypervisor for SynIC MSRs. The "set" function also implements the required special handling for the SINT MSRs. Provide functions that allow manipulating the SynIC registers through the paravisor. Move vmbus_signal_eom() to a more appropriate location (which also avoids breaking KVM). Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 42 ++---------------------------------------- 1 file changed, 2 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index 8da1893365f0..c328265de624 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -176,46 +176,6 @@ static inline u64 hv_generate_guest_id(u64 kernel_version) return guest_id; } -#if IS_ENABLED(CONFIG_HYPERV_VMBUS) -/* Free the message slot and signal end-of-message if required */ -static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) -{ - /* - * On crash we're reading some other CPU's message page and we need - * to be careful: this other CPU may already had cleared the header - * and the host may already had delivered some other message there. - * In case we blindly write msg->header.message_type we're going - * to lose it. We can still lose a message of the same type but - * we count on the fact that there can only be one - * CHANNELMSG_UNLOAD_RESPONSE and we don't care about other messages - * on crash. - */ - if (cmpxchg(&msg->header.message_type, old_msg_type, - HVMSG_NONE) != old_msg_type) - return; - - /* - * The cmxchg() above does an implicit memory barrier to - * ensure the write to MessageType (ie set to - * HVMSG_NONE) happens before we read the - * MessagePending and EOMing. 
Otherwise, the EOMing - * will not deliver any more messages since there is - * no empty slot - */ - if (msg->header.message_flags.msg_pending) { - /* - * This will cause message queue rescan to - * possibly deliver another msg from the - * hypervisor - */ - hv_set_msr(HV_MSR_EOM, 0); - } -} - -extern int vmbus_interrupt; -extern int vmbus_irq; -#endif /* CONFIG_HYPERV_VMBUS */ - int hv_get_hypervisor_version(union hv_hypervisor_version_info *info); void hv_setup_vmbus_handler(void (*handler)(void)); @@ -350,6 +310,8 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); +u64 hv_para_get_synic_register(unsigned int reg); +void hv_para_set_synic_register(unsigned int reg, u64 val); void hyperv_cleanup(void); bool hv_query_ext_cap(u64 cap_query); void hv_setup_dma_ops(struct device *dev, bool coherent); -- cgit v1.2.3 From a156ad8c508209ce22f3213d25c3c2ae1774a57d Mon Sep 17 00:00:00 2001 From: Roman Kisel Date: Wed, 8 Oct 2025 16:34:07 -0700 Subject: arch/x86: mshyperv: Trap on access for some synthetic MSRs hv_set_non_nested_msr() has special handling for SINT MSRs when a paravisor is present. In addition to updating the MSR on the host, the mirror MSR in the paravisor is updated, including with the proxy bit. But with Confidential VMBus, the proxy bit must not be used, so add a special case to skip it. 
Signed-off-by: Roman Kisel Reviewed-by: Alok Tiwari Reviewed-by: Tianyu Lan Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/asm-generic/mshyperv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/mshyperv.h b/include/asm-generic/mshyperv.h index c328265de624..ecedab554c80 100644 --- a/include/asm-generic/mshyperv.h +++ b/include/asm-generic/mshyperv.h @@ -310,6 +310,7 @@ bool hv_isolation_type_snp(void); u64 hv_ghcb_hypercall(u64 control, void *input, void *output, u32 input_size); u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2); void hv_enable_coco_interrupt(unsigned int cpu, unsigned int vector, bool set); +void hv_para_set_sint_proxy(bool enable); u64 hv_para_get_synic_register(unsigned int reg); void hv_para_set_synic_register(unsigned int reg, u64 val); void hyperv_cleanup(void); -- cgit v1.2.3 From 59aeea195948fd507cef2e439a5a964b8432750e Mon Sep 17 00:00:00 2001 From: Purna Pavan Chandra Aekkaladevi Date: Fri, 10 Oct 2025 14:55:48 -0700 Subject: mshv: Add the HVCALL_GET_PARTITION_PROPERTY_EX hypercall This hypercall can be used to fetch extended properties of a partition. Extended properties are properties with values larger than a u64. Some of these also need additional input arguments. Add helper function for using the hypercall in the mshv_root driver. 
Signed-off-by: Purna Pavan Chandra Aekkaladevi Signed-off-by: Nuno Das Neves Reviewed-by: Anirudh Rayabharam Reviewed-by: Praveen K Paladugu Reviewed-by: Easwar Hariharan Reviewed-by: Tianyu Lan Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk.h | 40 ++++++++++++++++++++++++++++++++++++++++ include/hyperv/hvhdk_mini.h | 26 ++++++++++++++++++++++++++ 3 files changed, 67 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 7f730a0e54e6..af85b1c36b6e 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -491,6 +491,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_VP_STATE 0x00e3 #define HVCALL_SET_VP_STATE 0x00e4 #define HVCALL_GET_VP_CPUID_VALUES 0x00f4 +#define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index b4067ada02cf..416c0d45b793 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -376,6 +376,46 @@ struct hv_input_set_partition_property { u64 property_value; } __packed; +union hv_partition_property_arg { + u64 as_uint64; + struct { + union { + u32 arg; + u32 vp_index; + }; + u16 reserved0; + u8 reserved1; + u8 object_type; + } __packed; +}; + +struct hv_input_get_partition_property_ex { + u64 partition_id; + u32 property_code; /* enum hv_partition_property_code */ + u32 padding; + union { + union hv_partition_property_arg arg_data; + u64 arg; + }; +} __packed; + +/* + * NOTE: Should use hv_input_set_partition_property_ex_header to compute this + * size, but hv_input_get_partition_property_ex is identical so it suffices + */ +#define HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE \ + (HV_HYP_PAGE_SIZE - sizeof(struct hv_input_get_partition_property_ex)) + +union hv_partition_property_ex { + u8 buffer[HV_PARTITION_PROPERTY_EX_MAX_VAR_SIZE]; + struct 
hv_partition_property_vmm_capabilities vmm_capabilities; + /* More fields to be filled in when needed */ +}; + +struct hv_output_get_partition_property_ex { + union hv_partition_property_ex property_value; +} __packed; + enum hv_vp_state_page_type { HV_VP_STATE_PAGE_REGISTERS = 0, HV_VP_STATE_PAGE_INTERCEPT_MESSAGE = 1, diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 858f6a3925b3..bf2ce27dfcc5 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -96,8 +96,34 @@ enum hv_partition_property_code { HV_PARTITION_PROPERTY_XSAVE_STATES = 0x00060007, HV_PARTITION_PROPERTY_MAX_XSAVE_DATA_SIZE = 0x00060008, HV_PARTITION_PROPERTY_PROCESSOR_CLOCK_FREQUENCY = 0x00060009, + + /* Extended properties with larger property values */ + HV_PARTITION_PROPERTY_VMM_CAPABILITIES = 0x00090007, }; +#define HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT 1 +#define HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT 59 + +struct hv_partition_property_vmm_capabilities { + u16 bank_count; + u16 reserved[3]; + union { + u64 as_uint64[HV_PARTITION_VMM_CAPABILITIES_BANK_COUNT]; + struct { + u64 map_gpa_preserve_adjustable: 1; + u64 vmm_can_provide_overlay_gpfn: 1; + u64 vp_affinity_property: 1; +#if IS_ENABLED(CONFIG_ARM64) + u64 vmm_can_provide_gic_overlay_locations: 1; +#else + u64 reservedbit3: 1; +#endif + u64 assignable_synthetic_proc_features: 1; + u64 reserved0: HV_PARTITION_VMM_CAPABILITIES_RESERVED_BITFIELD_COUNT; + } __packed; + }; +} __packed; + enum hv_snp_status { HV_SNP_STATUS_NONE = 0, HV_SNP_STATUS_AVAILABLE = 1, -- cgit v1.2.3 From d62313bdf5961b5f815f0b212f029cf146a8a804 Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Fri, 10 Oct 2025 14:55:51 -0700 Subject: mshv: Introduce new hypercall to map stats page for L1VH partitions Introduce HVCALL_MAP_STATS_PAGE2 which provides a map location (GPFN) to map the stats to. This hypercall is required for L1VH partitions, depending on the hypervisor version. 
This uses the same check as the state page map location: mshv_use_overlay_gpfn(). Add mshv_map_vp_state_page() helpers to use this new hypercall or the old one depending on availability. For unmapping, the original HVCALL_UNMAP_STATS_PAGE works for both cases. Signed-off-by: Jinank Jain Signed-off-by: Nuno Das Neves Reviewed-by: Easwar Hariharan Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 1 + include/hyperv/hvhdk_mini.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index af85b1c36b6e..f6e31d1c3267 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -494,6 +494,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 +#define HVCALL_MAP_STATS_PAGE2 0x0131 /* HV_HYPERCALL_INPUT */ #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index bf2ce27dfcc5..064bf735cab6 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -177,6 +177,13 @@ struct hv_input_map_stats_page { union hv_stats_object_identity identity; } __packed; +struct hv_input_map_stats_page2 { + u32 type; /* enum hv_stats_object_type */ + u32 padding; + union hv_stats_object_identity identity; + u64 map_location; +} __packed; + struct hv_output_map_stats_page { u64 map_location; } __packed; -- cgit v1.2.3 From 56c3feb3cc17b764f51191fd3dc461ab55a7b803 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:04 -0700 Subject: hyperv: Add two new hypercall numbers to guest ABI public header In preparation for the subsequent crashdump patches, copy two hypercall numbers to the guest ABI header published by Hyper-V. One notifies the hypervisor of an event that occurs in the root partition; the other asks the hypervisor to disable itself.
Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index f6e31d1c3267..7499a679e60a 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -470,6 +470,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 @@ -494,6 +495,7 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_GET_PARTITION_PROPERTY_EX 0x0101 #define HVCALL_MMIO_READ 0x0106 #define HVCALL_MMIO_WRITE 0x0107 +#define HVCALL_DISABLE_HYP_EX 0x010f #define HVCALL_MAP_STATS_PAGE2 0x0131 /* HV_HYPERCALL_INPUT */ -- cgit v1.2.3 From e0a975ecd2e671664d208723476eeabb3baf08be Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Mon, 6 Oct 2025 15:42:05 -0700 Subject: hyperv: Add definitions for hypervisor crash dump support Add data structures for hypervisor crash dump support to the hypervisor host ABI header file. Details of their usages are in subsequent commits. 
Signed-off-by: Mukesh Rathor Signed-off-by: Wei Liu --- include/hyperv/hvhdk_mini.h | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index 064bf735cab6..f2d7b50de7a4 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -142,6 +142,17 @@ enum hv_system_property { /* Add more values when needed */ HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, + HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47, +}; + +#define HV_PFN_RANGE_PGBITS 24 /* HV_SPA_PAGE_RANGE_ADDITIONAL_PAGES_BITS */ +union hv_pfn_range { /* HV_SPA_PAGE_RANGE */ + u64 as_uint64; + struct { + /* 39:0: base pfn. 63:40: additional pages */ + u64 base_pfn : 64 - HV_PFN_RANGE_PGBITS; + u64 add_pfns : HV_PFN_RANGE_PGBITS; + } __packed; }; enum hv_dynamic_processor_feature_property { @@ -168,6 +179,8 @@ struct hv_output_get_system_property { #if IS_ENABLED(CONFIG_X86) u64 hv_processor_feature_value; #endif + union hv_pfn_range hv_cda_info; /* CrashdumpAreaAddress */ + u64 hv_tramp_pa; /* CrashdumpTrampolineAddress */ }; } __packed; @@ -267,6 +280,48 @@ union hv_gpa_page_access_state { u8 as_uint8; } __packed; +enum hv_crashdump_action { + HV_CRASHDUMP_NONE = 0, + HV_CRASHDUMP_SUSPEND_ALL_VPS, + HV_CRASHDUMP_PREPARE_FOR_STATE_SAVE, + HV_CRASHDUMP_STATE_SAVED, + HV_CRASHDUMP_ENTRY, +}; + +struct hv_partition_event_root_crashdump_input { + u32 crashdump_action; /* enum hv_crashdump_action */ +} __packed; + +struct hv_input_disable_hyp_ex { /* HV_X64_INPUT_DISABLE_HYPERVISOR_EX */ + u64 rip; + u64 arg; +} __packed; + +struct hv_crashdump_area { /* HV_CRASHDUMP_AREA */ + u32 version; + union { + u32 flags_as_uint32; + struct { + u32 cda_valid : 1; + u32 cda_unused : 31; + } __packed; + }; + /* more unused fields */ +} __packed; + +union hv_partition_event_input { + struct hv_partition_event_root_crashdump_input crashdump_input; +}; + +enum 
hv_partition_event { + HV_PARTITION_EVENT_ROOT_CRASHDUMP = 2, +}; + +struct hv_input_notify_partition_event { + u32 event; /* enum hv_partition_event */ + union hv_partition_event_input input; +} __packed; + struct hv_lp_startup_status { u64 hv_status; u64 substatus1; -- cgit v1.2.3 From f91bc8f61abf0e1d23108ae9871c60d7612a09b2 Mon Sep 17 00:00:00 2001 From: Magnus Kulke Date: Thu, 6 Nov 2025 14:13:31 -0800 Subject: mshv: Allow mappings that overlap in uaddr Currently the MSHV driver rejects mappings that would overlap in userspace. Some VMMs require the same memory to be mapped to different parts of the guest's address space, and so working around this restriction is difficult. The hypervisor itself doesn't prohibit mappings that overlap in uaddr (really in SPA, i.e. system physical addresses), so supporting this in the driver doesn't require any extra work: only the checks need to be removed. Since no userspace code until now has been able to overlap regions in userspace, relaxing this constraint can't break any existing code. Signed-off-by: Magnus Kulke Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/uapi/linux/mshv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 876bfe4e4227..374f75e198bc 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -89,7 +89,7 @@ enum { * @rsvd: MBZ * * Map or unmap a region of userspace memory to Guest Physical Addresses (GPA). - * Mappings can't overlap in GPA space or userspace. + * Mappings can't overlap in GPA space. * To unmap, these fields must match an existing mapping.
*/ struct mshv_user_mem_region { -- cgit v1.2.3 From c91fe5f162f278d4aa960d06d2dbc42f9857593a Mon Sep 17 00:00:00 2001 From: Muminul Islam Date: Thu, 13 Nov 2025 11:45:33 -0800 Subject: mshv: Extend create partition ioctl to support cpu features The existing mshv create partition ioctl does not provide a way to specify which cpu features are enabled in the guest. Instead, it attempts to enable all features and those that are not supported are silently disabled by the hypervisor. This was done to reduce unnecessary complexity and is sufficient for many cases. However, new scenarios require fine-grained control over these features. Define a new mshv_create_partition_v2 structure which supports passing the disabled processor and xsave feature bits through to the create partition hypercall directly. Introduce a new flag MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES which enables the new structure. If unset, the original mshv_create_partition struct is used, with the old behavior of enabling all features. Co-developed-by: Jinank Jain Signed-off-by: Jinank Jain Signed-off-by: Muminul Islam Signed-off-by: Nuno Das Neves Reviewed-by: Michael Kelley Signed-off-by: Wei Liu --- include/uapi/linux/mshv.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index 374f75e198bc..b645d17cc531 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -26,6 +26,7 @@ enum { MSHV_PT_BIT_LAPIC, MSHV_PT_BIT_X2APIC, MSHV_PT_BIT_GPA_SUPER_PAGES, + MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES, MSHV_PT_BIT_COUNT, }; @@ -41,6 +42,8 @@ enum { * @pt_flags: Bitmask of 1 << MSHV_PT_BIT_* * @pt_isolation: MSHV_PT_ISOLATION_* * + * This is the initial/v1 version for backward compatibility. + * * Returns a file descriptor to act as a handle to a guest partition. * At this point the partition is not yet initialized in the hypervisor. 
* Some operations must be done with the partition in this state, e.g. setting @@ -52,6 +55,37 @@ struct mshv_create_partition { __u64 pt_isolation; }; +#define MSHV_NUM_CPU_FEATURES_BANKS 2 + +/** + * struct mshv_create_partition_v2 + * + * This is extended version of the above initial MSHV_CREATE_PARTITION + * ioctl and allows for following additional parameters: + * + * @pt_num_cpu_fbanks: Must be set to MSHV_NUM_CPU_FEATURES_BANKS. + * @pt_cpu_fbanks: Disabled processor feature banks array. + * @pt_disabled_xsave: Disabled xsave feature bits. + * + * pt_cpu_fbanks and pt_disabled_xsave are passed through as-is to the create + * partition hypercall. + * + * Returns : same as above original mshv_create_partition + */ +struct mshv_create_partition_v2 { + __u64 pt_flags; + __u64 pt_isolation; + __u16 pt_num_cpu_fbanks; + __u8 pt_rsvd[6]; /* MBZ */ + __u64 pt_cpu_fbanks[MSHV_NUM_CPU_FEATURES_BANKS]; + __u64 pt_rsvd1[2]; /* MBZ */ +#if defined(__x86_64__) + __u64 pt_disabled_xsave; +#else + __u64 pt_rsvd2; /* MBZ */ +#endif +} __packed; + /* /dev/mshv */ #define MSHV_CREATE_PARTITION _IOW(MSHV_IOCTL, 0x00, struct mshv_create_partition) -- cgit v1.2.3 From 796ef5a7fe86a8605f2844471ed7baa8e80bace8 Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:47 +0000 Subject: static_call: allow using STATIC_CALL_TRAMP_STR() from assembly STATIC_CALL_TRAMP_STR() could not be used from .S files because static_call_types.h was not safe to include in assembly as it pulled in C types/constructs that are unavailable under __ASSEMBLY__. Make the header assembly-friendly by adding __ASSEMBLY__ checks and providing only the minimal definitions needed for assembly, so that it can be safely included by .S code. This enables emitting the static call trampoline symbol name via STATIC_CALL_TRAMP_STR() directly in assembly sources, to be used with 'call' instruction. Also, move certain definitions out of the __ASSEMBLY__ checks in compiler_types.h to meet the dependencies.
No functional change for C compilation. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- include/linux/compiler_types.h | 8 ++++---- include/linux/static_call_types.h | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..6897d4d5cb28 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -11,6 +11,10 @@ #define __has_builtin(x) (0) #endif +/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + #ifndef __ASSEMBLY__ /* @@ -79,10 +83,6 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # define __builtin_warning(x, y...) (1) #endif /* __CHECKER__ */ -/* Indirect macros required for expanded argument pasting, eg. __LINE__. */ -#define ___PASTE(a,b) a##b -#define __PASTE(a,b) ___PASTE(a,b) - #ifdef __KERNEL__ /* Attributes */ diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index 5a00b8b2cf9f..cfb6ddeb292b 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -25,6 +25,8 @@ #define STATIC_CALL_SITE_INIT 2UL /* init section */ #define STATIC_CALL_SITE_FLAGS 3UL +#ifndef __ASSEMBLY__ + /* * The static call site table needs to be created by external tooling (objtool * or a compiler plugin). @@ -100,4 +102,6 @@ struct static_call_key { #endif /* CONFIG_HAVE_STATIC_CALL */ +#endif /* __ASSEMBLY__ */ + #endif /* _STATIC_CALL_TYPES_H */ -- cgit v1.2.3 From 4a09126a33638945a1640e064ed73e983b51ae07 Mon Sep 17 00:00:00 2001 From: Michael Riesch Date: Fri, 14 Nov 2025 16:20:13 +0100 Subject: media: dt-bindings: video-interfaces: add defines for sampling modes Add defines for the pixel clock sampling modes (rising edge, falling edge, dual edge) for parallel video interfaces. 
This avoids hardcoded constants in device tree sources. Acked-by: Rob Herring Signed-off-by: Michael Riesch Reviewed-by: Bryan O'Donoghue Signed-off-by: Michael Riesch Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil --- include/dt-bindings/media/video-interfaces.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/media/video-interfaces.h b/include/dt-bindings/media/video-interfaces.h index 88b9d05d8075..0b19c9b2e627 100644 --- a/include/dt-bindings/media/video-interfaces.h +++ b/include/dt-bindings/media/video-interfaces.h @@ -20,4 +20,8 @@ #define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_CAB 4 #define MEDIA_BUS_CSI2_CPHY_LINE_ORDER_CBA 5 +#define MEDIA_PCLK_SAMPLE_FALLING_EDGE 0 +#define MEDIA_PCLK_SAMPLE_RISING_EDGE 1 +#define MEDIA_PCLK_SAMPLE_DUAL_EDGE 2 + #endif /* __DT_BINDINGS_MEDIA_VIDEO_INTERFACES_H__ */ -- cgit v1.2.3 From b3bc229b54e780fe02a41ec65a0cb06acf7ac1d9 Mon Sep 17 00:00:00 2001 From: Chin-Ting Kuo Date: Fri, 10 Oct 2025 16:03:13 +0800 Subject: dt-bindings: watchdog: aspeed,ast2400-wdt: Add support for AST2700 Add support for the AST2700 SoC in the ASPEED watchdog device tree bindings. This includes: - Adding "aspeed,ast2700-wdt" to the compatible string list. - Extending the "aspeed,reset-mask" property description for AST2700. - Defining AST2700-specific reset mask bits in aspeed-wdt.h, covering RESET1 to RESET5. 
Signed-off-by: Chin-Ting Kuo Reviewed-by: Rob Herring (Arm) Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/dt-bindings/watchdog/aspeed-wdt.h | 138 ++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/watchdog/aspeed-wdt.h b/include/dt-bindings/watchdog/aspeed-wdt.h index 7ae6d84b2bd9..89fa31ffce2d 100644 --- a/include/dt-bindings/watchdog/aspeed-wdt.h +++ b/include/dt-bindings/watchdog/aspeed-wdt.h @@ -89,4 +89,142 @@ #define AST2600_WDT_RESET2_DEFAULT 0x03fffff1 +#define AST2700_WDT_RESET1_CPU (1 << 0) +#define AST2700_WDT_RESET1_DRAM (1 << 1) +#define AST2700_WDT_RESET1_SLI0 (1 << 2) +#define AST2700_WDT_RESET1_EHCI (1 << 3) +#define AST2700_WDT_RESET1_HACE (1 << 4) +#define AST2700_WDT_RESET1_SOC_MISC0 (1 << 5) +#define AST2700_WDT_RESET1_VIDEO (1 << 6) +#define AST2700_WDT_RESET1_2D_GRAPHIC (1 << 7) +#define AST2700_WDT_RESET1_RAVS0 (1 << 8) +#define AST2700_WDT_RESET1_RAVS1 (1 << 9) +#define AST2700_WDT_RESET1_GPIO0 (1 << 10) +#define AST2700_WDT_RESET1_SSP (1 << 11) +#define AST2700_WDT_RESET1_TSP (1 << 12) +#define AST2700_WDT_RESET1_CRT (1 << 13) +#define AST2700_WDT_RESET1_USB20_HOST (1 << 14) +#define AST2700_WDT_RESET1_USB11_HOST (1 << 15) +#define AST2700_WDT_RESET1_UFS (1 << 16) +#define AST2700_WDT_RESET1_EMMC (1 << 17) +#define AST2700_WDT_RESET1_AHB_TO_PCIE1 (1 << 18) +#define AST2700_WDT_RESET1_XDMA0 (1 << 22) +#define AST2700_WDT_RESET1_MCTP1 (1 << 23) +#define AST2700_WDT_RESET1_MCTP0 (1 << 24) +#define AST2700_WDT_RESET1_JTAG0 (1 << 25) +#define AST2700_WDT_RESET1_ECC (1 << 26) +#define AST2700_WDT_RESET1_XDMA1 (1 << 27) +#define AST2700_WDT_RESET1_DP (1 << 28) +#define AST2700_WDT_RESET1_DP_MCU (1 << 29) +#define AST2700_WDT_RESET1_AHB_TO_PCIE0 (1 << 31) + +#define AST2700_WDT_RESET1_DEFAULT 0x8207ff71 + +#define AST2700_WDT_RESET2_USB3_A_HOST (1 << 0) +#define AST2700_WDT_RESET2_USB3_A_VHUB3 (1 << 1) +#define AST2700_WDT_RESET2_USB3_A_VHUB2 (1 << 2) 
+#define AST2700_WDT_RESET2_USB3_B_HOST (1 << 3) +#define AST2700_WDT_RESET2_USB3_B_VHUB3 (1 << 4) +#define AST2700_WDT_RESET2_USB3_B_VHUB2 (1 << 5) +#define AST2700_WDT_RESET2_SM3 (1 << 6) +#define AST2700_WDT_RESET2_SM4 (1 << 7) +#define AST2700_WDT_RESET2_SHA3 (1 << 8) +#define AST2700_WDT_RESET2_RSA (1 << 9) + +#define AST2700_WDT_RESET2_DEFAULT 0x000003f6 + +#define AST2700_WDT_RESET3_LPC0 (1 << 0) +#define AST2700_WDT_RESET3_LPC1 (1 << 1) +#define AST2700_WDT_RESET3_MDIO (1 << 2) +#define AST2700_WDT_RESET3_PECI (1 << 3) +#define AST2700_WDT_RESET3_PWM (1 << 4) +#define AST2700_WDT_RESET3_MAC0 (1 << 5) +#define AST2700_WDT_RESET3_MAC1 (1 << 6) +#define AST2700_WDT_RESET3_MAC2 (1 << 7) +#define AST2700_WDT_RESET3_ADC (1 << 8) +#define AST2700_WDT_RESET3_SDC (1 << 9) +#define AST2700_WDT_RESET3_ESPI0 (1 << 10) +#define AST2700_WDT_RESET3_ESPI1 (1 << 11) +#define AST2700_WDT_RESET3_JTAG1 (1 << 12) +#define AST2700_WDT_RESET3_SPI0 (1 << 13) +#define AST2700_WDT_RESET3_SPI1 (1 << 14) +#define AST2700_WDT_RESET3_SPI2 (1 << 15) +#define AST2700_WDT_RESET3_I3C0 (1 << 16) +#define AST2700_WDT_RESET3_I3C1 (1 << 17) +#define AST2700_WDT_RESET3_I3C2 (1 << 18) +#define AST2700_WDT_RESET3_I3C3 (1 << 19) +#define AST2700_WDT_RESET3_I3C4 (1 << 20) +#define AST2700_WDT_RESET3_I3C5 (1 << 21) +#define AST2700_WDT_RESET3_I3C6 (1 << 22) +#define AST2700_WDT_RESET3_I3C7 (1 << 23) +#define AST2700_WDT_RESET3_I3C8 (1 << 24) +#define AST2700_WDT_RESET3_I3C9 (1 << 25) +#define AST2700_WDT_RESET3_I3C10 (1 << 26) +#define AST2700_WDT_RESET3_I3C11 (1 << 27) +#define AST2700_WDT_RESET3_I3C12 (1 << 28) +#define AST2700_WDT_RESET3_I3C13 (1 << 29) +#define AST2700_WDT_RESET3_I3C14 (1 << 30) +#define AST2700_WDT_RESET3_I3C15 (1 << 31) + +#define AST2700_WDT_RESET3_DEFAULT 0x000093ec + +#define AST2700_WDT_RESET4_FMC (1 << 0) +#define AST2700_WDT_RESET4_SOC_MISC1 (1 << 1) +#define AST2700_WDT_RESET4_AHB (1 << 2) +#define AST2700_WDT_RESET4_SLI1 (1 << 3) +#define AST2700_WDT_RESET4_UART0 (1 << 
4) +#define AST2700_WDT_RESET4_UART1 (1 << 5) +#define AST2700_WDT_RESET4_UART2 (1 << 6) +#define AST2700_WDT_RESET4_UART3 (1 << 7) +#define AST2700_WDT_RESET4_I2C_MONITOR (1 << 8) +#define AST2700_WDT_RESET4_HOST_TO_SPI1 (1 << 9) +#define AST2700_WDT_RESET4_HOST_TO_SPI2 (1 << 10) +#define AST2700_WDT_RESET4_GPIO1 (1 << 11) +#define AST2700_WDT_RESET4_FSI (1 << 12) +#define AST2700_WDT_RESET4_CANBUS (1 << 13) +#define AST2700_WDT_RESET4_MCTP (1 << 14) +#define AST2700_WDT_RESET4_XDMA (1 << 15) +#define AST2700_WDT_RESET4_UART5 (1 << 16) +#define AST2700_WDT_RESET4_UART6 (1 << 17) +#define AST2700_WDT_RESET4_UART7 (1 << 18) +#define AST2700_WDT_RESET4_UART8 (1 << 19) +#define AST2700_WDT_RESET4_BOOT_MCU (1 << 20) +#define AST2700_WDT_RESET4_IO_MCU (1 << 21) +#define AST2700_WDT_RESET4_LTPI0 (1 << 22) +#define AST2700_WDT_RESET4_VGA_LINK (1 << 23) +#define AST2700_WDT_RESET4_LTPI1 (1 << 24) +#define AST2700_WDT_RESET4_LTPI_PHY (1 << 25) +#define AST2700_WDT_RESET4_ACE (1 << 26) +#define AST2700_WDT_RESET4_LTPI_GPIO0 (1 << 28) +#define AST2700_WDT_RESET4_LTPI_GPIO1 (1 << 29) +#define AST2700_WDT_RESET4_AHB_TO_PCIE1 (1 << 30) +#define AST2700_WDT_RESET4_I3C_DMA (1 << 31) + +#define AST2700_WDT_RESET4_DEFAULT 0x40303803 + +#define AST2700_WDT_RESET5_I2C_GLOBAL (1 << 0) +#define AST2700_WDT_RESET5_I2C0 (1 << 1) +#define AST2700_WDT_RESET5_I2C1 (1 << 2) +#define AST2700_WDT_RESET5_I2C2 (1 << 3) +#define AST2700_WDT_RESET5_I2C3 (1 << 4) +#define AST2700_WDT_RESET5_I2C4 (1 << 5) +#define AST2700_WDT_RESET5_I2C5 (1 << 6) +#define AST2700_WDT_RESET5_I2C6 (1 << 7) +#define AST2700_WDT_RESET5_I2C7 (1 << 8) +#define AST2700_WDT_RESET5_I2C8 (1 << 9) +#define AST2700_WDT_RESET5_I2C9 (1 << 10) +#define AST2700_WDT_RESET5_I2C10 (1 << 11) +#define AST2700_WDT_RESET5_I2C11 (1 << 12) +#define AST2700_WDT_RESET5_I2C12 (1 << 13) +#define AST2700_WDT_RESET5_I2C13 (1 << 14) +#define AST2700_WDT_RESET5_I2C14 (1 << 15) +#define AST2700_WDT_RESET5_I2C15 (1 << 16) +#define 
AST2700_WDT_RESET5_UHCI (1 << 17) +#define AST2700_WDT_RESET5_USB2_C_UART (1 << 18) +#define AST2700_WDT_RESET5_USB2_C (1 << 19) +#define AST2700_WDT_RESET5_USB2_D_UART (1 << 20) +#define AST2700_WDT_RESET5_USB2_D (1 << 21) + +#define AST2700_WDT_RESET5_DEFAULT 0x00320000 + #endif -- cgit v1.2.3 From 4051a9115ad24bb9a691774730ca9c1dd56de665 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Sep 2025 22:19:10 -0400 Subject: new helper: simple_remove_by_name() simple_recursive_removal(), but instead of victim dentry it takes parent + name. Used to be open-coded in fs/fuse/control.c, but there's no need to expose the guts of that thing there and there are other potential users, so let's lift it into libfs... Acked-by: Miklos Szeredi Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..28bd4e8d3892 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3631,6 +3631,8 @@ extern int simple_rename(struct mnt_idmap *, struct inode *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); +extern void simple_remove_by_name(struct dentry *, const char *, + void (*callback)(struct dentry *)); extern void locked_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); -- cgit v1.2.3 From 1552ddc7fade1ae55af298580ef6c913b8db74bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Sep 2025 17:46:01 -0400 Subject: new helper: simple_done_creating() should be paired with simple_start_creating() - unlocks parent and drops dentry reference. 
Signed-off-by: Al Viro --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 28bd4e8d3892..f5037c556f61 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3662,6 +3662,7 @@ extern int simple_fill_super(struct super_block *, unsigned long, extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); struct dentry *simple_start_creating(struct dentry *, const char *); +void simple_done_creating(struct dentry *); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); -- cgit v1.2.3 From 8a210cacf5dc2a6210ee42aeca5cd03b2400876f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2025 19:15:35 -0500 Subject: introduce a flag for explicitly marking persistently pinned dentries Some filesystems use a kinda-sorta controlled dentry refcount leak to pin dentries of created objects in dcache (and undo it when removing those). Reference is grabbed and not released, but it's not actually _stored_ anywhere. That works, but it's hard to follow and verify; among other things, we have no way to tell _which_ of the increments is intended to be an unpaired one. Worse, on removal we need to decide whether the reference had already been dropped, which can be non-trivial if that removal is on umount and we need to figure out if this dentry is pinned due to e.g. unlink() not done. Usually that is handled by using kill_litter_super() as ->kill_sb(), but there are open-coded special cases of the same (consider e.g. /proc/self). Things get simpler if we introduce a new dentry flag (DCACHE_PERSISTENT) marking those "leaked" dentries. Having it set claims responsibility for +1 in refcount. 
The end result this series is aiming for: * get these unbalanced dget() and dput() replaced with new primitives that would, in addition to adjusting refcount, set and clear persistency flag. * instead of having kill_litter_super() mess with removing the remaining "leaked" references (e.g. for all tmpfs files that hadn't been removed prior to umount), have the regular shrink_dcache_for_umount() strip DCACHE_PERSISTENT of all dentries, dropping the corresponding reference if it had been set. After that kill_litter_super() becomes an equivalent of kill_anon_super(). Doing that in a single step is not feasible - it would affect too many places in too many filesystems. It has to be split into a series. Here we * introduce the new flag * teach shrink_dcache_for_umount() to handle it (i.e. remove and drop refcount on anything that survives to umount with that flag still set) * teach kill_litter_super() that anything with that flag does *not* need to be unpinned. Next commits will add primitives for maintaining that flag and convert the common helpers to those. After that - a long series of per-filesystem patches converting to those primitives. Signed-off-by: Al Viro --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c83e02b94389..94b58655322a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -225,6 +225,7 @@ enum dentry_flags { DCACHE_PAR_LOOKUP = BIT(24), /* being looked up (with parent locked shared) */ DCACHE_DENTRY_CURSOR = BIT(25), DCACHE_NORCU = BIT(26), /* No RCU delay for freeing */ + DCACHE_PERSISTENT = BIT(27) }; #define DCACHE_MANAGED_DENTRY \ -- cgit v1.2.3 From bacdf1d70bbe2027619c7bbbe48b379a806a9678 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Mar 2025 19:38:04 -0500 Subject: primitives for maintaining persisitency * d_make_persistent(dentry, inode) - bump refcount, mark persistent and make hashed positive.
Return value is a borrowed reference to dentry; it can be used until something removes persistency (at the very least, until the parent gets unlocked, but some filesystems may have stronger exclusion). * d_make_discardable() - remove persistency mark and drop reference. d_make_persistent() is similar to combination of d_instantiate(), dget() and setting flag. The only difference is that unlike d_instantiate() it accepts hashed and unhashed negatives alike. It is always called in strong locking environment (parent held exclusive, or, in some cases, dentry coming from d_alloc_name()); if we ever start using it with parent held only shared and dentry coming from d_alloc_parallel(), we'll need to copy the in-lookup logic from __d_add(). d_make_discardable() is equivalent to combination of removing flag and dput(); since flag removal requires ->d_lock, there's no point trying to avoid taking that for refcount decrement as fast_dput() does. The slow path of dput() has been taken into a helper and reused in d_make_discardable() instead. Signed-off-by: Al Viro --- include/linux/dcache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 94b58655322a..6ec4066825e3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -611,5 +611,7 @@ static inline struct dentry *d_next_sibling(const struct dentry *dentry) } void set_default_d_op(struct super_block *, const struct dentry_operations *); +struct dentry *d_make_persistent(struct dentry *, struct inode *); +void d_make_discardable(struct dentry *dentry); #endif /* __LINUX_DCACHE_H */ -- cgit v1.2.3 From 23cbc7a795853bc7a8d0512b7c686ef879f6e909 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 26 Feb 2024 01:55:36 -0500 Subject: procfs: make /self and /thread_self dentries persistent ...
and there's no need to remember those pointers anywhere - ->kill_sb() no longer needs to bother since kill_anon_super() will take care of them anyway and proc_pid_readdir() only wants the inumbers, which we had in a couple of static variables all along. Signed-off-by: Al Viro --- include/linux/proc_fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index f139377f4b31..19d1c5e5f335 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -66,8 +66,6 @@ enum proc_pidonly { struct proc_fs_info { struct pid_namespace *pid_ns; - struct dentry *proc_self; /* For /proc/self */ - struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; -- cgit v1.2.3 From 566a414558aec1ab263ab8709fa783dfa2e34325 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 2 Oct 2025 10:00:52 -0400 Subject: svcrdma: Increase the server's default RPC/RDMA credit grant The range of commits from commit e3274026e2ec ("SUNRPC: move all of xprt handling into svc_xprt_handle()") to commit 15d39883ee7d ("SUNRPC: change the back-channel queue to lwq") enabled NFSD performance to scale better as the number of nfsd threads is increased. These commits were merged in v6.7. Now that the nfsd thread count can scale to more threads, permit individual clients to make more use of those threads. Increase the RPC/RDMA per-connection credit grant from 64 to 128 -- same as the Linux NFS client. Simple single client fio-based benchmarking so far shows only improvement, no regression. 
Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 22704c2e5b9b..57f4fd94166a 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -131,7 +131,7 @@ static inline struct svcxprt_rdma *svc_rdma_rqst_rdma(struct svc_rqst *rqstp) */ enum { RPCRDMA_LISTEN_BACKLOG = 10, - RPCRDMA_MAX_REQUESTS = 64, + RPCRDMA_MAX_REQUESTS = 128, RPCRDMA_MAX_BC_REQUESTS = 2, }; -- cgit v1.2.3 From 6b3b697d65d46a0f640216a3f6c72856c159c567 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 13 Oct 2025 09:54:53 -0400 Subject: sunrpc: allocate a separate bvec array for socket sends svc_tcp_sendmsg() calls xdr_buf_to_bvec() with the second slot of rq_bvec as the start, but doesn't reduce the array length by one, which could lead to an array overrun. Also, rq_bvec is always rq_maxpages in length, which can be too short in some cases, since the TCP record marker consumes a slot. Fix both problems by adding a separate bvec array to the svc_sock that is specifically for sending. For TCP, make this array one slot longer than rq_maxpages, to account for the record marker. For UDP, only allocate as large an array as we need since it's limited to 64k of payload. 
Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: Chuck Lever --- include/linux/sunrpc/svcsock.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 963bbe251e52..de37069aba90 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -26,6 +26,9 @@ struct svc_sock { void (*sk_odata)(struct sock *); void (*sk_owspace)(struct sock *); + /* For sends (protected by xpt_mutex) */ + struct bio_vec *sk_bvec; + /* private TCP part */ /* On-the-wire fragment header: */ __be32 sk_marker; -- cgit v1.2.3 From dd9896d41fdf1050934d6a46a1c5ca2164284e72 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Sat, 15 Nov 2025 19:06:26 +0100 Subject: ASoC: Intel: avs: Allow the topology to carry NHLT data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Typically the hardware configuration for I2S and DMIC devices resides in the Non-HDAudio Link Table (NHLT) that is part of the ACPI tree. As the NHLTs existing in the field are not always perfect, workaround mechanisms are provided to patch them. Currently the avs-driver is utilizing the ->blob_fmt override (see topology.h and struct avs_tplg_modcfg_ext) when there is a valid entry within a NHLT to configure the hardware for specific format but its descriptor (header) is invalid. A separate case is when there is no correct hardware configuration at all within the NHLT available in the system. Patching the header won't help and forcing ad-hoc BIOS updates for dated system is not feasible. Allowing the topology to carry the data is the solution of choice as replacing a userspace file that is part of /lib/firmware/intel/ is less invasive than BIOS update and solves the problem. 
Co-developed-by: Amadeusz Sławiński Signed-off-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Link: https://patch.msgid.link/20251115180627.3589520-2-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/uapi/sound/intel/avs/tokens.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/sound/intel/avs/tokens.h b/include/uapi/sound/intel/avs/tokens.h index f3ff6aae09a9..f7cbbfb00227 100644 --- a/include/uapi/sound/intel/avs/tokens.h +++ b/include/uapi/sound/intel/avs/tokens.h @@ -21,6 +21,7 @@ enum avs_tplg_token { AVS_TKN_MANIFEST_NUM_BINDINGS_U32 = 8, AVS_TKN_MANIFEST_NUM_CONDPATH_TMPLS_U32 = 9, AVS_TKN_MANIFEST_NUM_INIT_CONFIGS_U32 = 10, + AVS_TKN_MANIFEST_NUM_NHLT_CONFIGS_U32 = 11, /* struct avs_tplg_library */ AVS_TKN_LIBRARY_ID_U32 = 101, @@ -160,6 +161,10 @@ enum avs_tplg_token { AVS_TKN_INIT_CONFIG_ID_U32 = 2401, AVS_TKN_INIT_CONFIG_PARAM_U8 = 2402, AVS_TKN_INIT_CONFIG_LENGTH_U32 = 2403, + + /* struct avs_tplg_nhlt_config */ + AVS_TKN_NHLT_CONFIG_ID_U32 = 2501, + AVS_TKN_NHLT_CONFIG_SIZE_U32 = 2502, }; #endif -- cgit v1.2.3 From d5c8b7902a41625ea328b52c78ebe750fbf6fef7 Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Sat, 15 Nov 2025 19:06:27 +0100 Subject: ASoC: Intel: avs: Honor NHLT override when setting up a path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case topology provides NHLT configuration, use it instead of relying on the table in ACPI tree. Only gateway-related modules e.g.: Copier care about the process. For those the order of fetching for hardware configuration becomes: 1) check if NHLT override is set, 2) check if NHLT descriptor override is set, 3) use NHLT from ACPI directly Such approach ensures no conflicts exist between 1) and 2) and that 1) always takes precedence. 
Co-developed-by: Amadeusz Sławiński Signed-off-by: Amadeusz Sławiński Signed-off-by: Cezary Rojewski Link: https://patch.msgid.link/20251115180627.3589520-3-cezary.rojewski@intel.com Signed-off-by: Mark Brown --- include/uapi/sound/intel/avs/tokens.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/sound/intel/avs/tokens.h b/include/uapi/sound/intel/avs/tokens.h index f7cbbfb00227..3ff6d9150822 100644 --- a/include/uapi/sound/intel/avs/tokens.h +++ b/include/uapi/sound/intel/avs/tokens.h @@ -125,6 +125,7 @@ enum avs_tplg_token { AVS_TKN_MOD_KCONTROL_ID_U32 = 1707, AVS_TKN_MOD_INIT_CONFIG_NUM_IDS_U32 = 1708, AVS_TKN_MOD_INIT_CONFIG_ID_U32 = 1709, + AVS_TKN_MOD_NHLT_CONFIG_ID_U32 = 1710, /* struct avs_tplg_path_template */ AVS_TKN_PATH_TMPL_ID_U32 = 1801, -- cgit v1.2.3 From 4d5c668c268b7812ff15452d303974ce247ad378 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 11 Nov 2025 00:17:48 +0000 Subject: ASoC: soc.h: convert to snd_soc_dapm_xxx() This patch converts below functions. 
dapm->dev -> snd_soc_dapm_to_dev() dapm->card -> snd_soc_dapm_to_card() dapm->component -> snd_soc_dapm_to_component() dapm_kcontrol_get_value() -> snd_soc_dapm_kcontrol_get_value() snd_soc_component_enable_pin() -> snd_soc_dapm_enable_pin() snd_soc_component_enable_pin_unlocked() -> snd_soc_dapm_enable_pin_unlocked() snd_soc_component_disable_pin() -> snd_soc_dapm_disable_pin() snd_soc_component_disable_pin_unlocked() -> snd_soc_dapm_disable_pin_unlocked() snd_soc_component_nc_pin() -> snd_soc_dapm_nc_pin() snd_soc_component_nc_pin_unlocked() -> snd_soc_dapm_nc_pin_unlocked() snd_soc_component_get_pin_status() -> snd_soc_dapm_get_pin_status() snd_soc_component_force_enable_pin() -> snd_soc_dapm_force_enable_pin() snd_soc_component_force_enable_pin_unlocked() -> snd_soc_dapm_force_enable_pin_unlocked() snd_soc_component_force_bias_level() -> snd_soc_dapm_force_bias_level() snd_soc_component_get_bias_level() -> snd_soc_dapm_get_bias_level() snd_soc_component_init_bias_level() -> snd_soc_dapm_init_bias_level() snd_soc_component_get_dapm() -> snd_soc_component_to_dapm() snd_soc_dapm_kcontrol_component() -> snd_soc_dapm_kcontrol_to_component() snd_soc_dapm_kcontrol_widget() -> snd_soc_dapm_kcontrol_to_widget() snd_soc_dapm_kcontrol_dapm() -> snd_soc_dapm_kcontrol_to_dapm() snd_soc_dapm_np_pin() -> snd_soc_dapm_disable_pin() Signed-off-by: Kuninori Morimoto Reviewed-by: Charles Keepax Link: https://patch.msgid.link/874ir1a0cz.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/sound/soc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 53b4129ee97a..37dc6f6fc63f 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -1467,22 +1467,22 @@ static inline void _snd_soc_dapm_mutex_assert_held_c(struct snd_soc_card *card) static inline void _snd_soc_dapm_mutex_lock_root_d(struct snd_soc_dapm_context *dapm) { - 
_snd_soc_dapm_mutex_lock_root_c(dapm->card); + _snd_soc_dapm_mutex_lock_root_c(snd_soc_dapm_to_card(dapm)); } static inline void _snd_soc_dapm_mutex_lock_d(struct snd_soc_dapm_context *dapm) { - _snd_soc_dapm_mutex_lock_c(dapm->card); + _snd_soc_dapm_mutex_lock_c(snd_soc_dapm_to_card(dapm)); } static inline void _snd_soc_dapm_mutex_unlock_d(struct snd_soc_dapm_context *dapm) { - _snd_soc_dapm_mutex_unlock_c(dapm->card); + _snd_soc_dapm_mutex_unlock_c(snd_soc_dapm_to_card(dapm)); } static inline void _snd_soc_dapm_mutex_assert_held_d(struct snd_soc_dapm_context *dapm) { - _snd_soc_dapm_mutex_assert_held_c(dapm->card); + _snd_soc_dapm_mutex_assert_held_c(snd_soc_dapm_to_card(dapm)); } #define snd_soc_dapm_mutex_lock_root(x) _Generic((x), \ -- cgit v1.2.3 From 8855eb7d29400fb7b2882da33725db2801c410e4 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 11 Nov 2025 00:17:52 +0000 Subject: ASoC: asoc.h: convert to snd_soc_dapm_xxx() This patch converts below functions. dapm->dev -> snd_soc_dapm_to_dev() dapm->card -> snd_soc_dapm_to_card() dapm->component -> snd_soc_dapm_to_component() dapm_kcontrol_get_value() -> snd_soc_dapm_kcontrol_get_value() snd_soc_component_enable_pin() -> snd_soc_dapm_enable_pin() snd_soc_component_enable_pin_unlocked() -> snd_soc_dapm_enable_pin_unlocked() snd_soc_component_disable_pin() -> snd_soc_dapm_disable_pin() snd_soc_component_disable_pin_unlocked() -> snd_soc_dapm_disable_pin_unlocked() snd_soc_component_nc_pin() -> snd_soc_dapm_nc_pin() snd_soc_component_nc_pin_unlocked() -> snd_soc_dapm_nc_pin_unlocked() snd_soc_component_get_pin_status() -> snd_soc_dapm_get_pin_status() snd_soc_component_force_enable_pin() -> snd_soc_dapm_force_enable_pin() snd_soc_component_force_enable_pin_unlocked() -> snd_soc_dapm_force_enable_pin_unlocked() snd_soc_component_force_bias_level() -> snd_soc_dapm_force_bias_level() snd_soc_component_get_bias_level() -> snd_soc_dapm_get_bias_level() snd_soc_component_init_bias_level() -> 
snd_soc_dapm_init_bias_level() snd_soc_component_get_dapm() -> snd_soc_component_to_dapm() snd_soc_dapm_kcontrol_component() -> snd_soc_dapm_kcontrol_to_component() snd_soc_dapm_kcontrol_widget() -> snd_soc_dapm_kcontrol_to_widget() snd_soc_dapm_kcontrol_dapm() -> snd_soc_dapm_kcontrol_to_dapm() snd_soc_dapm_np_pin() -> snd_soc_dapm_disable_pin() Signed-off-by: Kuninori Morimoto Reviewed-by: Charles Keepax Link: https://patch.msgid.link/87346la0cv.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown --- include/trace/events/asoc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/asoc.h b/include/trace/events/asoc.h index 6696dbcc2b96..4a645549164e 100644 --- a/include/trace/events/asoc.h +++ b/include/trace/events/asoc.h @@ -27,8 +27,8 @@ DECLARE_EVENT_CLASS(snd_soc_dapm, TP_ARGS(dapm, val), TP_STRUCT__entry( - __string( card_name, dapm->card->name) - __string( comp_name, dapm->component ? dapm->component->name : "(none)") + __string( card_name, snd_soc_dapm_to_card(dapm)->name) + __string( comp_name, snd_soc_dapm_to_component(dapm) ? snd_soc_dapm_to_component(dapm)->name : "(none)") __field( int, val) ), -- cgit v1.2.3 From 37d17925480404f1293f24d027fbf3c9975603d7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 29 Sep 2025 11:46:43 +0100 Subject: mm/thp: drop follow_devmap_pmd() default stub follow_devmap_pmd() has already been dropped by the commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). The fallback stub in the header which is now redundant, can be dropped off as well. 
Link: https://lkml.kernel.org/r/20250929104643.1100421-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Dev Jain Reviewed-by: Alistair Popple Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 71ac78b9f834..fee4cf7fa300 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -682,12 +682,6 @@ static inline void mm_put_huge_zero_folio(struct mm_struct *mm) return; } -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) -{ - return NULL; -} - static inline bool thp_migration_supported(void) { return false; -- cgit v1.2.3 From 9c47753167a6a585d0305663c6912f042e131c2d Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:29 +0200 Subject: mm/vmalloc: defer freeing partly initialized vm_struct __vmalloc_area_node() may call free_vmap_area() or vfree() on error paths, both of which can sleep. This becomes problematic if the function is invoked from an atomic context, such as when GFP_ATOMIC or GFP_NOWAIT is passed via gfp_mask. To fix this, unify error paths and defer the cleanup of partly initialized vm_struct objects to a workqueue. This ensures that freeing happens in a process context and avoids invalid sleeps in atomic regions. 
Link: https://lkml.kernel.org/r/20251007122035.56347-5-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index eb54b7b3202f..1e43181369f1 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -50,7 +50,11 @@ struct iov_iter; /* in uio.h */ #endif struct vm_struct { - struct vm_struct *next; + union { + struct vm_struct *next; /* Early registration of vm_areas. */ + struct llist_node llnode; /* Asynchronous freeing on error paths. */ + }; + void *addr; unsigned long size; unsigned long flags; -- cgit v1.2.3 From 8da89ba18ed4e9000d9b9b5b1f699e5004f4abf6 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:30 +0200 Subject: mm/vmalloc: handle non-blocking GFP in __vmalloc_area_node() Make __vmalloc_area_node() respect non-blocking GFP masks such as GFP_ATOMIC and GFP_NOWAIT. - Add memalloc_apply_gfp_scope()/memalloc_restore_scope() helpers to apply a proper scope. - Apply memalloc_apply_gfp_scope()/memalloc_restore_scope() around vmap_pages_range() for page table setup. - Set "nofail" to false if a non-blocking mask is used, as they are mutually exclusive. This is particularly important for page table allocations that internally use GFP_PGTABLE_KERNEL, which may sleep unless such scope restrictions are applied. For example: __pte_alloc_kernel() pte_alloc_one_kernel(&init_mm); pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0); Note: in most cases, PTE entries are established only up to the level required by current vmap space usage, meaning the page tables are typically fully populated during the mapping process. 
Link: https://lkml.kernel.org/r/20251007122035.56347-6-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Acked-by: Michal Hocko Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 1e43181369f1..e8e94f90d686 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -332,4 +332,6 @@ bool vmalloc_dump_obj(void *object); static inline bool vmalloc_dump_obj(void *object) { return false; } #endif +unsigned int memalloc_apply_gfp_scope(gfp_t gfp_mask); +void memalloc_restore_scope(unsigned int flags); #endif /* _LINUX_VMALLOC_H */ -- cgit v1.2.3 From b186a94227b753f2fdcab0df29dfc636c63ac329 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:32 +0200 Subject: kmsan: remove hard-coded GFP_KERNEL flags kmsan_vmap_pages_range_noflush() allocates its temp s_pages/o_pages arrays with GFP_KERNEL, which may sleep. This is inconsistent with vmalloc() as it will support non-blocking requests later. Plumb gfp_mask through the kmsan_vmap_pages_range_noflush(), so it can use it internally for its demand. Please note, the subsequent __vmap_pages_range_noflush() still uses GFP_KERNEL and can sleep. If a caller runs under reclaim constraints, sleeping is forbidden, it must establish the appropriate memalloc scope API. 
Link: https://lkml.kernel.org/r/20251007122035.56347-8-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Alexander Potapenko Cc: Marco Elver Cc: Andrey Ryabinin Cc: Baoquan He Cc: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index f2fd221107bb..7da9fd506b39 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -133,6 +133,7 @@ void kmsan_kfree_large(const void *ptr); * @prot: page protection flags used for vmap. * @pages: array of pages. * @page_shift: page_shift passed to vmap_range_noflush(). + * @gfp_mask: gfp_mask to use internally. * * KMSAN maps shadow and origin pages of @pages into contiguous ranges in * vmalloc metadata address range. Returns 0 on success, callers must check @@ -142,7 +143,8 @@ int __must_check kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, struct page **pages, - unsigned int page_shift); + unsigned int page_shift, + gfp_t gfp_mask); /** * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. @@ -347,7 +349,7 @@ static inline void kmsan_kfree_large(const void *ptr) static inline int __must_check kmsan_vmap_pages_range_noflush( unsigned long start, unsigned long end, pgprot_t prot, - struct page **pages, unsigned int page_shift) + struct page **pages, unsigned int page_shift, gfp_t gfp_mask) { return 0; } -- cgit v1.2.3 From 7241bb2ea33d5ff50b77a5981342bcc826bef52a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Tue, 7 Oct 2025 14:20:33 +0200 Subject: mm: skip might_alloc() warnings when PF_MEMALLOC is set might_alloc() catches invalid blocking allocations in contexts where sleeping is not allowed. However when PF_MEMALLOC is set, the page allocator already skips reclaim and other blocking paths. 
In such cases, a blocking gfp_mask does not actually lead to blocking, so triggering might_alloc() splats is misleading. Adjust might_alloc() to skip warnings when the current task has PF_MEMALLOC set, matching the allocator's actual blocking behaviour. Link: https://lkml.kernel.org/r/20251007122035.56347-9-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Reviewed-by: Baoquan He Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Marco Elver Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 0232d983b715..a74582aed747 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -318,6 +318,9 @@ static inline void might_alloc(gfp_t gfp_mask) fs_reclaim_acquire(gfp_mask); fs_reclaim_release(gfp_mask); + if (current->flags & PF_MEMALLOC) + return; + might_sleep_if(gfpflags_allow_blocking(gfp_mask)); } -- cgit v1.2.3 From 590c03ca6a3fbb114396673314e2aa483839608b Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 7 Oct 2025 18:28:21 +0800 Subject: mm/ksm: fix exec/fork inheritance support for prctl Patch series "ksm: fix exec/fork inheritance", v2. This series fixes exec/fork inheritance. See the detailed description of the issue below. This patch (of 2): Background ========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") introduced MMF_VM_MERGE_ANY for mm->flags, and allowed user to set it by prctl() so that the process's VMAs are forcibly scanned by ksmd. Subsequently, the 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") supported inheriting the MMF_VM_MERGE_ANY flag when a task calls execve(). Finally, commit 3a9e567ca45fb ("mm/ksm: fix ksm exec support for prctl") fixed the issue that ksmd doesn't scan the mm_struct with MMF_VM_MERGE_ANY by adding the mm_slot to ksm_mm_head in __bprm_mm_init(). 
Problem ======= In some extreme scenarios, however, this inheritance of MMF_VM_MERGE_ANY during exec/fork can fail. For example, when the scanning frequency of ksmd is tuned extremely high, a process carrying MMF_VM_MERGE_ANY may still fail to pass it to the newly exec'd process. This happens because ksm_execve() is executed too early in the do_execve flow (prematurely adding the new mm_struct to the ksm_mm_slot list). As a result, before do_execve completes, ksmd may have already performed a scan and found that this new mm_struct has no VM_MERGEABLE VMAs, thus clearing its MMF_VM_MERGE_ANY flag. Consequently, when the new program executes, the inheritance of the MMF_VM_MERGE_ANY flag is missed. Root reason =========== commit d7597f59d1d33 ("mm: add new api to enable ksm per process") clears the flag MMF_VM_MERGE_ANY when ksmd finds no VM_MERGEABLE VMAs. Solution ======== Firstly, don't clear MMF_VM_MERGE_ANY when ksmd finds no VM_MERGEABLE VMAs, because perhaps the mm_struct has just been added to the ksm_mm_slot list, and its process has not yet officially started running or has not yet performed mmap/brk to allocate anonymous VMAs. Secondly, recheck MMF_VM_MERGEABLE if a process has MMF_VM_MERGE_ANY set, and create an mm_slot and add it to the ksm_scan_list again. 
Link: https://lkml.kernel.org/r/20251007182504440BJgK8VXRHh8TD7IGSUIY4@zte.com.cn Link: https://lkml.kernel.org/r/20251007182821572h_SoFqYZXEP1mvWI4n9VL@zte.com.cn Fixes: 3c6f33b7273a ("mm/ksm: support fork/exec for prctl") Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Signed-off-by: xu xin Cc: Stefan Roesch Cc: David Hildenbrand Cc: Jinjiang Tu Cc: Wang Yaxin Cc: Yang Yang Cc: Signed-off-by: Andrew Morton --- include/linux/ksm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 067538fc4d58..c982694c987b 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -17,7 +17,7 @@ #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, vm_flags_t *vm_flags); -vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, +vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); @@ -103,7 +103,7 @@ bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ -static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, +static inline vm_flags_t ksm_vma_flags(struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { return vm_flags; -- cgit v1.2.3 From 9ac09bb9feaccc2f45e5606dc48a3f748d478dc4 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 3 Oct 2025 16:53:04 +0100 Subject: mm: consistently use current->mm in mm_get_unmapped_area() mm_get_unmapped_area() is a wrapper around arch_get_unmapped_area() / arch_get_unmapped_area_topdown(), both of which search current->mm for some free space. Neither take an mm_struct - they implicitly operate on current->mm. But the wrapper takes an mm_struct and uses it to decide whether to search bottom up or top down. 
All callers pass in current->mm for this, so everything is working consistently. But it feels like an accident waiting to happen; eventually someone will call that function with a different mm, expecting to find free space in it, but what gets returned is free space in the current mm. So let's simplify by removing the parameter and have the wrapper use current->mm to decide which end to start at. Now everything is consistent and self-documenting. Link: https://lkml.kernel.org/r/20251003155306.2147572-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Dev Jain Reviewed-by: Anshuman Khandual Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/sched/mm.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index a74582aed747..0e1d73955fa5 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -189,12 +189,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t); -unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags); +unsigned long mm_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); -unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, - struct file *filp, +unsigned long mm_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, -- cgit v1.2.3 From ada5cbe33a5321f8c896a3362c3aafa0bf262110 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Thu, 9 Oct 2025 20:54:03 +0500 Subject: kasan: cleanup of kasan_enabled() 
checks Deduplication of kasan_enabled() checks which are already used by callers. * Altered functions: check_page_allocation Delete the check because callers have it already in __wrappers in include/linux/kasan.h: __kasan_kfree_large __kasan_mempool_poison_pages __kasan_mempool_poison_object kasan_populate_vmalloc, kasan_release_vmalloc Add __wrappers in include/linux/kasan.h. They are called externally in mm/vmalloc.c. __kasan_unpoison_vmalloc, __kasan_poison_vmalloc Delete checks because there're already kasan_enabled() checks in respective __wrappers in include/linux/kasan.h. release_free_meta -- Delete the check because the higher caller path has it already. See the stack trace: __kasan_slab_free -- has the check already __kasan_mempool_poison_object -- has the check already poison_slab_object kasan_save_free_info release_free_meta kasan_enabled() -- Delete here Link: https://lkml.kernel.org/r/20251009155403.1379150-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Baoquan He Cc: Christophe Leroy Cc: Dmitriy Vyukov Cc: "Ritesh Harjani (IBM)" Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- include/linux/kasan.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d12e1a5f5a9a..f335c1d7b61d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -571,11 +571,27 @@ static inline void kasan_init_hw_tags(void) { } #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); -int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); -void kasan_release_vmalloc(unsigned long start, unsigned long end, +int __kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mask); +static inline int kasan_populate_vmalloc(unsigned long addr, + 
unsigned long size, gfp_t gfp_mask) +{ + if (kasan_enabled()) + return __kasan_populate_vmalloc(addr, size, gfp_mask); + return 0; +} +void __kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end, unsigned long flags); +static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end, + unsigned long flags) +{ + if (kasan_enabled()) + return __kasan_release_vmalloc(start, end, free_region_start, + free_region_end, flags); +} #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -- cgit v1.2.3 From eb8762dc220c0b0573100a941bfc68df34ece74f Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:16 +0530 Subject: drivers/base/node: fold register_node() into register_one_node() Patch series "drivers/base/node: fold node register and unregister functions", v2. The first patch merges register_one_node() and register_node(), leaving a single register_node() function. The second patch merges unregister_one_node() and unregister_node(), leaving a single unregister_node() function. There are no functional changes in these patches. This patch (of 2): register_node() is only called from register_one_node(). This patch folds register_node() into its only caller and renames register_one_node() to register_node(). This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. 
[akpm@linux-foundation.org: fix kerneldoc, per David] Link: https://lkml.kernel.org/r/cover.1760097207.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/910853c9dd61f7a2190a56cba101e73e9c6859be.1760097207.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- include/linux/node.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/node.h b/include/linux/node.h index 866e3323f1fd..b7028d3ec3b4 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -176,7 +176,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) #ifdef CONFIG_NUMA extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ -extern int register_one_node(int nid); +int register_node(int nid); extern void unregister_one_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); @@ -189,7 +189,7 @@ extern int register_memory_node_under_compute_node(unsigned int mem_nid, static inline void node_dev_init(void) { } -static inline int register_one_node(int nid) +static inline int register_node(int nid) { return 0; } -- cgit v1.2.3 From d945667dcb1996ddf00ffa8408b579e4ce573652 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 14 Oct 2025 21:09:17 +0530 Subject: drivers/base/node: fold unregister_node() into unregister_one_node() unregister_node() is only called from unregister_one_node(). This patch folds unregister_node() into its only caller and renames unregister_one_node() to unregister_node(). 
This reduces unnecessary indirection and simplifies the code structure. No functional changes are introduced. [donettom@linux.ibm.com: remove extra spaces before @nid and "All"] Link: https://lkml.kernel.org/r/cff01514-9074-4c97-bcf1-d4e3594e48b0@linux.ibm.com Link: https://lkml.kernel.org/r/32b7d5d8f0f30d313c3e1d8798f591459c8746f9.1760097208.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Cc: Aboorva Devarajan Cc: Christophe Leroy Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Oscar Salvador Cc: Peter Zijlstra Cc: "Ritesh Harjani (IBM)" Signed-off-by: Andrew Morton --- include/linux/node.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/node.h b/include/linux/node.h index b7028d3ec3b4..0269b064ba65 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -132,8 +132,6 @@ static inline void register_memory_blocks_under_nodes(void) } #endif -extern void unregister_node(struct node *node); - struct node_notify { int nid; }; @@ -177,7 +175,7 @@ static inline int hotplug_node_notifier(notifier_fn_t fn, int pri) extern void node_dev_init(void); /* Core of the node registration - only memory hotplug should use this */ int register_node(int nid); -extern void unregister_one_node(int nid); +void unregister_node(int nid); extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); @@ -193,7 +191,7 @@ static inline int register_node(int nid) { return 0; } -static inline int unregister_one_node(int nid) +static inline int unregister_node(int nid) { return 0; } -- cgit v1.2.3 From 0acc67c4030c39f39ac90413cc5d0abddd3a9527 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 14 Oct 2025 07:50:08 -0700 
Subject: mm/page_alloc/vmstat: simplify refresh_cpu_vm_stats change detection Patch series "mm/page_alloc: Batch callers of free_pcppages_bulk", v5. Motivation & Approach ===================== While testing workloads with high sustained memory pressure on large machines in the Meta fleet (1Tb memory, 316 CPUs), we saw an unexpectedly high number of softlockups. Further investigation showed that the zone lock in free_pcppages_bulk was being held for a long time, and was called to free 2k+ pages over 100 times just during boot. This causes starvation in other processes for the zone lock, which can lead to the system stalling as multiple threads cannot make progress without the locks. We can see these issues manifesting as warnings: [ 4512.591979] rcu: INFO: rcu_sched self-detected stall on CPU [ 4512.604370] rcu: 20-....: (9312 ticks this GP) idle=a654/1/0x4000000000000000 softirq=309340/309344 fqs=5426 [ 4512.626401] rcu: hardirqs softirqs csw/system [ 4512.638793] rcu: number: 0 145 0 [ 4512.651177] rcu: cputime: 30 10410 174 ==> 10558(ms) [ 4512.666657] rcu: (t=21077 jiffies g=783665 q=1242213 ncpus=316) While these warnings don't indicate a crash or a kernel panic, they do point to the underlying issue of lock contention. To prevent starvation in both locks, batch the freeing of pages using pcp->batch. Because free_pcppages_bulk is called with the pcp lock and acquires the zone lock, relinquishing and reacquiring the locks are only effective when both of them are broken together (unless the system was built with queued spinlocks). Thus, instead of modifying free_pcppages_bulk to break both locks, batch the freeing from its callers instead. A similar fix has been implemented in the Meta fleet, and we have seen significantly less softlockups. Testing ======= The following are a few synthetic benchmarks, made on three machines. The first is a large machine with 754GiB memory and 316 processors. 
The second is a relatively smaller machine with 251GiB memory and 176 processors. The third and final is the smallest of the three, which has 62GiB memory and 36 processors. On all machines, I kick off a kernel build with -j$(nproc). Negative delta is better (faster compilation). Large machine (754GiB memory, 316 processors) make -j$(nproc) +------------+---------------+-----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+-----------+ | real | 0.8070 | - 1.4865 | | user | 0.2823 | + 0.4081 | | sys | 5.0267 | -11.8737 | +------------+---------------+-----------+ Medium machine (251GiB memory, 176 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.2806 | +0.0351 | | user | 0.0994 | +0.3170 | | sys | 0.6229 | -0.6277 | +------------+---------------+----------+ Small machine (62GiB memory, 36 processors) make -j$(nproc) +------------+---------------+----------+ | Metric (s) | Variation (%) | Delta(%) | +------------+---------------+----------+ | real | 0.1503 | -2.6585 | | user | 0.0431 | -2.2984 | | sys | 0.1870 | -3.2013 | +------------+---------------+----------+ Here, variation is the coefficient of variation, i.e. standard deviation / mean. Based on these results, it seems like there are varying degrees to how much lock contention this reduces. For the largest and smallest machines that I ran the tests on, it seems like there is quite some significant reduction. There is also some performance increases visible from userspace. Interestingly, the performance gains don't scale with the size of the machine, but rather there seems to be a dip in the gain there is for the medium-sized machine. One possible theory is that because the high watermark depends on both memory and the number of local CPUs, what impacts zone contention the most is not these individual values, but rather the ratio of mem:processors. 
This patch (of 5): Currently, refresh_cpu_vm_stats returns an int, indicating how many changes were made during its updates. Using this information, callers like vmstat_update can heuristically determine if more work will be done in the future. However, all of refresh_cpu_vm_stats's callers either (a) ignore the result, only caring about performing the updates, or (b) only care about whether changes were made, but not *how many* changes were made. Simplify the code by returning a bool instead to indicate if updates were made. In addition, simplify fold_diff and decay_pcp_high to return a bool for the same reason. Link: https://lkml.kernel.org/r/20251014145011.3427205-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20251014145011.3427205-2-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: Vlastimil Babka Reviewed-by: SeongJae Park Cc: Brendan Jackman Cc: Chris Mason Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 623bee335383..b155929af5b1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -387,7 +387,7 @@ extern void free_pages(unsigned long addr, unsigned int order); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); -int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); +bool decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); -- cgit v1.2.3 From d929525c2e30abee621bf71f143ba6104c81ff2b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 16 Oct 2025 09:10:35 -0700 Subject: memcg: net: track network throttling due to memcg memory pressure The kernel can throttle network sockets if the memory 
cgroup associated with the corresponding socket is under memory pressure. The throttling actions include clamping the transmit window, failing to expand receive or send buffers, aggressively prune out-of-order receive queue, FIN deferred to a retransmitted packet and more. Let's add memcg metric to track such throttling actions. At the moment memcg memory pressure is defined through vmpressure and in future it may be defined using PSI or we may add more flexible way for the users to define memory pressure, maybe through ebpf. However the potential throttling actions will remain the same, so this newly introduced metric will continue to track throttling actions irrespective of how memcg memory pressure is defined. Link: https://lkml.kernel.org/r/20251016161035.86161-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Kuniyuki Iwashima Reviewed-by: Daniel Sedlak Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Neal Cardwell Cc: Paolo Abeni Cc: Simon Horman Cc: Tejun Heo Cc: Willem de Bruijn Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 1 + include/net/sock.h | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 873e510d6f8d..5fe254813123 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -52,6 +52,7 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_SOCK_THROTTLED, MEMCG_NR_MEMORY_EVENTS, }; diff --git a/include/net/sock.h b/include/net/sock.h index 60bcb13f045c..ff7d49af1619 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2635,8 +2635,12 @@ static inline bool mem_cgroup_sk_under_memory_pressure(const struct sock *sk) #endif /* CONFIG_MEMCG_V1 */ do { - if (time_before64(get_jiffies_64(), mem_cgroup_get_socket_pressure(memcg))) + if (time_before64(get_jiffies_64(), + 
mem_cgroup_get_socket_pressure(memcg))) { + memcg_memory_event(mem_cgroup_from_sk(sk), + MEMCG_SOCK_THROTTLED); return true; + } } while ((memcg = parent_mem_cgroup(memcg))); return false; -- cgit v1.2.3 From 2f05435df9320e70f7a98149eb4b043ff361a120 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 17 Oct 2025 15:53:07 +0800 Subject: mm: vmscan: simplify the logic for activating dirty file folios After commit 6b0dfabb3555 ("fs: Remove aops->writepage"), we no longer attempt to write back filesystem folios through reclaim. However, in the shrink_folio_list() function, there still remains some logic related to writeback control of dirty file folios. The original logic was that, for direct reclaim, or when folio_test_reclaim() is false, or the PGDAT_DIRTY flag is not set, the dirty file folios would be directly activated to avoid being scanned again; otherwise, it will try to writeback the dirty file folios. However, since we can no longer perform writeback on dirty folios, the dirty file folios will still be activated. Additionally, under the original logic, if we continue to try writeback dirty file folios, we will also check the references flag, sc->may_writepage, and may_enter_fs(), which may result in dirty file folios being left in the inactive list. This is unreasonable. Even if these dirty folios are scanned again, we still cannot clean them. Therefore, the checks on these dirty file folios appear to be redundant and can be removed. Dirty file folios should be directly moved to the active list to avoid being scanned again. Since we set the PG_reclaim flag for the dirty folios, once the writeback is completed, they will be moved back to the tail of the inactive list to be retried for quick reclaim. 
Link: https://lkml.kernel.org/r/ba5c49955fd93c6850bcc19abf0e02e1573768aa.1760687075.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 7fb7331c5725..4398e027f450 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1060,10 +1060,6 @@ struct zone { } ____cacheline_internodealigned_in_smp; enum pgdat_flags { - PGDAT_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail - * of the LRU. - */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ -- cgit v1.2.3 From d3946c5f4c1c5db63532eb433a55c7d881de1389 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:53 -0700 Subject: mm/damon: document damos_quota_goal->nid use case Patch series "mm/damon: allow DAMOS auto-tuned for per-memcg per-node memory usage". Introduce two new DAMOS quota auto-tuning target metrics for per-cgroup per-NUMA node memory utilization. Expected use cases are cgroup level access-aware NUMA memory managements, such as memory tiering or proactive reclamation on cgroup-based multi-tenant NUMA systems. Background ========== The aim-oriented aggressiveness auto-tuning feature of DAMOS is a highly recommended way for modern DAMOS use cases. Using it, users can specify what system status they want to achieve with what access-aware system operations. For example, reclaim cold memory aiming for 0.5 percent of memory pressure (proactive reclaim), or migrate hot and cold memory between NUMA nodes having different speed (memory tiering). 
Then DAMOS automatically adjusts the aggressiveness of the system operation (e.g., increase/decrease reclaim target coldness threshold) based on the current status of the system. The use case is limited by the supported system status metrics for specifying the target system status. Two new system metrics for per-node memory usage ratio, namely DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, were recently added to extend the use cases for access-aware NUMA nodes management, such as memory tiering. Those are expected to be useful for not only memory tiering but also general access-aware inter-NUMA node page migration, though. Limitation ---------- The per-node memory usage based auto-tuning can be applied only system-wide. For cgroups-based multi-tenant systems, it could arguably harm the fairness. For example, a cgroup may use faster NUMA node memory more than another cgroup, depending on their access patterns. If the users of each cgroup are promised to get the same quality and amount of the system resource, this can arguably be an unfair situation. DAMOS supports cgroup level system operations via DAMOS filter. But the quota auto-tuning system is not aware of cgroups. New DAMOS Quota Tuning Metrics for Per-Cgroup Per-NUMA Memory Usage =================================================================== To overcome the limitation, introduce two new DAMOS quota auto-tuning goal metrics, namely DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP. Those can be thought of as variants of DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP that are extended for cgroups. The two metrics specify the per-cgroup, per-node amount of used and unused memory as a ratio to the total memory of the node. For example, let's assume a system has two NUMA nodes of size 100 GiB and 50 GiB. And two cgroups are using 40 GiB and 60 GiB of node 0, 20 GiB and 10 GiB of node 1, respectively, as illustrated by the below table. 
node-0 node-1 Total memory 100 GiB 50 GiB Cgroup A usage 40 GiB 20 GiB Cgroup B usage 60 GiB 10 GiB Then, DAMOS_QUOTA_NODE_MEMCG_USED_BP for the cgroups for the first node are, 40 GiB / 100 GiB = 4,000 bp (40 percent) and 60 GiB / 100 GiB = 6,000 bp (60 percent), respectively. Those for the second node are, 20 GiB / 50 GiB = 4000 bp (40 percent) and 10 GiB / 50 GiB = 2000 bp (20 percent), respectively. DAMOS_QUOTA_NODE_MEMCG_FREE_BP for the four cases are, 60 GiB /100 GiB = 6000 bp, 40 GiB / 100 GiB = 4000 bp, 30 GiB / 50 GiB = 6000 bp, and 40 GiB / 50 GiB = 8000 bp, respectively. DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup A node-1: 4000 bp DAMOS_QUOTA_NODE_MEMCG_USED_BP for cgroup B node-1: 2000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-0: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-0: 4000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup A node-1: 6000 bp DAMOS_QUOTA_NODE_MEMCG_FREE_BP for cgroup B node-1: 8000 bp Using these, users can specify how much [un]used amount of memory for per-cgroup and per-node DAMOS should make as a result of the auto-tuning. Example Usecase: Cgroup Level Memory Tiering ============================================ Let's suppose a typical and simple tiered memory system. The system equips two NUMA nodes. The first node (node 0) is CPU-attached and fast. The second node (node 1) is CPU-unattached and slow. It runs two cgroups that desire to use about 30 percent and 70 percent of the faster node as much as possible for their hot data, respectively. Then, the user can implement DAMOS-based memory tiering for the system using the DAMON user-space tool (damo), like below. 
# ./damo start \ `# kdamond for node 1 (slow)` \ --numa_node 1 --monitoring_intervals_goal 4% 3 5ms 10s \ `# promotion scheme for cgroup a` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 29.7% 0 /workloads/a \ \ `# promotion scheme for cgroup b` \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/b \ --damos_filter allow young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_used_bp 69.7% 0 /workloads/b \ \ `# kdamond for node 0 (fast)` \ --numa_node 0 --monitoring_intervals_goal 4% 3 5ms 10s \ `# demotion scheme for cgroup a` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/a \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 70.5% 0 /workloads/a \ \ `# demotion scheme for cgroup b` \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workloads/b \ --damos_filter reject young \ --damos_quota_interval 1s --damos_quota_space 200MB \ --damos_quota_goal node_memcg_free_bp 30.5% 0 /workloads/b \ \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 With the command, the user-space tool will ask DAMON to spawn two kernel threads, each for monitoring accesses to node 1 (slow) and node 0 (fast), respectively. It installs two DAMOS schemes on each thread. Let's call them "promotion scheme for cgroup a/b", and "demotion scheme for cgroup a/b" in the order. The promotion schemes are installed on the DAMON thread for node 1 (slow), and demotion schemes are installed on the DAMON thread for node 0 (fast). 
Cgroup Level Hot Pages Migration (Promotion) -------------------------------------------- Promotion schemes will find memory regions on node 1 (slow) on which some access was detected. The schemes will then migrate the found memory to node 0 (fast), hottest pages first. For accurate and effective migration, these schemes use two page level filters. First, the migration will be filtered for only cgroup A and cgroup B. That is, "promotion scheme for cgroup B" will not do the migration if the page is for cgroup A. Secondly, the schemes will ignore pages that have their page table's Accessed bits unset. The per-page Accessed bit check logic will also unset the bit if it was set, for the next check. For controlled amounts of system resource consumption and aiming at the target memory usage, the schemes use a quota setup. The migration is limited to be done only up to 200 MiB per second, to limit the peak system resource usage. And the DAMOS_QUOTA_NODE_MEMCG_USED_BP target is set for 29.7% and 69.7% of node 0 (fast), respectively. The target value is lower than the high level goal (30% and 70% system memory), to give headroom on node 0 (fast). DAMOS will adjust the speed of the pages migration based on the target and current per-cgroup node 0 memory usage. For example, if cgroup A is utilizing only 10% of node 0, DAMOS will try to migrate more of cgroup A hot pages from node 1 to node 0, up to 200 MiB per second. If cgroup A utilizes more than 29.7% of node 0 memory, the cgroup A hot pages migration from node 1 to node 0 will be slowed and eventually stopped. Cgroup Level Cold Pages Migration (Demotion) -------------------------------------------- Demotion schemes are similar to promotion schemes, but differ in filtering setup and quota tuning setup. Those filter out pages having their page table Accessed bits set. They also set 70.5% and 30.5% as the node 0 memory free rate targets for cgroup A and B, respectively. 
Hence, if promotion schemes or something made cgroup A and/or B use more than 29.5% and 69.5% of node 0, demotion schemes will start migrating cold pages of appropriate cgroups in node 0 to node 1, under the 200 MiB per second speed cap, while adjusting the speed based on how much more than wanted memory is being used. The quota target values are set to overlap with promotion targets, to keep a minimum level of page exchanges between the nodes. This is to avoid a case that the target memory utilization is met, and then the access pattern changes (pages in node 1 become hotter than pages in node 0) while the memory utilization is unchanged. Without the overlap, neither promotion of hotter pages in node 1, nor demotion of colder pages in node 0 will happen since both goals are met. As a result, the faster and slower node will unexpectedly serve cold and hot data, respectively. Test: Per-cgroup Memory Tiering =============================== I ran a simplified cgroup level memory tiering using the feature, and confirmed it works as intended. Setup ----- I configured a QEMU virtual machine representing a simplified version of the system described in the above cgroup level memory tiering example use case. The system has 40 CPU cores and two NUMA nodes each having 30 GiB physical memory. The first node (node 0) represents the faster NUMA node, and the second node (node 1) represents the slower NUMA node. Specifically, the below qemu command line options are used. [...] -object memory-backend-ram,size=30G,id=m0 \ -object memory-backend-ram,size=30G,id=m1 \ -numa node,cpus=0-39,memdev=m0 \ -numa node,memdev=m1 \ [...] I booted the virtual machine with a kernel to which this patch series is applied. On the virtual machine, I created two cgroups, namely workload_a and workload_b. And ran a test program in each cgroup, resulting in one process per cgroup. The test program allocates 10 GiB memory and evenly splits it into 10 regions. 
After the allocation, it repeatedly accesses the first region for one minute, then the second one for one minute, and so on. After the one minute repeated access for the 10-th region is done, it repeats the access from the first region. So the process has 10 GiB of data in total, but only 1 GiB of it is hot at a given moment, and the hot data is gradually changed. While the processes are running, run DAMON for a simple access-aware memory tiering using the below script. It migrates hot and cold data of the cgroups into node 0 and node 1, aiming for the first and the second cgroups (workload_a and workload_b, respectively) to utilize about 9.7 percent and 19.7 percent of node 0, respectively. Note that this setup is a simplified version of the above example use case, for ease of test. Also note that we assigned 30 GiB physical memory to node 0, but DAMON in this setup works for only 27 GiB of the memory. It is due to an internal implementation detail of DAMON user-space tool that is not really important for this test. 
#!/bin/bash damo start \ --numa_node 1 \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 9.7% 0 /workload_a \ --damos_action migrate_hot 0 --damos_access_rate 5% max \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter allow young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_used_bp 19.7% 0 /workload_b \ --numa_node 0 \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_a \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 90.5% 0 /workload_a \ --damos_action migrate_cold 1 --damos_access_rate 0% 0% \ --damos_apply_interval 1s \ --damos_filter allow memcg /workload_b \ --damos_filter reject young \ --damos_quota_interval 1s \ --damos_quota_goal node_memcg_free_bp 80.5% 0 /workload_b \ --damos_nr_quota_goals 1 1 1 1 --damos_nr_filters 2 2 2 2 \ --nr_targets 1 1 --nr_schemes 2 2 --nr_ctxs 1 1 After starting DAMON, the pages are continuously migrated across nodes. A few minutes later, the memory usage of the cgroups converges into the aimed amounts, and keeps the level, as expected. To confirm the status is kept in the target level as expected, I collected the memory usage stat of the cgroups using memory.numa_stat file, after the stats are converged. I repeated the stat collection 42 times with 5 seconds delay between each of the collections. The results are as below: node0_memory_usage average stdev workload_a 2.79GiB 522.06MiB workload_b 5.15GiB 739.10MiB The average values are quite close to the targeted values: 27 GiB * 9.7% = 2.619 GiB for workload_a, and 27 GiB * 19.7% = 5.319 GiB for workload_b. A level of variance is expected, given the overlap of the promotion/demotion targets, and dynamic data access pattern of the workloads. 
Given that, the measured variances are at a reasonable level. Patches Sequence ================ The first patch (patch 1) updates the kernel-doc comment of damos_quota_goal struct to clarify usage of optional fields of the struct, since later patches will add such optional fields. Following four patches (patches 2-5) implement a new DAMOS quota goal metric for per-cgroup per-node memory usage. Those extend the core layer interface for the new metric (patch 2), implement the metric value calculation on the core layer (patch 3), add DAMON sysfs interface file for the target cgroup specification (patch 4), and implement support of the new metric on DAMON sysfs interface (patch 5). Next two patches implement the second new DAMOS quota goal metric for per-cgroup per-node free (or, unused) memory. Those implement it in the core layer (patch 6) and DAMON sysfs interface (patch 7), extending the existing implementation for memory usage metric. Final three patches update the design (patch 8), the usage (patch 9), and the ABI (patch 10) documents for the changes that are introduced by this patch series. This patch (of 10): damos_quota_goal kerneldoc comment is not explaining when @nid is used. Update the comment for that. 
Link: https://lkml.kernel.org/r/20251017212706.183502-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251017212706.183502-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index cae8c613c5fc..dc9c310e0e75 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -176,6 +176,9 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_USER_INPUT, @current_value should be manually * entered by the user, probably inside the kdamond callbacks. Otherwise, * DAMON sets @current_value with self-measured value of @metric. + * + * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node + * id of the target node to account the used/free memory. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; -- cgit v1.2.3 From 6a18bbe48361acad1eae8d86aa47d353b1cfe619 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:54 -0700 Subject: mm/damon: add DAMOS quota goal type for per-memcg per-node memory usage Define a new DAMOS quota auto-tuning target metric for per-cgroup per-node memory usage. For specifying the cgroup of the interest, add a field, namely memcg_id, to damos_quota_goal struct. Note that this commit is only implementing the interface. The handling of the interface (the metric value calculation) will be implemented in the following commit. 
Link: https://lkml.kernel.org/r/20251017212706.183502-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index dc9c310e0e75..0d63ceb7e6ef 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -147,6 +147,7 @@ enum damos_action { * @DAMOS_QUOTA_SOME_MEM_PSI_US: System level some memory PSI in us. * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. + * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. @@ -156,6 +157,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_SOME_MEM_PSI_US, DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, + DAMOS_QUOTA_NODE_MEMCG_USED_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -166,6 +168,7 @@ enum damos_quota_goal_metric { * @current_value: Current value of @metric. * @last_psi_total: Last measured total PSI * @nid: Node id. + * @memcg_id: Memcg id. * @list: List head for siblings. * * Data structure for getting the current score of the quota tuning goal. The @@ -179,6 +182,9 @@ enum damos_quota_goal_metric { * * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. + * + * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents + * the node id and the cgroup to account the used memory for. 
*/ struct damos_quota_goal { enum damos_quota_goal_metric metric; @@ -187,7 +193,10 @@ struct damos_quota_goal { /* metric-dependent fields */ union { u64 last_psi_total; - int nid; + struct { + int nid; + unsigned short memcg_id; + }; }; struct list_head list; }; -- cgit v1.2.3 From 98fdce76fb7ed7070df21afbee46a4b36cb6a7c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 17 Oct 2025 14:26:58 -0700 Subject: mm/damon/core: add DAMOS quota gaol metric for per-memcg per-numa free memory Add a variant of DAMOS_QUOTA_NODE_MEMCG_USED_BP, for the free memory portion. The value of the metric is implemented as the entire memory of the given NUMA node subtracted by the given cgroup's usage. So from a perspective, "unused" could be a better term than "free". But arguably it is not very clear what is better, so use the term "free". Link: https://lkml.kernel.org/r/20251017212706.183502-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0d63ceb7e6ef..0edf41d36ea1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -148,6 +148,7 @@ enum damos_action { * @DAMOS_QUOTA_NODE_MEM_USED_BP: MemUsed ratio of a node. * @DAMOS_QUOTA_NODE_MEM_FREE_BP: MemFree ratio of a node. * @DAMOS_QUOTA_NODE_MEMCG_USED_BP: MemUsed ratio of a node for a cgroup. + * @DAMOS_QUOTA_NODE_MEMCG_FREE_BP: MemFree ratio of a node for a cgroup. * @NR_DAMOS_QUOTA_GOAL_METRICS: Number of DAMOS quota goal metrics. * * Metrics equal to larger than @NR_DAMOS_QUOTA_GOAL_METRICS are unsupported. 
@@ -158,6 +159,7 @@ enum damos_quota_goal_metric { DAMOS_QUOTA_NODE_MEM_USED_BP, DAMOS_QUOTA_NODE_MEM_FREE_BP, DAMOS_QUOTA_NODE_MEMCG_USED_BP, + DAMOS_QUOTA_NODE_MEMCG_FREE_BP, NR_DAMOS_QUOTA_GOAL_METRICS, }; @@ -183,8 +185,8 @@ enum damos_quota_goal_metric { * If @metric is DAMOS_QUOTA_NODE_MEM_{USED,FREE}_BP, @nid represents the node * id of the target node to account the used/free memory. * - * If @metric is DAMOS_QUOTA_NODE_MEMCG_USED_BP, @nid and @memcg_id represents - * the node id and the cgroup to account the used memory for. + * If @metric is DAMOS_QUOTA_NODE_MEMCG_{USED,FREE}_BP, @nid and @memcg_id + * represents the node id and the cgroup to account the used memory for. */ struct damos_quota_goal { enum damos_quota_goal_metric metric; -- cgit v1.2.3 From e859a224fad65cb4848fe202aea9896a14fdb7f4 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Mon, 20 Oct 2025 21:01:24 +0800 Subject: mm/damon: add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() Patch series "mm/damon: fixes for address alignment issues in DAMON_LRU_SORT and DAMON_RECLAIM", v2. In DAMON_LRU_SORT and DAMON_RECLAIM, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT and DAMON_RECLAIM to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. 
To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. This patch (of 2): In DAMON_LRU_SORT, damon_set_regions() will apply DAMON_MIN_REGION as the core address alignment, and the monitoring target address ranges would be aligned on DAMON_MIN_REGION * addr_unit. When users 1) set addr_unit to a value larger than 1, and 2) set the monitoring target address range as not aligned on DAMON_MIN_REGION * addr_unit, it will cause DAMON_LRU_SORT to operate on unexpectedly large physical address ranges. For example, if the user sets the monitoring target address range to [4, 8) and addr_unit as 1024, the aimed monitoring target address range is [4 KiB, 8 KiB). Assuming DAMON_MIN_REGION is 4096, so resulting target address range will be [0, 4096) in the DAMON core layer address system, and [0, 4 MiB) in the physical address space, which is an unexpected range. To fix the issue, add a min_sz_region parameter to damon_set_region_biggest_system_ram_default() and use it when calling damon_set_regions(), replacing the direct use of DAMON_MIN_REGION. 
Link: https://lkml.kernel.org/r/20251020130125.2875164-1-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20251020130125.2875164-2-yanquanmin1@huawei.com Fixes: 2e0fe9245d6b ("mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT") Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 0edf41d36ea1..9ee026c2db53 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -961,7 +961,8 @@ int damon_call(struct damon_ctx *ctx, struct damon_call_control *control); int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control); int damon_set_region_biggest_system_ram_default(struct damon_target *t, - unsigned long *start, unsigned long *end); + unsigned long *start, unsigned long *end, + unsigned long min_sz_region); #endif /* CONFIG_DAMON */ -- cgit v1.2.3 From 54c58a2f5fa191839cf192fa4ebab39395272a3e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:21 +0100 Subject: mm: add vma_desc_size(), vma_desc_pages() helpers It's useful to be able to determine the size of a VMA descriptor range used on f_op->mmap_prepare, expressed both in bytes and pages, so add helpers for both and update code that could make use of it to do so. Link: https://lkml.kernel.org/r/74ef338203c9ff08a9ace73a8f1f6116a79112a0.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c79b3369b82..5752b0c516f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3583,6 +3583,16 @@ static inline unsigned long vma_pages(const struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +static inline unsigned long vma_desc_size(const struct vm_area_desc *desc) +{ + return desc->end - desc->start; +} + +static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) +{ + return vma_desc_size(desc) >> PAGE_SHIFT; +} + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) -- cgit v1.2.3 From 51e38e7d40d617965504f4dcba569ecf9302f245 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:24 +0100 Subject: mm: add remap_pfn_range_prepare(), remap_pfn_range_complete() We need the ability to split PFN remap between updating the VMA and performing the actual remap, in order to do away with the legacy f_op->mmap hook. 
To do so, update the PFN remap code to provide shared logic, and also make remap_pfn_range_notrack() static, as its one user, io_mapping_map_user() was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c"). Then, introduce remap_pfn_range_prepare(), which accepts VMA descriptor and PFN parameters, and remap_pfn_range_complete() which accepts the same parameters as remap_pfn_range(). remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so it must be supplied with a correct PFN to do so. While we're here, also clean up the duplicated #ifdef __HAVE_PFNMAP_TRACKING check and put into a single #ifdef/#else block. We keep these internal to mm as they should only be used by internal helpers. Link: https://lkml.kernel.org/r/75b55de63249b3aa0fd5b3b08ed1d3ff19255d0d.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Acked-by: Pedro Falcato Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5752b0c516f2..ca5565f4fac4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* + * Physically remapped pages are special. Tell the + * rest of the world about it: + * VM_IO tells people not to look at these pages + * (accesses can have side effects). + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. + * VM_DONTEXPAND + * Disable vma merging and expanding with mremap(). + * VM_DONTDUMP + * Omit vma from core dump, even when VM_IO turned off. 
+ */ +#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP) + /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) @@ -3634,10 +3649,9 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); -int remap_pfn_range(struct vm_area_struct *, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t); -int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, - unsigned long pfn, unsigned long size, pgprot_t prot); +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot); + int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); -- cgit v1.2.3 From c707a68f9468e4ef4a3546b636a9dd088fe7b7f1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:25 +0100 Subject: mm: abstract io_remap_pfn_range() based on PFN The only instances in which we customise this function are ones in which we customise the PFN used. Instances where architectures were not passing the pgprot value through pgprot_decrypted() are ones where pgprot_decrypted() was a no-op anyway, so we can simply always pass pgprot through this function. Use this fact to simplify the use of io_remap_pfn_range(), by abstracting the PFN via io_remap_pfn_range_pfn() and using this instead of providing a general io_remap_pfn_range() function per-architecture. 
Link: https://lkml.kernel.org/r/d086191bf431b58ce3b231b4f4f555d080f60327.1760959442.git.lorenzo.stoakes@oracle.com Suggested-by: Jason Gunthorpe Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index ca5565f4fac4..4441ceec913f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3684,15 +3684,24 @@ static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } -#ifndef io_remap_pfn_range -static inline int io_remap_pfn_range(struct vm_area_struct *vma, - unsigned long addr, unsigned long pfn, - unsigned long size, pgprot_t prot) +#ifndef io_remap_pfn_range_pfn +static inline unsigned long io_remap_pfn_range_pfn(unsigned long pfn, + unsigned long size) { - return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); + return pfn; } #endif +static inline int io_remap_pfn_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long orig_pfn, 
+ unsigned long size, pgprot_t orig_prot) +{ + const unsigned long pfn = io_remap_pfn_range_pfn(orig_pfn, size); + const pgprot_t prot = pgprot_decrypted(orig_prot); + + return remap_pfn_range(vma, addr, pfn, size, prot); +} + static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) -- cgit v1.2.3 From ac0a3fc9c07df79dc8a4ce9d274df00afc7bf12d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:27 +0100 Subject: mm: add ability to take further action in vm_area_desc Some drivers/filesystems need to perform additional tasks after the VMA is set up. This is typically in the form of pre-population. The forms of pre-population most likely to be performed are a PFN remap or the insertion of normal folios and PFNs into a mixed map. We start by implementing the PFN remap functionality, ensuring that we perform the appropriate actions at the appropriate time - that is setting flags at the point of .mmap_prepare, and performing the actual remap at the point at which the VMA is fully established. This prevents the driver from doing anything too crazy with a VMA at any stage, and we retain complete control over how the mm functionality is applied. Unfortunately callers still do often require some kind of custom action, so we add an optional success/error _hook to allow the caller to do something after the action has succeeded or failed. This is done at the point when the VMA has already been established, so the harm that can be done is limited. The error hook can be used to filter errors if necessary. There may be cases in which the caller absolutely must hold the file rmap lock until the operation is entirely complete. It is an edge case, but certainly the hugetlbfs mmap hook requires it. To accommodate this, we add the hide_from_rmap_until_complete flag to the mmap_action type. In this case, if a new VMA is allocated, we will hold the file rmap lock until the operation is entirely completed (including any success/error hooks). 
Note that we do not need to update __compat_vma_mmap() to accommodate this flag, as this function will be invoked from an .mmap handler whose VMA is not yet visible, so we implicitly hide it from the rmap. If any error arises on these final actions, we simply unmap the VMA altogether. Also update the stacked filesystem compatibility layer to utilise the action behaviour, and update the VMA tests accordingly. While we're here, rename __compat_vma_mmap_prepare() to __compat_vma_mmap() as we are now performing actions invoked by the mmap_prepare in addition to just the mmap_prepare hook. Link: https://lkml.kernel.org/r/2601199a7b2eaeadfcd8ab6e199c6d1706650c94.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. 
Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 ++-- include/linux/mm.h | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 53 ++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..8cf9547a881c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2393,14 +2393,14 @@ static inline bool can_mmap_file(struct file *file) return true; } -int __compat_vma_mmap_prepare(const struct file_operations *f_op, +int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma); -int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); +int compat_vma_mmap(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4441ceec913f..2d060081caa5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3608,6 +3608,80 @@ static inline unsigned long vma_desc_pages(const struct vm_area_desc *desc) return vma_desc_size(desc) >> PAGE_SHIFT; } +/** + * mmap_action_remap - helper for mmap_prepare 
hook to specify that a pure PFN + * remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. + */ +static inline void mmap_action_remap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + struct mmap_action *action = &desc->action; + + /* [start, start + size) must be within the VMA. */ + WARN_ON_ONCE(start < desc->start || start >= desc->end); + WARN_ON_ONCE(start + size > desc->end); + + action->type = MMAP_REMAP_PFN; + action->remap.start = start; + action->remap.start_pfn = start_pfn; + action->remap.size = size; + action->remap.pgprot = desc->page_prot; +} + +/** + * mmap_action_remap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_remap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_remap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +/** + * mmap_action_ioremap - helper for mmap_prepare hook to specify that a pure PFN + * I/O remap is required. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start: The virtual address to start the remap from, must be within the VMA. + * @start_pfn: The first PFN in the range to remap. + * @size: The size of the range to remap, in bytes, at most spanning to the end + * of the VMA. 
+ */ +static inline void mmap_action_ioremap(struct vm_area_desc *desc, + unsigned long start, + unsigned long start_pfn, + unsigned long size) +{ + mmap_action_remap(desc, start, start_pfn, size); + desc->action.type = MMAP_IO_REMAP_PFN; +} + +/** + * mmap_action_ioremap_full - helper for mmap_prepare hook to specify that the + * entirety of a VMA should be PFN I/O remapped. + * @desc: The VMA descriptor for the VMA requiring remap. + * @start_pfn: The first PFN in the range to remap. + */ +static inline void mmap_action_ioremap_full(struct vm_area_desc *desc, + unsigned long start_pfn) +{ + mmap_action_ioremap(desc, desc->start, start_pfn, vma_desc_size(desc)); +} + +void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc); +int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma); + /* Look up the first VMA which exactly match the interval vm_start ... vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..5021047485a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -773,6 +773,56 @@ struct pfnmap_track_ctx { }; #endif +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. 
Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -796,6 +846,9 @@ struct vm_area_desc { /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; /* -- cgit v1.2.3 From ea52cb24cd3fb121283754ab82b2cb3044609359 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:29 +0100 Subject: mm/hugetlbfs: update hugetlbfs to use mmap_prepare Since we can now perform actions after the VMA is established via mmap_prepare, use desc->action_success_hook to set up the hugetlb lock once the VMA is setup. We also make changes throughout hugetlbfs to make this possible. Note that we must hide newly established hugetlb VMAs from the rmap until the operation is entirely complete as we establish a hugetlb lock during VMA setup that can be raced by rmap users. 
Link: https://lkml.kernel.org/r/b1afa16d3cfa585a03df9ae215ae9f905b3f0ed7.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Tested-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 9 +++++++-- include/linux/hugetlb_inline.h | 15 ++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8e63e46b8e1f..2387513d6ae5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, struct folio **foliop); #endif /* CONFIG_USERFAULTFD */ long hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma, - vm_flags_t vm_flags); + struct vm_area_desc *desc, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); @@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void 
fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); +int hugetlb_vma_lock_alloc(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ @@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} +static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + return 0; +} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index 0660a03d37d9..a27aa0162918 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -2,22 +2,27 @@ #ifndef _LINUX_HUGETLB_INLINE_H #define _LINUX_HUGETLB_INLINE_H -#ifdef CONFIG_HUGETLB_PAGE - #include -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +#ifdef CONFIG_HUGETLB_PAGE + +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(vm_flags & VM_HUGETLB); } #else -static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags) { return false; } #endif +static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) +{ + return is_vm_hugetlb_flags(vma->vm_flags); +} + #endif -- cgit v1.2.3 From 89646d9c748c0902600090f37ae585f3b99deb4d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 20 Oct 2025 13:11:30 +0100 Subject: mm: add shmem_zero_setup_desc() Add the ability to set up a shared anonymous mapping based on a VMA descriptor rather than a VMA. This is a prerequisite for converting to the char mm driver to use the mmap_prepare hook. 
Link: https://lkml.kernel.org/r/d9181517a7e3d6b014a5697c6990d3722c2c9fcd.1760959442.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jason Gunthorpe Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Baolin Wang Cc: Baoquan He Cc: Chatre, Reinette Cc: Christian Borntraeger Cc: Christian Brauner Cc: Dan Williams Cc: Dave Jiang Cc: Dave Martin Cc: Dave Young Cc: David Hildenbrand Cc: David S. Miller Cc: Dmitriy Vyukov Cc: Greg Kroah-Hartman Cc: Guo Ren Cc: Heiko Carstens Cc: Hugh Dickins Cc: James Morse Cc: Jan Kara Cc: Jann Horn Cc: Jonathan Corbet Cc: Kevin Tian Cc: Konstantin Komarov Cc: Liam Howlett Cc: "Luck, Tony" Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Nicolas Pitre Cc: Oscar Salvador Cc: Pedro Falcato Cc: Robin Murohy Cc: Sumanth Korikkar Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Bogendoerfer Cc: "Uladzislau Rezki (Sony)" Cc: Vasily Gorbik Cc: Vishal Verma Cc: Vivek Goyal Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..5b368f9549d6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags); extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name, loff_t size, unsigned long flags); -extern int shmem_zero_setup(struct vm_area_struct *); +int shmem_zero_setup(struct vm_area_struct *vma); +int shmem_zero_setup_desc(struct vm_area_desc *desc); extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); -- cgit v1.2.3 From 
5ff592bec75ad79ed7f1a817477ab6eef8dc5efc Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 21 Oct 2025 16:44:25 -0700 Subject: memcg: manually uninline __memcg_memory_event __memcg_memory_event() has been unnecessarily marked inline even when it is not really performance critical. It is usually called to track extreme conditions. Over the time, it has evolved to include more functionality and inlining it is causing more harm. Before the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35645 10574 4192 50411 c4eb mm/memcontrol.o 54738 1658 0 56396 dc4c net/ipv4/tcp_input.o 34644 1065 0 35709 8b7d net/ipv4/tcp_output.o After the patch: $ size mm/memcontrol.o net/ipv4/tcp_input.o net/ipv4/tcp_output.o text data bss dec hex filename 35137 10446 4192 49775 c26f mm/memcontrol.o 54322 1562 0 55884 da4c net/ipv4/tcp_input.o 34492 1017 0 35509 8ab5 net/ipv4/tcp_output.o [akpm@linux-foundation.org: use EXPORT_SYMBOL_GPL for __memcg_memory_event, per Michal and Christoph] Link: https://lkml.kernel.org/r/20251021234425.1885471-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: SeongJae Park Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5fe254813123..8c0f15e5978f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1002,36 +1002,8 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, count_memcg_events_mm(mm, idx, 1); } -static inline void __memcg_memory_event(struct mem_cgroup *memcg, - enum memcg_memory_event event, - bool allow_spinning) -{ - bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || - event == MEMCG_SWAP_FAIL; - - /* For now only MEMCG_MAX 
can happen with !allow_spinning context. */ - VM_WARN_ON_ONCE(!allow_spinning && event != MEMCG_MAX); - - atomic_long_inc(&memcg->memory_events_local[event]); - if (!swap_event && allow_spinning) - cgroup_file_notify(&memcg->events_local_file); - - do { - atomic_long_inc(&memcg->memory_events[event]); - if (allow_spinning) { - if (swap_event) - cgroup_file_notify(&memcg->swap_events_file); - else - cgroup_file_notify(&memcg->events_file); - } - - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) - break; - if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) - break; - } while ((memcg = parent_mem_cgroup(memcg)) && - !mem_cgroup_is_root(memcg)); -} +void __memcg_memory_event(struct mem_cgroup *memcg, + enum memcg_memory_event event, bool allow_spinning); static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) -- cgit v1.2.3 From 27bfafac65d87c58639f5d7af1353ec1e7886963 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:28 +0800 Subject: mm: add a ptdesc flag to mark kernel page tables The page tables used to map the kernel and userspace often have very different handling rules. There are frequently *_kernel() variants of functions just for kernel page tables. That's not great and has led to code duplication. Instead of having completely separate call paths, allow a 'ptdesc' to be marked as being for kernel mappings. Introduce helpers to set and clear this status. Note: this uses the PG_referenced bit. Page flags are a great fit for this since it is truly a single bit of information. Use PG_referenced itself because it's a fairly benign flag (as opposed to things like PG_lock). It's also (according to Willy) unlikely to go away any time soon. PG_referenced is not in PAGE_FLAGS_CHECK_AT_FREE. It does not need to be cleared before freeing the page, and pages coming out of the allocator should have it cleared. Regardless, introduce an API to clear it anyway.
Having symmetry in the API makes it easier to change the underlying implementation later, like if there was a need to move to a PAGE_FLAGS_CHECK_AT_FREE bit. Link: https://lkml.kernel.org/r/20251022082635.2462433-3-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2d060081caa5..5c887c4ea29e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2962,6 +2962,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU */ enum pt_flags { + PT_kernel = PG_referenced, PT_reserved = PG_reserved, /* High bits are used for zone/node/section */ }; @@ -2987,6 +2988,46 @@ static inline bool pagetable_is_reserved(struct ptdesc *pt) return test_bit(PT_reserved, &pt->pt_flags.f); } +/** + * ptdesc_set_kernel - Mark a ptdesc used to map the kernel + * @ptdesc: The ptdesc to be marked + * + * Kernel page tables often need special handling. Set a flag so that + * the handling code knows this ptdesc will not be used for userspace. 
+ */ +static inline void ptdesc_set_kernel(struct ptdesc *ptdesc) +{ + set_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_clear_kernel - Mark a ptdesc as no longer used to map the kernel + * @ptdesc: The ptdesc to be unmarked + * + * Use when the ptdesc is no longer used to map the kernel and no longer + * needs special handling. + */ +static inline void ptdesc_clear_kernel(struct ptdesc *ptdesc) +{ + /* + * Note: the 'PG_referenced' bit does not strictly need to be + * cleared before freeing the page. But this is nice for + * symmetry. + */ + clear_bit(PT_kernel, &ptdesc->pt_flags.f); +} + +/** + * ptdesc_test_kernel - Check if a ptdesc is used to map the kernel + * @ptdesc: The ptdesc being tested + * + * Call to tell if the ptdesc is used to map the kernel. + */ +static inline bool ptdesc_test_kernel(const struct ptdesc *ptdesc) +{ + return test_bit(PT_kernel, &ptdesc->pt_flags.f); +} + /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags -- cgit v1.2.3 From 977870522af34359b461060597ee3a86f27450d6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:29 +0800 Subject: mm: actually mark kernel page table pages Now that the API is in place, mark kernel page table pages just after they are allocated. Unmark them just before they are freed. Note: Unconditionally clearing the 'kernel' marking (via ptdesc_clear_kernel()) would be functionally identical to what is here. But having the if() makes it logically clear that this function can be used for kernel and non-kernel page tables.
Link: https://lkml.kernel.org/r/20251022082635.2462433-4-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 18 ++++++++++++++++++ include/linux/mm.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 3c8ec3bfea44..b9d2a7c79b93 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -28,6 +28,8 @@ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) return NULL; } + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pte_alloc_one_kernel(...) alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__)) @@ -146,6 +148,10 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad pagetable_free(ptdesc); return NULL; } + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define pmd_alloc_one(...) alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__)) @@ -179,6 +185,10 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_pud_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pud_alloc_one(...) 
alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__)) @@ -233,6 +243,10 @@ static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long return NULL; pagetable_p4d_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __p4d_alloc_one(...) alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__)) @@ -277,6 +291,10 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order return NULL; pagetable_pgd_ctor(ptdesc); + + if (mm == &init_mm) + ptdesc_set_kernel(ptdesc); + return ptdesc_address(ptdesc); } #define __pgd_alloc(...) alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__)) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c887c4ea29e..8f46048875a7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,6 +3057,9 @@ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); + if (ptdesc_test_kernel(pt)) + ptdesc_clear_kernel(pt); + __free_pages(page, compound_order(page)); } -- cgit v1.2.3 From 01894295672335ff304beed4359f30d14d5765f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:31 +0800 Subject: mm: introduce pure page table freeing function The pages used for ptdescs are currently freed back to the allocator in a single location. They will shortly be freed from a second location. Create a simple helper that just frees them back to the allocator. 
Link: https://lkml.kernel.org/r/20251022082635.2462433-6-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f46048875a7..88c0a0fae43a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3046,6 +3046,13 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde } #define pagetable_alloc(...) alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) +static inline void __pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3055,12 +3062,10 @@ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int orde */ static inline void pagetable_free(struct ptdesc *pt) { - struct page *page = ptdesc_page(pt); - if (ptdesc_test_kernel(pt)) ptdesc_clear_kernel(pt); - __free_pages(page, compound_order(page)); + __pagetable_free(pt); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) -- cgit v1.2.3 From 5ba2f0a1556479638ac11a3c201421f5515e89f5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 22 Oct 2025 16:26:33 +0800 Subject: mm: introduce deferred freeing for kernel page tables This introduces a conditional asynchronous mechanism, enabled by CONFIG_ASYNC_KERNEL_PGTABLE_FREE. 
When enabled, this mechanism defers the freeing of pages that are used as page tables for kernel address mappings. These pages are now queued to a work struct instead of being freed immediately. This deferred freeing allows for batch-freeing of page tables, providing a safe context for performing a single expensive operation (TLB flush) for a batch of kernel page tables instead of performing that expensive operation for each page table. Link: https://lkml.kernel.org/r/20251022082635.2462433-8-baolu.lu@linux.intel.com Signed-off-by: Dave Hansen Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Ingo Molnar Cc: Jann Horn Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vasant Hegde Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 88c0a0fae43a..a6fd9f5aaf30 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3053,6 +3053,14 @@ static inline void __pagetable_free(struct ptdesc *pt) __free_pages(page, compound_order(page)); } +#ifdef CONFIG_ASYNC_KERNEL_PGTABLE_FREE +void pagetable_free_kernel(struct ptdesc *pt); +#else +static inline void pagetable_free_kernel(struct ptdesc *pt) +{ + __pagetable_free(pt); +} +#endif /** * pagetable_free - Free pagetables * @pt: The page table descriptor @@ -3062,10 +3070,12 @@ static inline void __pagetable_free(struct ptdesc *pt) */ static inline void pagetable_free(struct ptdesc *pt) { - if (ptdesc_test_kernel(pt)) + if (ptdesc_test_kernel(pt)) { ptdesc_clear_kernel(pt); - - 
__pagetable_free(pt); + pagetable_free_kernel(pt); + } else { + __pagetable_free(pt); + } } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) -- cgit v1.2.3 From e37d5a2d60a338c5917c45296bac65da1382eda5 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 22 Oct 2025 16:26:34 +0800 Subject: iommu/sva: invalidate stale IOTLB entries for kernel address space Introduce a new IOMMU interface to flush IOTLB paging cache entries for the CPU kernel address space. This interface is invoked from the x86 architecture code that manages combined user and kernel page tables, specifically before any kernel page table page is freed and reused. This addresses the main issue with vfree() which is a common occurrence and can be triggered by unprivileged users. While this resolves the primary problem, it doesn't address some extremely rare case related to memory unplug of memory that was present as reserved memory at boot, which cannot be triggered by unprivileged users. The discussion can be found at the link below. Enable SVA on x86 architecture since the IOMMU can now receive notification to flush the paging cache before freeing the CPU kernel page table pages. 
Link: https://lkml.kernel.org/r/20251022082635.2462433-9-baolu.lu@linux.intel.com Link: https://lore.kernel.org/linux-iommu/04983c62-3b1d-40d4-93ae-34ca04b827e5@intel.com/ Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Suggested-by: Jann Horn Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Kevin Tian Cc: Alistair Popple Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Dave Hansen Cc: David Hildenbrand Cc: Ingo Molnar Cc: Jean-Philippe Brucker Cc: Joerg Roedel Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Peter Zijlstra Cc: Robin Murohy Cc: Thomas Gleinxer Cc: "Uladzislau Rezki (Sony)" Cc: Vinicius Costa Gomes Cc: Vlastimil Babka Cc: Will Deacon Cc: Yi Lai Signed-off-by: Andrew Morton --- include/linux/iommu.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c30d12e16473..66e4abb2df0d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1134,7 +1134,9 @@ struct iommu_sva { struct iommu_mm_data { u32 pasid; + struct mm_struct *mm; struct list_head sva_domains; + struct list_head mm_list_elm; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode); @@ -1615,6 +1617,7 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1639,6 +1642,7 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} +static inline void iommu_sva_invalidate_kva_range(unsigned long start, unsigned long end) {} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF -- cgit 
v1.2.3 From a983471cfc454afeba23526ee5d17fd8cdc7876f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Fri, 24 Oct 2025 02:00:41 +0800 Subject: mm, swap: cleanup swap entry allocation parameter We no longer need this GFP parameter after commit 8578e0c00dcf ("mm, swap: use the swap table for the swap cache and switch API"). Before that commit the GFP parameter is already almost identical for all callers, so nothing changed by that commit. Swap table just moved the GFP to lower layer and make it more defined and changes depend on atomic or sleep allocation. Now this parameter is no longer used, just remove it. No behavior change. Link: https://lkml.kernel.org/r/20251024-swap-clean-after-swap-table-p1-v2-3-a709469052e7@tencent.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index e818fbade1e2..a4b264817735 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -462,7 +462,7 @@ static inline long get_nr_swap_pages(void) } extern void si_swapinfo(struct sysinfo *); -int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask); +int folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); @@ -560,7 +560,7 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask) +static inline int folio_alloc_swap(struct folio *folio) { return -EINVAL; } -- cgit v1.2.3 From adf7d6cdd716e1f3826789befc453c961dfafcf2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 22 Oct 2025 18:25:25 -0700 Subject: mm/damon/core: 
add damon_target->obsolete for pin-point removal Patch series "mm/damon: support pin-point targets removal". DAMON maintains the targets in a list, and allows committing only an entire list of targets having the new parameters. Targets having same index on the lists are treated as matching source and destination targets. If an existing target cannot find a matching one in the sources list, the target is removed. This means that there is no way to remove only a specific monitoring target in the middle of the current targets list. Such pin-point target removal is really needed in some use cases, though. Monitoring access patterns on virtual address spaces of processes that spawned from the same ancestor is one example. If a process of the group is terminated, the user may want to remove the matching DAMON target as soon as possible, to save in-kernel memory usage for the unnecessary target data. The user may also want to do that without turning DAMON off or removing unnecessary targets, to keep the current monitoring results for other active processes. Extend DAMON kernel API and sysfs ABI to support the pin-point removal in the following way. For API, add a new damon_target field, namely 'obsolete'. If the field on parameters commit source target is set, it means the matching destination target is obsolete. Then the parameters commit logic removes the destination target from the existing targets list. For sysfs ABI, add a new file under the target directory, namely 'obsolete_target'. It is connected with the 'obsolete' field of the commit source targets, so internally using the new API. Also add a selftest for the new feature. The related helper scripts for manipulating the sysfs interface and dumping in-kernel DAMON status are also extended for this. Note that the selftest part was initially posted as an individual RFC series [1], but now merged into this one. 
Bijan Tabatabai has originally reported this issue, and participated in this solution design on a GitHub issue [1] for DAMON user-space tool. This patch (of 9): DAMON's monitoring targets parameters update function, damon_commit_targets(), is not providing a way to remove a target in the middle of the existing targets list. Extend the API by adding a field to struct damon_target. If the field of a damon_commit_targets() source target is set, it indicates the matching target on the existing targets list is obsolete. damon_commit_targets() understands that and removes those from the list, while respecting the index based matching for other non-obsolete targets. Link: https://lkml.kernel.org/r/20251023012535.69625-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251023012535.69625-2-sj@kernel.org Link: https://github.com/damonitor/damo/issues/36 [1] Signed-off-by: SeongJae Park Reviewed-by: Bijan Tabatabai Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 9ee026c2db53..f3566b978cdf 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -91,17 +91,23 @@ struct damon_region { * @nr_regions: Number of monitoring target regions of this target. * @regions_list: Head of the monitoring target regions of this target. * @list: List head for siblings. + * @obsolete: Whether the commit destination target is obsolete. * * Each monitoring context could have multiple targets. For example, a context * for virtual memory address spaces could have multiple target processes. The * @pid should be set for appropriate &struct damon_operations including the * virtual address spaces monitoring operations. 
+ * + * @obsolete is used only for damon_commit_targets() source targets, to specify + * the matching destination targets are obsolete. Read damon_commit_targets() + * to see how it is handled. */ struct damon_target { struct pid *pid; unsigned int nr_regions; struct list_head regions_list; struct list_head list; + bool obsolete; }; /** -- cgit v1.2.3 From b734b9d973ccd7ad1cfebc2e1f7db693824a37ef Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 24 Oct 2025 10:09:02 +0100 Subject: mm/vma: small VMA lock cleanups We declare vma_start_read() as a static function in mm/mmap_lock.c, so there is no need to provide a stub for !CONFIG_PER_VMA_LOCK. __is_vma_write_locked() is declared in a header and should therefore be static inline. Put parens around (refcnt & VMA_LOCK_OFFSET) in is_vma_writer_only() to make precedence clear. Link: https://lkml.kernel.org/r/20251024090902.1118174-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 2c9fffa58714..e05da70dc0cb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -130,7 +130,7 @@ static inline bool is_vma_writer_only(int refcnt) * a detached vma happens only in vma_mark_detached() and is a rare * case, therefore most of the time there will be no unnecessary wakeup. */ - return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; + return (refcnt & VMA_LOCK_OFFSET) && refcnt <= VMA_LOCK_OFFSET + 1; } static inline void vma_refcount_put(struct vm_area_struct *vma) @@ -183,7 +183,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) +static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -281,9 +281,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int return true; } static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) - { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) -- cgit v1.2.3 From 272239dc8fcb109b9f1ec1a73bb85405dac92eda Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 21 Oct 2025 03:56:38 +0100 Subject: mm: make INVALID_PHYS_ADDR a generic macro INVALID_PHYS_ADDR has very similar definitions across the code base. Hence just move that inside header for more generic usage. Also drop the now redundant ones which are no longer required. 
Link: https://lkml.kernel.org/r/20251021025638.2420216-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Acked-by: Alexander Gordeev [s390] Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a6fd9f5aaf30..7bcd9e6fbc3c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -105,6 +105,8 @@ extern int mmap_rnd_compat_bits __read_mostly; # endif #endif +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + #include #include -- cgit v1.2.3 From 8e689f8ea45ffdae20350246dd37d124d7092c92 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Tue, 28 Oct 2025 11:43:07 +0800 Subject: mm/swap: do not choose swap device according to numa node Patch series "mm/swapfile.c: select swap devices of default priority round robin", v5. Currently, on system with multiple swap devices, swap allocation will select one swap device according to priority. The swap device with the highest priority will be chosen to allocate firstly. People can specify a priority from 0 to 32767 when swapon a swap device, or the system will set it from -2 then downwards by default. Meanwhile, on NUMA system, the swap device with node_id will be considered first on that NUMA node of the node_id. In the current code, an array of plist, swap_avail_heads[nid], is used to organize swap devices on each NUMA node. For each NUMA node, there is a plist organizing all swap devices. The 'prio' value in the plist is the negated value of the device's priority due to plist being sorted from low to high. The swap device owning one node_id will be promoted to the front position on that NUMA node, then other swap devices are put in order of their default priority. E.g I got a system with 8 NUMA nodes, and I setup 4 zram partition as swap devices. 
Current behaviour: their priorities will be (note that -1 is skipped): NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 0B -2 /dev/zram1 partition 16G 0B -3 /dev/zram2 partition 16G 0B -4 /dev/zram3 partition 16G 0B -5 And their positions in the 8 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:3 prio:4 prio:5 swap_avail_lists[1]: /* node 1's available swap device list */ zram1 -> zram0 -> zram2 -> zram3 prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ zram2 -> zram0 -> zram1 -> zram3 prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ zram3 -> zram0 -> zram1 -> zram2 prio:1 prio:2 prio:3 prio:4 swap_avail_lists[4-7]: /* node 4,5,6,7's available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:2 prio:3 prio:4 prio:5 The adjustment for swap device with node_id was intended to decrease the pressure of lock contention for one swap device by taking different swap device on different node. The adjustment was introduced in commit a2468cc9bfdf ("swap: choose swap device according to numa node"). However, the adjustment is a little coarse-grained. On the node, the swap device sharing the node's id will always be selected firstly by node's CPUs until exhausted, then next one. And on other nodes where no swap device shares its node id, swap device with priority '-2' will be selected firstly until exhausted, then next with priority '-3'. This is the swapon output while the high pressure vm-scalability test is running. It's clearly showing zram0 is heavily exploited until exhausted.
=================================== [root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 15.7G -2 /dev/zram1 partition 16G 3.4G -3 /dev/zram2 partition 16G 3.4G -4 /dev/zram3 partition 16G 2.6G -5 The node based strategy on selecting swap device is much better than the old way of selecting swap devices one by one. However it is still unreasonable because swap devices are assumed to have similar accessing speed if no priority is specified when swapon. It's unfair and doesn't make sense just because one swap device is swapped on firstly, its priority will be higher than the one swapped on later. So in this patchset, change is made to select the swap device round robin if default priority. In code, the plist array swap_avail_heads[nid] is replaced with a plist swap_avail_head which reverts commit a2468cc9bfdf. Meanwhile, on top of the revert, further change is taken to make any device w/o specified priority get the same default priority '-1'. Surely, swap device with specified priority are always put foremost, this is not impacted. If you care about their different accessing speed, then use 'swapon -p xx' to deploy priority for your swap devices.
New behaviour: swap_avail_list: /* one global available swap device list */ zram0 -> zram1 -> zram2 -> zram3 prio:1 prio:1 prio:1 prio:1 This is the swapon output while the high pressure vm-scalability test is running, all is selected round robin: ======================================= [root@hp-dl385g10-03 linux]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 12.6G -1 /dev/zram1 partition 16G 12.6G -1 /dev/zram2 partition 16G 12.6G -1 /dev/zram3 partition 16G 12.6G -1 With the change, we can see about 18% efficiency promotion as below: vm-scalability test: ================== Test with: usemem --init-time -O -y -x -n 31 2G (4G memcg, zram as swap) Before: After: System time: 637.92 s 526.74 s (lower is better) Sum Throughput: 3546.56 MB/s 4207.56 MB/s (higher is better) Single process Throughput: 114.40 MB/s 135.72 MB/s (higher is better) free latency: 10138455.99 us 6810119.01 us (low is better) This patch (of 2): This reverts commit a2468cc9bfdf ("swap: choose swap device according to numa node"). After this patch, the behaviour will change back to pre-commit a2468cc9bfdf. This means the priority will be set from -1 then downwards by default, and when swapping, it will exhaust swap device one by one according to priority from high to low. This is preparation work for later change.
[root@hp-dl385g10-03 ~]# swapon NAME TYPE SIZE USED PRIO /dev/zram0 partition 16G 16G -1 /dev/zram1 partition 16G 966.2M -2 /dev/zram2 partition 16G 0B -3 /dev/zram3 partition 16G 0B -4 Link: https://lkml.kernel.org/r/20251028034308.929550-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20251028034308.929550-2-bhe@redhat.com Signed-off-by: Baoquan He Suggested-by: Chris Li Acked-by: Chris Li Acked-by: Nhat Pham Reviewed-by: Kairui Song Cc: Barry Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index a4b264817735..38ca3df68716 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,16 +301,7 @@ struct swap_info_struct { struct work_struct discard_work; /* discard worker */ struct work_struct reclaim_work; /* reclaim worker */ struct list_head discard_clusters; /* discard clusters list */ - struct plist_node avail_lists[]; /* - * entries in swap_avail_heads, one - * entry per node. - * Must be last as the number of the - * array is nr_node_ids, which is not - * a fixed value so have to allocate - * dynamically. - * And it has to be an array so that - * plist_for_each_* can work. - */ + struct plist_node avail_list; /* entry in swap_avail_head */ }; static inline swp_entry_t page_swap_entry(struct page *page) -- cgit v1.2.3 From 1a4f70f6851a1916c4f0e52731c7ecfe99bf36e6 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:28 +0000 Subject: mm: convert memory block states (MEM_*) macros to enum Patch series "mm: Convert memory block states (MEM_*) macros to enums", v2. The MEM_* constants indicating the state of a memory block are currently defined as macros, meaning their definitions will be omitted from the debuginfo on most kernel builds. 
This makes it harder for debuggers to correctly map the block state at runtime, which can be quite useful when analysing errors related to memory hot plugging and unplugging with tools such as drgn. Converting the constants to an enum ensures the correct information is emitted by the compiler and available for the debugger, without needing to hard-code them into the debugger and track their changes. This patch series aims to replace the current macros with a newly created enum named memory_block_state, while also taking advantage of the compile time guarantees that we get when using enums. The first patch does the conversion of the macros to an enum, while the 2nd and 3rd patches use this enum to clean up some type declarations and make sure that only valid values are used. This patch (of 3): Converting the MEM_* constants from macros to an enum ensures that their values will be correctly emitted in the debug symbols, making it easier to trace the meaning of each value when debugging with tools such as drgn, without the need to hard-code the values. 
Since the values are mutually exclusive and they are not exposed directly to userspace, I also dropped the misleading pattern (1< Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index 0c214256216f..f4e358477c6a 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -64,6 +64,18 @@ struct memory_group { }; }; +enum memory_block_state { + /* These states are exposed to userspace as text strings in sysfs */ + MEM_ONLINE, /* exposed to userspace */ + MEM_GOING_OFFLINE, /* exposed to userspace */ + MEM_OFFLINE, /* exposed to userspace */ + MEM_GOING_ONLINE, + MEM_CANCEL_ONLINE, + MEM_CANCEL_OFFLINE, + MEM_PREPARE_ONLINE, + MEM_FINISH_OFFLINE, +}; + struct memory_block { unsigned long start_section_nr; unsigned long state; /* serialized by the dev->lock */ @@ -89,16 +101,6 @@ int arch_get_memory_phys_device(unsigned long start_pfn); unsigned long memory_block_size_bytes(void); int set_memory_block_size_order(unsigned int order); -/* These states are exposed to userspace as text strings in sysfs */ -#define MEM_ONLINE (1<<0) /* exposed to userspace */ -#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */ -#define MEM_OFFLINE (1<<2) /* exposed to userspace */ -#define MEM_GOING_ONLINE (1<<3) -#define MEM_CANCEL_ONLINE (1<<4) -#define MEM_CANCEL_OFFLINE (1<<5) -#define MEM_PREPARE_ONLINE (1<<6) -#define MEM_FINISH_OFFLINE (1<<7) - struct memory_notify { /* * The altmap_start_pfn and altmap_nr_pages fields are designated for -- cgit v1.2.3 From 8bc7ba3d265d6ee698de4b1941b7e8f7d91a0562 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:30 +0000 Subject: mm: change type of state in struct memory_block The state of a memory block should be 
restricted to values specified in the documentation of the memory hotplug API. However, since the state field in the memory_block struct was defined as an unsigned long, this restriction was not enforced at compile time. With the introduction of the enum memory_block_state, it is now possible to incorporate the desired semantics in the field declaration and enforce these restrictions at compile time. [akpm@linux-foundation.org: fix whitespace, per Randy] Link: https://lkml.kernel.org/r/20251029195617.2210700-3-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index f4e358477c6a..ca20cbdd71f2 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -78,7 +78,7 @@ enum memory_block_state { struct memory_block { unsigned long start_section_nr; - unsigned long state; /* serialized by the dev->lock */ + enum memory_block_state state; /* serialized by the dev->lock */ int online_type; /* for passing data to online routine */ int nid; /* NID for this memory block */ /* -- cgit v1.2.3 From ed1f8855dd7b82a0ad87960b1729a3e848dc5589 Mon Sep 17 00:00:00 2001 From: Israel Batista Date: Wed, 29 Oct 2025 19:56:32 +0000 Subject: mm: change type of parameter for memory_notify memory_notify() is responsible for sending events related to memory hotplugging to a notification queue. Since all the events must match one of the values from the enum memory_block_state, it is appropriate to change the function parameter type to make this condition explicit at compile time. 
Link: https://lkml.kernel.org/r/20251029195617.2210700-4-linux@israelbatista.dev.br Signed-off-by: Israel Batista Acked-by: Mike Rapoport (Microsoft) Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Omar Sandoval Cc: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/memory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memory.h b/include/linux/memory.h index ca20cbdd71f2..ca3eb1db6cc8 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -141,7 +141,7 @@ static inline int register_memory_notifier(struct notifier_block *nb) static inline void unregister_memory_notifier(struct notifier_block *nb) { } -static inline int memory_notify(unsigned long val, void *v) +static inline int memory_notify(enum memory_block_state state, void *v) { return 0; } @@ -165,7 +165,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); -extern int memory_notify(unsigned long val, void *v); +extern int memory_notify(enum memory_block_state state, void *v); extern struct memory_block *find_memory_block(unsigned long section_nr); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, -- cgit v1.2.3 From 2ec41967189cd65a8f79c760dd1b50c4f56e8ac6 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 2 Nov 2025 18:44:33 +0000 Subject: mm: handle poisoning of pfn without struct pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Poison (or ECC) errors can be very common on a large size cluster. The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. 
If a memory region is mapped using remap_pfn_range() for example, but not added to the kernel, MM will not have associated struct pages. Add a new mechanism to handle memory failure on such memory. Make kernel MM expose a function to allow modules managing the device memory to register the device memory SPA and the address space associated with it. MM maintains this information as an interval tree. On poison, MM can search for the range that the poisoned PFN belongs to and use the address_space to determine the mapping VMA. In this implementation, kernel MM follows the following sequence that is largely similar to the memory_failure() handler for struct page backed memory: 1. memory_failure() is triggered on reception of a poison error. An absence of struct page is detected and consequently memory_failure_pfn() is executed. 2. memory_failure_pfn() collects the processes mapped to the PFN. 3. memory_failure_pfn() sends SIGBUS to all the processes mapping the faulty PFN using kill_procs(). Note that there is one primary difference versus the handling of the poison on struct pages, which is to skip unmapping to the faulty PFN. This is done to handle the huge PFNMAP support added recently [1] that enables VM_PFNMAP vmas to map at PMD or PUD level. A poison to a PFN mapped in such a way would need breaking the PMD/PUD mapping into PTEs that will get mirrored into the S2. This can greatly increase the cost of table walks and have a major performance impact. Link: https://lore.kernel.org/all/20240826204353.2228736-1-peterx@redhat.com/ [1] Link: https://lkml.kernel.org/r/20251102184434.2406-3-ankita@nvidia.com Signed-off-by: Ankit Agrawal Cc: Aniket Agashe Cc: Borislav Betkov Cc: David Hildenbrand Cc: Hanjun Guo Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Joanthan Cameron Cc: Kevin Tian Cc: Kirti Wankhede Cc: Len Brown Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Luck, Tony" Cc: Matthew R.
Ochs Cc: Mauro Carvalho Chehab Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Neo Jia Cc: Peter Zijlstra Cc: Shuai Xue Cc: Smita Koralahalli Channabasappa Cc: Suren Baghdasaryan Cc: Tarun Gupta Cc: Uwe Kleine-König Cc: Vikram Sethi Cc: Vlastimil Babka Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/memory-failure.h | 17 +++++++++++++++++ include/linux/mm.h | 1 + include/ras/ras_event.h | 1 + 3 files changed, 19 insertions(+) create mode 100644 include/linux/memory-failure.h (limited to 'include') diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h new file mode 100644 index 000000000000..bc326503d2d2 --- /dev/null +++ b/include/linux/memory-failure.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_MEMORY_FAILURE_H +#define _LINUX_MEMORY_FAILURE_H + +#include + +struct pfn_address_space; + +struct pfn_address_space { + struct interval_tree_node node; + struct address_space *mapping; +}; + +int register_pfn_address_space(struct pfn_address_space *pfn_space); +void unregister_pfn_address_space(struct pfn_address_space *pfn_space); + +#endif /* _LINUX_MEMORY_FAILURE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7bcd9e6fbc3c..b636d12bb651 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4285,6 +4285,7 @@ enum mf_action_page_type { MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, + MF_MSG_PFN_MAP, MF_MSG_UNKNOWN, }; diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..fecfeb7c8be7 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -375,6 +375,7 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) /* -- cgit v1.2.3 From a73d4a055622d0973e371382b16a13f9795ffec7 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: 
Fri, 31 Oct 2025 12:21:31 +0100 Subject: drivers/xen/xenbus: Replace deprecated strcpy in xenbus_transaction_end strcpy() is deprecated; inline the read-only string instead. Fix the function comment and use bool instead of int while we're at it. Link: https://github.com/KSPP/linux/issues/88 Reviewed-by: Juergen Gross Signed-off-by: Thorsten Blum Signed-off-by: Juergen Gross Message-ID: <20251031112145.103257-2-thorsten.blum@linux.dev> --- include/xen/xenbus.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 7dab04cf4a36..c94caf852aea 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -158,7 +158,7 @@ int xenbus_exists(struct xenbus_transaction t, const char *dir, const char *node); int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node); int xenbus_transaction_start(struct xenbus_transaction *t); -int xenbus_transaction_end(struct xenbus_transaction t, int abort); +int xenbus_transaction_end(struct xenbus_transaction t, bool abort); /* Single read and scanf: returns -errno or num scanned if > 0. */ __scanf(4, 5) -- cgit v1.2.3 From 197b3f3c70d61ff1c7ca24f66d567e06fe8ed3d9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 12 Nov 2025 14:55:30 +0100 Subject: string: provide strends() Implement a function for checking if a string ends with a different string and add its kunit test cases. 
Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20251112-gpio-shared-v4-1-b51f97b1abd8@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/string.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index fdd3442c6bcb..929d05d1247c 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -562,4 +562,22 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } +/** + * strends - Check if a string ends with another string. + * @str - NULL-terminated string to check against @suffix + * @suffix - NULL-terminated string defining the suffix to look for in @str + * + * Returns: + * True if @str ends with @suffix. False in all other cases. + */ +static inline bool strends(const char *str, const char *suffix) +{ + unsigned int str_len = strlen(str), suffix_len = strlen(suffix); + + if (str_len < suffix_len) + return false; + + return !(strcmp(str + str_len - suffix_len, suffix)); +} + #endif /* _LINUX_STRING_H_ */ -- cgit v1.2.3 From eb374f764a7012eda28019266a6d9191670c4fa5 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 12 Nov 2025 14:55:35 +0100 Subject: gpio: provide gpiod_is_shared() Provide an interface allowing consumers to check if a GPIO descriptor represents a GPIO that can potentially be shared by multiple consumers at the same time. This is exposed to allow subsystems that already work around the limitations of the current non-exclusive GPIO handling in some ways, to gradually convert to relying on the new shared GPIO feature of GPIOLIB. Extend the gpiolib-shared module to mark the GPIO shared proxy descriptors with a flag checked by the new interface. 
Reviewed-by: Linus Walleij Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20251112-gpio-shared-v4-6-b51f97b1abd8@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/consumer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 00df68c51405..a8acb7c0b5af 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -167,6 +167,8 @@ int gpiod_cansleep(const struct gpio_desc *desc); int gpiod_to_irq(const struct gpio_desc *desc); int gpiod_set_consumer_name(struct gpio_desc *desc, const char *name); +bool gpiod_is_shared(const struct gpio_desc *desc); + /* Convert between the old gpio_ and new gpiod_ interfaces */ struct gpio_desc *gpio_to_desc(unsigned gpio); int desc_to_gpio(const struct gpio_desc *desc); @@ -520,6 +522,13 @@ static inline int gpiod_set_consumer_name(struct gpio_desc *desc, return -EINVAL; } +static inline bool gpiod_is_shared(const struct gpio_desc *desc) +{ + /* GPIO can never have been requested */ + WARN_ON(desc); + return false; +} + static inline struct gpio_desc *gpio_to_desc(unsigned gpio) { return NULL; -- cgit v1.2.3 From b98994cb9bc24f5c7575c86650f96c384576fdfa Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Mon, 17 Nov 2025 02:54:19 +0000 Subject: mtd: spinand: esmt: add support for F50L1G41LC This adds support for ESMT F50L1G41LC, which appears to be an updated version of the already supported F50L1G41LB. Add esmt_8c SPI_NAND manufacturer to account for the newly used vendor ID with support for the ESMT F50L1G41LC chip. 
Link: https://github.com/openwrt/openwrt/pull/15214#issuecomment-3514824435 Signed-off-by: Daniel Golle Signed-off-by: Miquel Raynal --- include/linux/mtd/spinand.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mtd/spinand.h b/include/linux/mtd/spinand.h index 927c10d78769..ce76f5c632e1 100644 --- a/include/linux/mtd/spinand.h +++ b/include/linux/mtd/spinand.h @@ -354,6 +354,7 @@ struct spinand_manufacturer { /* SPI NAND manufacturers */ extern const struct spinand_manufacturer alliancememory_spinand_manufacturer; extern const struct spinand_manufacturer ato_spinand_manufacturer; +extern const struct spinand_manufacturer esmt_8c_spinand_manufacturer; extern const struct spinand_manufacturer esmt_c8_spinand_manufacturer; extern const struct spinand_manufacturer fmsh_spinand_manufacturer; extern const struct spinand_manufacturer foresee_spinand_manufacturer; -- cgit v1.2.3 From e678c2a0063ec931642b3c5935fb0c3c1282b6b3 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 4 Nov 2025 14:16:44 +0200 Subject: PCI: Add Intel Nova Lake S audio Device ID Add Nova Lake S (NVL-S) audio Device ID The ID will be used by HDA legacy, SOF audio stack and the driver to determine which audio stack should be used (intel-dsp-config). 
Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Acked-by: Bjorn Helgaas Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20251104121650.21872-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 92ffc4373f6d..a9a089566b7c 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3075,6 +3075,7 @@ #define PCI_DEVICE_ID_INTEL_5100_22 0x65f6 #define PCI_DEVICE_ID_INTEL_IOAT_SCNB 0x65ff #define PCI_DEVICE_ID_INTEL_HDA_FCL 0x67a8 +#define PCI_DEVICE_ID_INTEL_HDA_NVL_S 0x6e50 #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 -- cgit v1.2.3 From 2bd7bf3ccc83074dbaf53c941539732652451b09 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 4 Nov 2025 14:16:46 +0200 Subject: ASoC: Intel: soc-acpi: add NVL match tables For now the tables are basic for mockup devices Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Ranjani Sridharan Acked-by: Mark Brown Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20251104121650.21872-4-peter.ujfalusi@linux.intel.com --- include/sound/soc-acpi-intel-match.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/soc-acpi-intel-match.h b/include/sound/soc-acpi-intel-match.h index daed7123df9d..382029724e85 100644 --- a/include/sound/soc-acpi-intel-match.h +++ b/include/sound/soc-acpi-intel-match.h @@ -34,6 +34,7 @@ extern struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_lnl_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_machines[]; +extern struct snd_soc_acpi_mach snd_soc_acpi_intel_nvl_machines[]; extern struct snd_soc_acpi_mach 
snd_soc_acpi_intel_cnl_sdw_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_cfl_sdw_machines[]; @@ -46,6 +47,7 @@ extern struct snd_soc_acpi_mach snd_soc_acpi_intel_mtl_sdw_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_lnl_sdw_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_arl_sdw_machines[]; extern struct snd_soc_acpi_mach snd_soc_acpi_intel_ptl_sdw_machines[]; +extern struct snd_soc_acpi_mach snd_soc_acpi_intel_nvl_sdw_machines[]; /* * generic table used for HDA codec-based platforms, possibly with -- cgit v1.2.3 From 33cf66d88306663d16e4759e9d24766b0aaa2e17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Nov 2025 17:01:31 +0100 Subject: sched/fair: Proportional newidle balance Add a randomized algorithm that runs newidle balancing proportional to its success rate. This improves schbench significantly: 6.18-rc4: 2.22 Mrps/s 6.18-rc4+revert: 2.04 Mrps/s 6.18-rc4+revert+random: 2.18 Mrps/S Conversely, per Adam Li this affects SpecJBB slightly, reducing it by 1%: 6.17: -6% 6.17+revert: 0% 6.17+revert+random: -1% Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Tested-by: Chris Mason Link: https://lkml.kernel.org/r/6825c50d-7fa7-45d8-9b81-c6e7e25738e2@meta.com Link: https://patch.msgid.link/20251107161739.770122091@infradead.org --- include/linux/sched/topology.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index bbcfdf12aa6e..45c0022b91ce 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -92,6 +92,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ /* idle_balance() stats */ + unsigned int newidle_call; + unsigned int newidle_success; + unsigned int newidle_ratio; u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; -- cgit v1.2.3 From 96498e804cb6629e02747336a0a33e4955449732 Mon Sep 17 00:00:00 
2001 From: Bartosz Golaszewski Date: Mon, 17 Nov 2025 17:12:47 +0100 Subject: spi: davinci: remove platform data header There are no longer any board files including the DaVinci SPI platform data header. Let's move the bits and pieces that are used in the driver into the driver .c file itself and remove the header. Signed-off-by: Bartosz Golaszewski Link: https://patch.msgid.link/20251117-davinci-spi-v2-1-cd799d17f04a@linaro.org Signed-off-by: Mark Brown --- include/linux/platform_data/spi-davinci.h | 73 ------------------------------- 1 file changed, 73 deletions(-) delete mode 100644 include/linux/platform_data/spi-davinci.h (limited to 'include') diff --git a/include/linux/platform_data/spi-davinci.h b/include/linux/platform_data/spi-davinci.h deleted file mode 100644 index 2cb5cc70fd9d..000000000000 --- a/include/linux/platform_data/spi-davinci.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright 2009 Texas Instruments. - */ - -#ifndef __ARCH_ARM_DAVINCI_SPI_H -#define __ARCH_ARM_DAVINCI_SPI_H - -#include - -#define SPI_INTERN_CS 0xFF - -enum { - SPI_VERSION_1, /* For DM355/DM365/DM6467 */ - SPI_VERSION_2, /* For DA8xx */ -}; - -/** - * davinci_spi_platform_data - Platform data for SPI master device on DaVinci - * - * @version: version of the SPI IP. Different DaVinci devices have slightly - * varying versions of the same IP. - * @num_chipselect: number of chipselects supported by this SPI master - * @intr_line: interrupt line used to connect the SPI IP to the ARM interrupt - * controller withn the SoC. Possible values are 0 and 1. - * @cshold_bug: set this to true if the SPI controller on your chip requires - * a write to CSHOLD bit in between transfers (like in DM355). - * @dma_event_q: DMA event queue to use if SPI_IO_TYPE_DMA is used for any - * device on the bus. 
- */ -struct davinci_spi_platform_data { - u8 version; - u8 num_chipselect; - u8 intr_line; - u8 prescaler_limit; - bool cshold_bug; - enum dma_event_q dma_event_q; -}; - -/** - * davinci_spi_config - Per-chip-select configuration for SPI slave devices - * - * @wdelay: amount of delay between transmissions. Measured in number of - * SPI module clocks. - * @odd_parity: polarity of parity flag at the end of transmit data stream. - * 0 - odd parity, 1 - even parity. - * @parity_enable: enable transmission of parity at end of each transmit - * data stream. - * @io_type: type of IO transfer. Choose between polled, interrupt and DMA. - * @timer_disable: disable chip-select timers (setup and hold) - * @c2tdelay: chip-select setup time. Measured in number of SPI module clocks. - * @t2cdelay: chip-select hold time. Measured in number of SPI module clocks. - * @t2edelay: transmit data finished to SPI ENAn pin inactive time. Measured - * in number of SPI clocks. - * @c2edelay: chip-select active to SPI ENAn signal active time. Measured in - * number of SPI clocks. - */ -struct davinci_spi_config { - u8 wdelay; - u8 odd_parity; - u8 parity_enable; -#define SPI_IO_TYPE_INTR 0 -#define SPI_IO_TYPE_POLL 1 -#define SPI_IO_TYPE_DMA 2 - u8 io_type; - u8 timer_disable; - u8 c2tdelay; - u8 t2cdelay; - u8 t2edelay; - u8 c2edelay; -}; - -#endif /* __ARCH_ARM_DAVINCI_SPI_H */ -- cgit v1.2.3 From f49ae86483c494ddc793d889f6df5ea68d138569 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Mon, 17 Nov 2025 10:47:54 +0000 Subject: memregion: Drop unused IORES_DESC_* parameter from cpu_cache_invalidate_memregion() The res_desc parameter was originally introduced for documentation purposes and with the idea that with HDM-DB CXL invalidation could be triggered from the device. That has not come to pass and the continued existence of the option is confusing when we add a range in the following patch which might not be a strict subset of the res_desc. 
So avoid that confusion by dropping the parameter. Link: https://lore.kernel.org/linux-mm/686eedb25ed02_24471002e@dwillia2-xfh.jf.intel.com.notmuch/ Reviewed-by: Dan Williams Suggested-by: Dan Williams Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley --- include/linux/memregion.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memregion.h b/include/linux/memregion.h index c01321467789..945646bde825 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -26,8 +26,7 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for - * memregions described by @res_desc - * @res_desc: one of the IORES_DESC_* types + * memregion * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -46,7 +45,7 @@ static inline void memregion_free(int id) * the cache maintenance. */ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(int res_desc); +int cpu_cache_invalidate_memregion(void); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -54,7 +53,7 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(int res_desc) +static inline int cpu_cache_invalidate_memregion(void) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; -- cgit v1.2.3 From b43652d867cf2a5f31b14e3d9a320ad01fca0992 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Nov 2025 10:47:55 +0000 Subject: memregion: Support fine grained invalidate by cpu_cache_invalidate_memregion() Extend cpu_cache_invalidate_memregion() to support invalidating a particular range of memory by introducing start and length parameters. Control of types of invalidation is left for when use cases turn up. 
For now everything is Clean and Invalidate. Where the range is unknown, use the provided cpu_cache_invalidate_all() helper to act as documentation of intent in a fashion that is clearer than passing (0, -1) to cpu_cache_invalidate_memregion(). Signed-off-by: Yicong Yang Reviewed-by: Dan Williams Acked-by: Davidlohr Bueso Signed-off-by: Jonathan Cameron Signed-off-by: Conor Dooley --- include/linux/memregion.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/memregion.h b/include/linux/memregion.h index 945646bde825..a55f62cc5266 100644 --- a/include/linux/memregion.h +++ b/include/linux/memregion.h @@ -27,6 +27,9 @@ static inline void memregion_free(int id) /** * cpu_cache_invalidate_memregion - drop any CPU cached data for * memregion + * @start: start physical address of the target memory region. + * @len: length of the target memory region. -1 for all the regions of + * the target type. * * Perform cache maintenance after a memory event / operation that * changes the contents of physical memory in a cache-incoherent manner. @@ -45,7 +48,7 @@ static inline void memregion_free(int id) * the cache maintenance. 
*/ #ifdef CONFIG_ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION -int cpu_cache_invalidate_memregion(void); +int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len); bool cpu_cache_has_invalidate_memregion(void); #else static inline bool cpu_cache_has_invalidate_memregion(void) @@ -53,10 +56,16 @@ static inline bool cpu_cache_has_invalidate_memregion(void) return false; } -static inline int cpu_cache_invalidate_memregion(void) +static inline int cpu_cache_invalidate_memregion(phys_addr_t start, size_t len) { WARN_ON_ONCE("CPU cache invalidation required"); return -ENXIO; } #endif + +static inline int cpu_cache_invalidate_all(void) +{ + return cpu_cache_invalidate_memregion(0, -1); +} + #endif /* _MEMREGION_H_ */ -- cgit v1.2.3 From e275d9091c01b3b46f3ec534ce4ac77cffc9e3ae Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Fri, 14 Nov 2025 03:43:18 -0800 Subject: net: mana: Move hardware counter stats from per-port to per-VF context Move hardware counter (HC) statistics from mana_port_context to mana_context to enable sharing stats across multiple network ports on the same MANA VF. Previously, each network port queried hardware counters independently using MANA_QUERY_GF_STAT command (GF = Generic Function stats from GDMA hardware), resulting in redundant queries when multiple ports existed on the same device. Isolate hardware counter stats by introducing mana_ethtool_hc_stats in mana_context and update the code to ensure all stats are properly reported via ethtool -S , maintaining consistency with previous behavior. 
Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1763120599-6331-2-git-send-email-ernis@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/net/mana/mana.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 8906901535f5..3484f42803e3 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -375,6 +375,13 @@ struct mana_tx_qp { struct mana_ethtool_stats { u64 stop_queue; u64 wake_queue; + u64 tx_cqe_err; + u64 tx_cqe_unknown_type; + u64 rx_coalesced_err; + u64 rx_cqe_unknown_type; +}; + +struct mana_ethtool_hc_stats { u64 hc_rx_discards_no_wqe; u64 hc_rx_err_vport_disabled; u64 hc_rx_bytes; @@ -402,10 +409,6 @@ struct mana_ethtool_stats { u64 hc_tx_mcast_pkts; u64 hc_tx_mcast_bytes; u64 hc_tx_err_gdma; - u64 tx_cqe_err; - u64 tx_cqe_unknown_type; - u64 rx_coalesced_err; - u64 rx_cqe_unknown_type; }; struct mana_ethtool_phy_stats { @@ -473,6 +476,7 @@ struct mana_context { u16 num_ports; u8 bm_hostmode; + struct mana_ethtool_hc_stats hc_stats; struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; @@ -577,7 +581,7 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); -void mana_query_gf_stats(struct mana_port_context *apc); +void mana_query_gf_stats(struct mana_context *ac); int mana_query_link_cfg(struct mana_port_context *apc); int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, int enable_clamping); -- cgit v1.2.3 From be4f1d67ec56f23f37714ac73c01094e63c7ff28 Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Fri, 14 Nov 2025 03:43:19 -0800 Subject: net: mana: Add standard counter rx_missed_errors Report standard counter stats->rx_missed_errors using 
hc_rx_discards_no_wqe from the hardware. Add a global workqueue to periodically run mana_query_gf_stats every 2 seconds to get the latest info in eth_stats and define a driver capability flag to notify hardware of the periodic queries. To avoid repeated failures and log flooding, the workqueue is not rescheduled if mana_query_gf_stats fails on HWC timeout error and the stats are reset to 0. Other errors are transient which will not need a VF reset for recovery. Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1763120599-6331-3-git-send-email-ernis@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/net/mana/gdma.h | 6 +++++- include/net/mana/mana.h | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 637f42485dba..2e4f2f3175e5 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -592,6 +592,9 @@ enum { #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) #define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6) +/* Driver can send HWC periodically to query stats */ +#define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ @@ -601,7 +604,8 @@ enum { GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ - GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE) + GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \ + GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY) #define GDMA_DRV_CAP_FLAGS2 0 diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 3484f42803e3..d37f4cea0ac3 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -480,6 +480,10 @@ struct mana_context { struct mana_eq *eqs; struct dentry *mana_eqs_debugfs; + /* Workqueue for querying hardware stats */ + struct delayed_work 
gf_stats_work; + bool hwc_timeout_occurred; + struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; /* Link state change work */ @@ -581,7 +585,7 @@ u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); -void mana_query_gf_stats(struct mana_context *ac); +int mana_query_gf_stats(struct mana_context *ac); int mana_query_link_cfg(struct mana_port_context *apc); int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, int enable_clamping); -- cgit v1.2.3 From 24afd7827efb7c69adfc41835390470e3eec4740 Mon Sep 17 00:00:00 2001 From: Inochi Amaoto Date: Fri, 14 Nov 2025 08:38:04 +0800 Subject: net: phy: Add helper for fixing RGMII PHY mode based on internal mac delay The "phy-mode" property of devicetree indicates whether the PCB has delay now, which means the mac needs to modify the PHY mode based on whether there is an internal delay in the mac. This modification is similar for many ethernet drivers. To simplify code, define the helper phy_fix_phy_mode_for_mac_delays(speed, mac_txid, mac_rxid) to fix PHY mode based on whether mac adds internal delay. 
Suggested-by: Russell King (Oracle) Signed-off-by: Inochi Amaoto Reviewed-by: Maxime Chevallier Reviewed-by: Simon Horman Link: https://patch.msgid.link/20251114003805.494387-3-inochiama@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index bf5457341ca8..65b0c3ca6a2b 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -2040,6 +2040,9 @@ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) return phydev->is_pseudo_fixed_link; } +phy_interface_t phy_fix_phy_mode_for_mac_delays(phy_interface_t interface, + bool mac_txid, bool mac_rxid); + int phy_save_page(struct phy_device *phydev); int phy_select_page(struct phy_device *phydev, int page); int phy_restore_page(struct phy_device *phydev, int oldpage, int ret); -- cgit v1.2.3 From fc45aee66223253ec5547094d7552819914abdfb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Mar 2025 00:06:29 -0400 Subject: get rid of kill_litter_super() Not used anymore. Signed-off-by: Al Viro --- include/linux/dcache.h | 1 - include/linux/fs.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6ec4066825e3..20a85144a00e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -198,7 +198,6 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. 
*/ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), - DCACHE_GENOCIDE = BIT(9), DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* diff --git a/include/linux/fs.h b/include/linux/fs.h index f5037c556f61..95933ceaae51 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2728,7 +2728,6 @@ void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); -void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); -- cgit v1.2.3 From ca459ca70f60ce05445845eca74c788b0d5ddb1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 25 Oct 2025 18:34:49 -0400 Subject: kill securityfs_recursive_remove() it's an unused alias for securityfs_remove() Acked-by: Paul Moore Signed-off-by: Al Viro --- include/linux/security.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/security.h b/include/linux/security.h index 92ac3f27b973..9e710cfee744 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2258,8 +2258,6 @@ static inline void securityfs_remove(struct dentry *dentry) #endif -#define securityfs_recursive_remove securityfs_remove - #ifdef CONFIG_BPF_SYSCALL union bpf_attr; struct bpf_map; -- cgit v1.2.3 From eb028c33451af08bb34f45c6be6967ef1c98cbd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Oct 2025 18:32:21 -0400 Subject: d_make_discardable(): warn if given a non-persistent dentry At this point there are very few call chains that might lead to d_make_discardable() on a dentry that hadn't been made persistent: calls of simple_unlink() and simple_rmdir() in configfs and apparmorfs. 
Both filesystems do pin (part of) their contents in dcache, but they are currently playing very unusual games with that. Converting them to more usual patterns might be possible, but it's definitely going to be a long series of changes in both cases. For now the easiest solution is to have both stop using simple_unlink() and simple_rmdir() - that allows to make d_make_discardable() warn when given a non-persistent dentry. Rather than giving them full-blown private copies (with calls of d_make_discardable() replaced with dput()), let's pull the parts of simple_unlink() and simple_rmdir() that deal with timestamps and link counts into separate helpers (__simple_unlink() and __simple_rmdir() resp.) and have those used by configfs and apparmorfs. Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 95933ceaae51..ef842adbd418 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3621,6 +3621,8 @@ extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); +extern void __simple_unlink(struct inode *, struct dentry *); +extern void __simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, -- cgit v1.2.3 From 7dc211c1159d991db609bdf4b0fb9033c04adcbc Mon Sep 17 00:00:00 2001 From: Pu Lehui Date: Sat, 15 Nov 2025 10:23:43 +0000 Subject: bpf: Fix invalid prog->stats access when update_effective_progs fails Syzkaller triggers an invalid memory access issue following fault injection in update_effective_progs. 
The issue can be described as follows: __cgroup_bpf_detach update_effective_progs compute_effective_progs bpf_prog_array_alloc <-- fault inject purge_effective_progs /* change to dummy_bpf_prog */ array->items[index] = &dummy_bpf_prog.prog ---softirq start--- __do_softirq ... __cgroup_bpf_run_filter_skb __bpf_prog_run_save_cb bpf_prog_run stats = this_cpu_ptr(prog->stats) /* invalid memory access */ flags = u64_stats_update_begin_irqsave(&stats->syncp) ---softirq end--- static_branch_dec(&cgroup_bpf_enabled_key[atype]) The reason is that fault injection caused update_effective_progs to fail and then changed the original prog into dummy_bpf_prog.prog in purge_effective_progs. Then a softirq came, and accessing the members of dummy_bpf_prog.prog in the softirq triggers invalid mem access. To fix it, skip updating stats when stats is NULL. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Pu Lehui Link: https://lore.kernel.org/r/20251115102343.2200727-1-pulehui@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/filter.h b/include/linux/filter.h index 973233b82dc1..569de3b14279 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -712,11 +712,13 @@ static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; - stats = this_cpu_ptr(prog->stats); - flags = u64_stats_update_begin_irqsave(&stats->syncp); - u64_stats_inc(&stats->cnt); - u64_stats_add(&stats->nsecs, duration); - u64_stats_update_end_irqrestore(&stats->syncp, flags); + if (likely(prog->stats)) { + stats = this_cpu_ptr(prog->stats); + flags = u64_stats_update_begin_irqsave(&stats->syncp); + u64_stats_inc(&stats->cnt); + u64_stats_add(&stats->nsecs, duration); + u64_stats_update_end_irqrestore(&stats->syncp, flags); + } } else { ret = dfunc(ctx, 
prog->insnsi, prog->bpf_func); } -- cgit v1.2.3 From 945865a0ddf3e3950aea32e23e10d815ee9b21bc Mon Sep 17 00:00:00 2001 From: Antheas Kapenekakis Date: Sun, 26 Oct 2025 20:16:34 +0100 Subject: ALSA: hda/tas2781: fix speaker id retrieval for multiple probes Currently, on ASUS projects, the TAS2781 codec attaches the speaker GPIO to the first tasdevice_priv instance using devm. This causes tas2781_read_acpi to fail on subsequent probes since the GPIO is already managed by the first device. This causes a failure on Xbox Ally X, because it has two amplifiers, and prevents us from quirking both the Xbox Ally and Xbox Ally X in the realtek codec driver. It is unnecessary to attach the GPIO to a device as it is static. Therefore, instead of attaching it and then reading it when loading the firmware, read its value directly in tas2781_read_acpi and store it in the private data structure. Then, make reading the value non-fatal so that ASUS projects that miss a speaker pin can still work, perhaps using fallback firmware. 
Fixes: 4e7035a75da9 ("ALSA: hda/tas2781: Add speaker id check for ASUS projects") Cc: stable@vger.kernel.org # 6.17 Signed-off-by: Antheas Kapenekakis Reviewed-by: Baojun Xu Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20251026191635.2447593-1-lkml@antheas.dev --- include/sound/tas2781.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index 0fbcdb15c74b..29d15ba65f04 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -197,7 +197,6 @@ struct tasdevice_priv { struct acoustic_data acou_data; #endif struct tasdevice_fw *fmw; - struct gpio_desc *speaker_id; struct gpio_desc *reset; struct mutex codec_lock; struct regmap *regmap; @@ -215,6 +214,7 @@ struct tasdevice_priv { unsigned int magic_num; unsigned int chip_id; unsigned int sysclk; + int speaker_id; int irq; int cur_prog; -- cgit v1.2.3 From ae8966b7b5bd69b86209cc34bcca1ba9f18b68e6 Mon Sep 17 00:00:00 2001 From: Peter Hutterer Date: Thu, 6 Nov 2025 21:45:34 +1000 Subject: Input: rename INPUT_PROP_HAPTIC_TOUCHPAD to INPUT_PROP_PRESSUREPAD And expand it to encompass all pressure pads. Definition: "pressure pad" as used here includes all touchpads that use physical pressure to convert to click, without physical hinges. Also called haptic touchpads in general parlance, Synaptics calls them ForcePads. Most (all?) pressure pads are currently advertised as INPUT_PROP_BUTTONPAD. The suggestion to identify them as pressure pads by defining the resolution on ABS_MT_PRESSURE has been in the docs since commit 20ccc8dd38a3 ("Documentation: input: define ABS_PRESSURE/ABS_MT_PRESSURE resolution as grams") but few devices provide this information. In userspace it's thus impossible to determine whether a device is a true pressure pad (pressure equals pressure) or a normal clickpad with (pressure equals finger size).
Commit 7075ae4ac9db ("Input: add INPUT_PROP_HAPTIC_TOUCHPAD") introduces INPUT_PROP_HAPTIC_TOUCHPAD but restricted it to those touchpads that have support for userspace-controlled effects. Let's expand and rename that definition to include all pressure pad touchpads since those that do support FF effects can be identified by the presence of the FF_HAPTIC bit. This means: - clickpad: INPUT_PROP_BUTTONPAD - pressurepad: INPUT_PROP_BUTTONPAD + INPUT_PROP_PRESSUREPAD - pressurepad with configurable haptics: INPUT_PROP_BUTTONPAD + INPUT_PROP_PRESSUREPAD + FF_HAPTIC Signed-off-by: Peter Hutterer Acked-by: Benjamin Tissoires Link: https://patch.msgid.link/20251106114534.GA405512@tassie Signed-off-by: Dmitry Torokhov --- include/uapi/linux/input-event-codes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 9cd89bcc1d9c..30f3c9eaafaa 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -27,7 +27,7 @@ #define INPUT_PROP_TOPBUTTONPAD 0x04 /* softbuttons at top of pad */ #define INPUT_PROP_POINTING_STICK 0x05 /* is a pointing stick */ #define INPUT_PROP_ACCELEROMETER 0x06 /* has accelerometer */ -#define INPUT_PROP_HAPTIC_TOUCHPAD 0x07 /* is a haptic touchpad */ +#define INPUT_PROP_PRESSUREPAD 0x07 /* pressure triggers clicks */ #define INPUT_PROP_MAX 0x1f #define INPUT_PROP_CNT (INPUT_PROP_MAX + 1) -- cgit v1.2.3 From 9c7dacf5d51910f34a3bd709403f6a82ffc8c960 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:14 +0100 Subject: platform/x86: asus-armoury: add apu-mem control support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the APU memory size control under the asus-armoury module using the fw_attributes class. This allows the APU allocated memory size to be adjusted depending on the users priority. 
A reboot is required after change. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Link: https://patch.msgid.link/20251102215319.3126879-5-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3cc235b20be4..9a6433d08973 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -136,6 +136,8 @@ /* dgpu on/off */ #define ASUS_WMI_DEVID_DGPU 0x00090020 +#define ASUS_WMI_DEVID_APU_MEM 0x000600C1 + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 -- cgit v1.2.3 From 7725a2dc58632cb44eeef2e5b959ab7b7931298d Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:16 +0100 Subject: platform/x86: asus-armoury: add screen auto-brightness toggle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add screen_auto_brightness toggle supported on some laptops. Signed-off-by: Denis Benato Signed-off-by: Luke D. 
Jones Reviewed-by: Mario Limonciello Reviewed-by: Ilpo Järvinen Link: https://patch.msgid.link/20251102215319.3126879-7-denis.benato@linux.dev Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 9a6433d08973..3af075baf9f7 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -82,6 +82,7 @@ #define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 #define ASUS_WMI_DEVID_MINI_LED_MODE 0x0005001E #define ASUS_WMI_DEVID_MINI_LED_MODE2 0x0005002E +#define ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS 0x0005002A /* Storage */ #define ASUS_WMI_DEVID_CARDREADER 0x00080013 -- cgit v1.2.3 From d849a9f2380d5287d5133eac5bae602a147b86c2 Mon Sep 17 00:00:00 2001 From: Denis Benato Date: Sun, 2 Nov 2025 22:53:18 +0100 Subject: platform/x86: asus-wmi: rename ASUS_WMI_DEVID_PPT_FPPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintain power-related WMI macros naming consistency: rename ASUS_WMI_DEVID_PPT_FPPT to ASUS_WMI_DEVID_PPT_PL3_FPPT. 
Link: https://lore.kernel.org/all/cad7b458-5a7a-4975-94a1-d0c74f6f3de5@oracle.com/ Suggested-by: ALOK TIWARI Signed-off-by: Denis Benato Link: https://.../ Link: https://patch.msgid.link/20251102215319.3126879-9-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index 3af075baf9f7..e7c95e9d29db 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -107,7 +107,7 @@ #define ASUS_WMI_DEVID_PPT_PL1_SPL 0x001200A3 #define ASUS_WMI_DEVID_PPT_APU_SPPT 0x001200B0 #define ASUS_WMI_DEVID_PPT_PLAT_SPPT 0x001200B1 -#define ASUS_WMI_DEVID_PPT_FPPT 0x001200C1 +#define ASUS_WMI_DEVID_PPT_PL3_FPPT 0x001200C1 #define ASUS_WMI_DEVID_NV_DYN_BOOST 0x001200C0 #define ASUS_WMI_DEVID_NV_THERM_TARGET 0x001200C2 -- cgit v1.2.3 From 39ae6c50e599aa0cf62ea3d0dcf06492f7690ed7 Mon Sep 17 00:00:00 2001 From: "Luke D. Jones" Date: Sun, 2 Nov 2025 22:53:19 +0100 Subject: platform/x86: asus-armoury: add ppt_* and nv_* tuning knobs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the ppt_* and nv_* tuning knobs that are available via WMI methods and adds proper min/max levels plus defaults. The min/max are defined by ASUS and typically gained by looking at what they allow in the ASUS Armoury Crate application - ASUS does not share the values outside of this. It could also be possible to gain the AMD values by use of ryzenadj and testing for the minimum stable value. The general rule of thumb for adding to the match table is that if the model range has a single CPU used throughout, then the DMI match can omit the last letter of the model number as this is the GPU model. 
If a min or max value is not provided it is assumed that the particular setting is not supported. For example, ppt_pl2_sppt_min/max is not set. If a _def is not set then the default is assumed to be _max. It is assumed that at least AC settings are available so that the firmware attributes will be created - if no DC table is available and power is on DC, then reading the attributes is -ENODEV. Co-developed-by: Denis Benato Signed-off-by: Denis Benato Signed-off-by: Luke D. Jones Reviewed-by: Mario Limonciello Tested-by: Mateusz Schyboll Tested-by: Porfet Lillian Link: https://patch.msgid.link/20251102215319.3126879-10-denis.benato@linux.dev Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/asus-wmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h index e7c95e9d29db..419491d4abca 100644 --- a/include/linux/platform_data/x86/asus-wmi.h +++ b/include/linux/platform_data/x86/asus-wmi.h @@ -139,6 +139,9 @@ #define ASUS_WMI_DEVID_APU_MEM 0x000600C1 +#define ASUS_WMI_DEVID_DGPU_BASE_TGP 0x00120099 +#define ASUS_WMI_DEVID_DGPU_SET_TGP 0x00120098 + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 -- cgit v1.2.3 From 32e3fee88a4ac183541b478f5bc94084ea76436c Mon Sep 17 00:00:00 2001 From: Armin Wolf Date: Tue, 11 Nov 2025 14:11:24 +0100 Subject: platform/x86: wmi: Remove extern keyword from prototypes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The external function definitions do not need the "extern" keyword. Remove it to silence the associated checkpatch warnings.
Signed-off-by: Armin Wolf Link: https://patch.msgid.link/20251111131125.3379-4-W_Armin@gmx.de Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/wmi.h | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/wmi.h b/include/linux/wmi.h index 10751c8e5e6a..665ea7dc8a92 100644 --- a/include/linux/wmi.h +++ b/include/linux/wmi.h @@ -36,13 +36,10 @@ struct wmi_device { */ #define to_wmi_device(device) container_of_const(device, struct wmi_device, dev) -extern acpi_status wmidev_evaluate_method(struct wmi_device *wdev, - u8 instance, u32 method_id, - const struct acpi_buffer *in, - struct acpi_buffer *out); +acpi_status wmidev_evaluate_method(struct wmi_device *wdev, u8 instance, u32 method_id, + const struct acpi_buffer *in, struct acpi_buffer *out); -extern union acpi_object *wmidev_block_query(struct wmi_device *wdev, - u8 instance); +union acpi_object *wmidev_block_query(struct wmi_device *wdev, u8 instance); acpi_status wmidev_block_set(struct wmi_device *wdev, u8 instance, const struct acpi_buffer *in); @@ -81,9 +78,9 @@ struct wmi_driver { */ #define to_wmi_driver(drv) container_of_const(drv, struct wmi_driver, driver) -extern int __must_check __wmi_driver_register(struct wmi_driver *driver, - struct module *owner); -extern void wmi_driver_unregister(struct wmi_driver *driver); +int __must_check __wmi_driver_register(struct wmi_driver *driver, struct module *owner); + +void wmi_driver_unregister(struct wmi_driver *driver); /** * wmi_driver_register() - Helper macro to register a WMI driver -- cgit v1.2.3 From 6eb2e056b0e418718fc5a3cfe79bdb41d9a2851d Mon Sep 17 00:00:00 2001 From: Dnyaneshwar Bhadane Date: Mon, 22 Sep 2025 20:33:15 +0530 Subject: drm/pcids: Split PTL pciids group to make wcl subplatform To form the WCL platform as a subplatform of PTL in definition, WCL pci ids are split into a separate group from PTL. So update the pciidlist struct to cover all the pci ids.
v2: - Squash wcl description in single patch for display and xe.(jani,gustavo) Fixes: 3c0f211bc8fc ("drm/xe: Add Wildcat Lake device IDs to PTL list") Signed-off-by: Dnyaneshwar Bhadane Reviewed-by: Gustavo Sousa Signed-off-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250922150317.2334680-2-dnyaneshwar.bhadane@intel.com (cherry picked from commit 32620e176443bf23ec81bfe8f177c6721a904864) Signed-off-by: Rodrigo Vivi [Rodrigo added the Fixes tag when porting it to fixes] --- include/drm/intel/pciids.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/intel/pciids.h b/include/drm/intel/pciids.h index da6301a6fcea..69d4ae92d822 100644 --- a/include/drm/intel/pciids.h +++ b/include/drm/intel/pciids.h @@ -877,7 +877,10 @@ MACRO__(0xB08F, ## __VA_ARGS__), \ MACRO__(0xB090, ## __VA_ARGS__), \ MACRO__(0xB0A0, ## __VA_ARGS__), \ - MACRO__(0xB0B0, ## __VA_ARGS__), \ + MACRO__(0xB0B0, ## __VA_ARGS__) + +/* WCL */ +#define INTEL_WCL_IDS(MACRO__, ...) \ MACRO__(0xFD80, ## __VA_ARGS__), \ MACRO__(0xFD81, ## __VA_ARGS__) -- cgit v1.2.3 From 80adaccf0e1c8c8fff44be2d959f6dba80af0491 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Nov 2025 13:52:13 +0300 Subject: rseq: Delete duplicate if statement in rseq_virt_userspace_exit() This if statement is indented weirdly. It's a duplicate and doesn't affect runtime (beyond wasting a little time). Delete it. 
Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/aRxP3YcwscrP1BU_@stanley.mountain --- include/linux/rseq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index b5e4803c4ebe..bf8a6bf315f3 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -126,7 +126,6 @@ static inline void rseq_force_update(void) */ static inline void rseq_virt_userspace_exit(void) { - if (current->rseq.event.sched_switch) /* * The generic optimization for deferring RSEQ updates until the next * exit relies on having a dedicated TIF_RSEQ. -- cgit v1.2.3 From 5bebe8de19264946d398ead4e6c20c229454a552 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Nov 2025 08:21:27 -0800 Subject: mm/huge_memory: Fix initialization of huge zero folio The recent fix to properly initialize the tags of the huge zero folio had an unfortunate not-so-subtle side effect: it caused the actual *contents* of the huge zero folio to not be initialized at all when the hardware didn't support the memory tagging. The reason was the unfortunate semantics of tag_clear_highpage(): on hardware that didn't do the tagging, it would silently just not do anything at all. And since this is done only on arm64 with MTE support, that basically meant most hardware. It wasn't necessarily immediately obvious since the huge zero page isn't necessarily very heavily used - or because it might already be zero because all-zeroes is the most common pattern. But it ends up causing random odd user space failures when you do hit it. The unfortunate semantics have been around for a while, but became a real bug only when we started actively using __GFP_ZEROTAGS in the generic get_huge_zero_folio() function - before that, it had only ever been used in code that checked that the hardware supported it. 
Fix this by simply changing the semantics of tag_clear_highpage() to return whether it actually successfully did something or not. While at it, also make it initialize multiple pages in one go, since that's actually what the only caller wants it to do and it simplifies the whole logic. Fixes: adfb6609c680 ("mm/huge_memory: initialise the tags of the huge zero folio") Link: https://lore.kernel.org/all/20251117082023.90176-1-00107082@163.com/ Reviewed-by: David Hildenbrand (Red Hat) Reported-and-tested-by: David Wang <00107082@163.com> Reported-and-tested-by: Carlos Llamas Signed-off-by: Linus Torvalds --- include/linux/highmem.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 105cc4c00cc3..abc20f9810fd 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -249,10 +249,12 @@ static inline void clear_highpage_kasan_tagged(struct page *page) kunmap_local(kaddr); } -#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE +#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGES -static inline void tag_clear_highpage(struct page *page) +/* Return false to let people know we did not initialize the pages */ +static inline bool tag_clear_highpages(struct page *page, int numpages) { + return false; } #endif -- cgit v1.2.3 From c57210bc15371caa06a5d4040e7d8aaeed4cb661 Mon Sep 17 00:00:00 2001 From: Alexey Minnekhanov Date: Sun, 16 Nov 2025 04:12:33 +0300 Subject: dt-bindings: clock: mmcc-sdm660: Add missing MDSS reset Add definition for display subsystem reset control, so display driver can reset display controller properly, clearing any configuration left there by bootloader. Since 6.17 after PM domains rework it became necessary for display to function. 
Fixes: 0e789b491ba0 ("pmdomain: core: Leave powered-on genpds on until sync_state") Cc: stable@vger.kernel.org # 6.17 Signed-off-by: Alexey Minnekhanov Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20251116-sdm660-mdss-reset-v2-1-6219bec0a97f@postmarketos.org Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,mmcc-sdm660.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,mmcc-sdm660.h b/include/dt-bindings/clock/qcom,mmcc-sdm660.h index f9dbc21cb5c7..ee2a89dae72d 100644 --- a/include/dt-bindings/clock/qcom,mmcc-sdm660.h +++ b/include/dt-bindings/clock/qcom,mmcc-sdm660.h @@ -157,6 +157,7 @@ #define BIMC_SMMU_GDSC 7 #define CAMSS_MICRO_BCR 0 +#define MDSS_BCR 1 #endif -- cgit v1.2.3 From 23818ebb9c76bac8dfedec252cf33157230efc23 Mon Sep 17 00:00:00 2001 From: Xuyang Dong Date: Tue, 30 Sep 2025 17:32:18 +0800 Subject: dt-bindings: reset: eswin: Documentation for eic7700 SoC Add device tree binding documentation and header file for the ESWIN eic7700 reset controller module. Signed-off-by: Yifeng Huang Signed-off-by: Xuyang Dong Reviewed-by: Krzysztof Kozlowski Signed-off-by: Philipp Zabel --- include/dt-bindings/reset/eswin,eic7700-reset.h | 298 ++++++++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 include/dt-bindings/reset/eswin,eic7700-reset.h (limited to 'include') diff --git a/include/dt-bindings/reset/eswin,eic7700-reset.h b/include/dt-bindings/reset/eswin,eic7700-reset.h new file mode 100644 index 000000000000..a370c9f74307 --- /dev/null +++ b/include/dt-bindings/reset/eswin,eic7700-reset.h @@ -0,0 +1,298 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright 2025, Beijing ESWIN Computing Technology Co., Ltd.. + * All rights reserved. + * + * Device Tree binding constants for EIC7700 reset controller. 
+ * + * Authors: + * Yifeng Huang + * Xuyang Dong + */ + +#ifndef __DT_ESWIN_EIC7700_RESET_H__ +#define __DT_ESWIN_EIC7700_RESET_H__ + +#define EIC7700_RESET_NOC_NSP 0 +#define EIC7700_RESET_NOC_CFG 1 +#define EIC7700_RESET_RNOC_NSP 2 +#define EIC7700_RESET_SNOC_TCU 3 +#define EIC7700_RESET_SNOC_U84 4 +#define EIC7700_RESET_SNOC_PCIE_XSR 5 +#define EIC7700_RESET_SNOC_PCIE_XMR 6 +#define EIC7700_RESET_SNOC_PCIE_PR 7 +#define EIC7700_RESET_SNOC_NPU 8 +#define EIC7700_RESET_SNOC_JTAG 9 +#define EIC7700_RESET_SNOC_DSP 10 +#define EIC7700_RESET_SNOC_DDRC1_P2 11 +#define EIC7700_RESET_SNOC_DDRC1_P1 12 +#define EIC7700_RESET_SNOC_DDRC0_P2 13 +#define EIC7700_RESET_SNOC_DDRC0_P1 14 +#define EIC7700_RESET_SNOC_D2D 15 +#define EIC7700_RESET_SNOC_AON 16 +#define EIC7700_RESET_GPU_AXI 17 +#define EIC7700_RESET_GPU_CFG 18 +#define EIC7700_RESET_GPU_GRAY 19 +#define EIC7700_RESET_GPU_JONES 20 +#define EIC7700_RESET_GPU_SPU 21 +#define EIC7700_RESET_DSP_AXI 22 +#define EIC7700_RESET_DSP_CFG 23 +#define EIC7700_RESET_DSP_DIV4 24 +#define EIC7700_RESET_DSP_DIV0 25 +#define EIC7700_RESET_DSP_DIV1 26 +#define EIC7700_RESET_DSP_DIV2 27 +#define EIC7700_RESET_DSP_DIV3 28 +#define EIC7700_RESET_D2D_AXI 29 +#define EIC7700_RESET_D2D_CFG 30 +#define EIC7700_RESET_D2D_PRST 31 +#define EIC7700_RESET_D2D_RAW_PCS 32 +#define EIC7700_RESET_D2D_RX 33 +#define EIC7700_RESET_D2D_TX 34 +#define EIC7700_RESET_D2D_CORE 35 +#define EIC7700_RESET_DDR1_ARST 36 +#define EIC7700_RESET_DDR1_TRACE 37 +#define EIC7700_RESET_DDR0_ARST 38 +#define EIC7700_RESET_DDR_CFG 39 +#define EIC7700_RESET_DDR0_TRACE 40 +#define EIC7700_RESET_DDR_CORE 41 +#define EIC7700_RESET_DDR_PRST 42 +#define EIC7700_RESET_TCU_AXI 43 +#define EIC7700_RESET_TCU_CFG 44 +#define EIC7700_RESET_TCU_TBU0 45 +#define EIC7700_RESET_TCU_TBU1 46 +#define EIC7700_RESET_TCU_TBU2 47 +#define EIC7700_RESET_TCU_TBU3 48 +#define EIC7700_RESET_TCU_TBU4 49 +#define EIC7700_RESET_TCU_TBU5 50 +#define EIC7700_RESET_TCU_TBU6 51 +#define 
EIC7700_RESET_TCU_TBU7 52 +#define EIC7700_RESET_TCU_TBU8 53 +#define EIC7700_RESET_TCU_TBU9 54 +#define EIC7700_RESET_TCU_TBU10 55 +#define EIC7700_RESET_TCU_TBU11 56 +#define EIC7700_RESET_TCU_TBU12 57 +#define EIC7700_RESET_TCU_TBU13 58 +#define EIC7700_RESET_TCU_TBU14 59 +#define EIC7700_RESET_TCU_TBU15 60 +#define EIC7700_RESET_TCU_TBU16 61 +#define EIC7700_RESET_NPU_AXI 62 +#define EIC7700_RESET_NPU_CFG 63 +#define EIC7700_RESET_NPU_CORE 64 +#define EIC7700_RESET_NPU_E31CORE 65 +#define EIC7700_RESET_NPU_E31BUS 66 +#define EIC7700_RESET_NPU_E31DBG 67 +#define EIC7700_RESET_NPU_LLC 68 +#define EIC7700_RESET_HSP_AXI 69 +#define EIC7700_RESET_HSP_CFG 70 +#define EIC7700_RESET_HSP_POR 71 +#define EIC7700_RESET_MSHC0_PHY 72 +#define EIC7700_RESET_MSHC1_PHY 73 +#define EIC7700_RESET_MSHC2_PHY 74 +#define EIC7700_RESET_MSHC0_TXRX 75 +#define EIC7700_RESET_MSHC1_TXRX 76 +#define EIC7700_RESET_MSHC2_TXRX 77 +#define EIC7700_RESET_SATA_ASIC0 78 +#define EIC7700_RESET_SATA_OOB 79 +#define EIC7700_RESET_SATA_PMALIVE 80 +#define EIC7700_RESET_SATA_RBC 81 +#define EIC7700_RESET_DMA0 82 +#define EIC7700_RESET_HSP_DMA 83 +#define EIC7700_RESET_USB0_VAUX 84 +#define EIC7700_RESET_USB1_VAUX 85 +#define EIC7700_RESET_HSP_SD1_PRST 86 +#define EIC7700_RESET_HSP_SD0_PRST 87 +#define EIC7700_RESET_HSP_EMMC_PRST 88 +#define EIC7700_RESET_HSP_DMA_PRST 89 +#define EIC7700_RESET_HSP_SD1_ARST 90 +#define EIC7700_RESET_HSP_SD0_ARST 91 +#define EIC7700_RESET_HSP_EMMC_ARST 92 +#define EIC7700_RESET_HSP_DMA_ARST 93 +#define EIC7700_RESET_HSP_ETH1_ARST 94 +#define EIC7700_RESET_HSP_ETH0_ARST 95 +#define EIC7700_RESET_SATA_ARST 96 +#define EIC7700_RESET_PCIE_CFG 97 +#define EIC7700_RESET_PCIE_POWEUP 98 +#define EIC7700_RESET_PCIE_PERST 99 +#define EIC7700_RESET_I2C0 100 +#define EIC7700_RESET_I2C1 101 +#define EIC7700_RESET_I2C2 102 +#define EIC7700_RESET_I2C3 103 +#define EIC7700_RESET_I2C4 104 +#define EIC7700_RESET_I2C5 105 +#define EIC7700_RESET_I2C6 106 +#define EIC7700_RESET_I2C7 107 
+#define EIC7700_RESET_I2C8 108 +#define EIC7700_RESET_I2C9 109 +#define EIC7700_RESET_FAN 110 +#define EIC7700_RESET_PVT0 111 +#define EIC7700_RESET_PVT1 112 +#define EIC7700_RESET_MBOX0 113 +#define EIC7700_RESET_MBOX1 114 +#define EIC7700_RESET_MBOX2 115 +#define EIC7700_RESET_MBOX3 116 +#define EIC7700_RESET_MBOX4 117 +#define EIC7700_RESET_MBOX5 118 +#define EIC7700_RESET_MBOX6 119 +#define EIC7700_RESET_MBOX7 120 +#define EIC7700_RESET_MBOX8 121 +#define EIC7700_RESET_MBOX9 122 +#define EIC7700_RESET_MBOX10 123 +#define EIC7700_RESET_MBOX11 124 +#define EIC7700_RESET_MBOX12 125 +#define EIC7700_RESET_MBOX13 126 +#define EIC7700_RESET_MBOX14 127 +#define EIC7700_RESET_MBOX15 128 +#define EIC7700_RESET_UART0 129 +#define EIC7700_RESET_UART1 130 +#define EIC7700_RESET_UART2 131 +#define EIC7700_RESET_UART3 132 +#define EIC7700_RESET_UART4 133 +#define EIC7700_RESET_GPIO0 134 +#define EIC7700_RESET_GPIO1 135 +#define EIC7700_RESET_TIMER 136 +#define EIC7700_RESET_SSI0 137 +#define EIC7700_RESET_SSI1 138 +#define EIC7700_RESET_WDT0 139 +#define EIC7700_RESET_WDT1 140 +#define EIC7700_RESET_WDT2 141 +#define EIC7700_RESET_WDT3 142 +#define EIC7700_RESET_LSP_CFG 143 +#define EIC7700_RESET_U84_CORE0 144 +#define EIC7700_RESET_U84_CORE1 145 +#define EIC7700_RESET_U84_CORE2 146 +#define EIC7700_RESET_U84_CORE3 147 +#define EIC7700_RESET_U84_BUS 148 +#define EIC7700_RESET_U84_DBG 149 +#define EIC7700_RESET_U84_TRACECOM 150 +#define EIC7700_RESET_U84_TRACE0 151 +#define EIC7700_RESET_U84_TRACE1 152 +#define EIC7700_RESET_U84_TRACE2 153 +#define EIC7700_RESET_U84_TRACE3 154 +#define EIC7700_RESET_SCPU_CORE 155 +#define EIC7700_RESET_SCPU_BUS 156 +#define EIC7700_RESET_SCPU_DBG 157 +#define EIC7700_RESET_LPCPU_CORE 158 +#define EIC7700_RESET_LPCPU_BUS 159 +#define EIC7700_RESET_LPCPU_DBG 160 +#define EIC7700_RESET_VC_CFG 161 +#define EIC7700_RESET_VC_AXI 162 +#define EIC7700_RESET_VC_MONCFG 163 +#define EIC7700_RESET_JD_CFG 164 +#define EIC7700_RESET_JD_AXI 165 +#define 
EIC7700_RESET_JE_CFG 166 +#define EIC7700_RESET_JE_AXI 167 +#define EIC7700_RESET_VD_CFG 168 +#define EIC7700_RESET_VD_AXI 169 +#define EIC7700_RESET_VE_AXI 170 +#define EIC7700_RESET_VE_CFG 171 +#define EIC7700_RESET_G2D_CORE 172 +#define EIC7700_RESET_G2D_CFG 173 +#define EIC7700_RESET_G2D_AXI 174 +#define EIC7700_RESET_VI_AXI 175 +#define EIC7700_RESET_VI_CFG 176 +#define EIC7700_RESET_VI_DWE 177 +#define EIC7700_RESET_DVP 178 +#define EIC7700_RESET_ISP0 179 +#define EIC7700_RESET_ISP1 180 +#define EIC7700_RESET_SHUTTR0 181 +#define EIC7700_RESET_SHUTTR1 182 +#define EIC7700_RESET_SHUTTR2 183 +#define EIC7700_RESET_SHUTTR3 184 +#define EIC7700_RESET_SHUTTR4 185 +#define EIC7700_RESET_SHUTTR5 186 +#define EIC7700_RESET_VO_MIPI 187 +#define EIC7700_RESET_VO_PRST 188 +#define EIC7700_RESET_VO_HDMI_PRST 189 +#define EIC7700_RESET_VO_HDMI_PHY 190 +#define EIC7700_RESET_VO_HDMI 191 +#define EIC7700_RESET_VO_I2S 192 +#define EIC7700_RESET_VO_I2S_PRST 193 +#define EIC7700_RESET_VO_AXI 194 +#define EIC7700_RESET_VO_CFG 195 +#define EIC7700_RESET_VO_DC 196 +#define EIC7700_RESET_VO_DC_PRST 197 +#define EIC7700_RESET_BOOTSPI_HRST 198 +#define EIC7700_RESET_BOOTSPI 199 +#define EIC7700_RESET_ANO1 200 +#define EIC7700_RESET_ANO0 201 +#define EIC7700_RESET_DMA1_ARST 202 +#define EIC7700_RESET_DMA1_HRST 203 +#define EIC7700_RESET_FPRT 204 +#define EIC7700_RESET_HBLOCK 205 +#define EIC7700_RESET_SECSR 206 +#define EIC7700_RESET_OTP 207 +#define EIC7700_RESET_PKA 208 +#define EIC7700_RESET_SPACC 209 +#define EIC7700_RESET_TRNG 210 +#define EIC7700_RESET_TIMER0_0 211 +#define EIC7700_RESET_TIMER0_1 212 +#define EIC7700_RESET_TIMER0_2 213 +#define EIC7700_RESET_TIMER0_3 214 +#define EIC7700_RESET_TIMER0_4 215 +#define EIC7700_RESET_TIMER0_5 216 +#define EIC7700_RESET_TIMER0_6 217 +#define EIC7700_RESET_TIMER0_7 218 +#define EIC7700_RESET_TIMER0_N 219 +#define EIC7700_RESET_TIMER1_0 220 +#define EIC7700_RESET_TIMER1_1 221 +#define EIC7700_RESET_TIMER1_2 222 +#define 
EIC7700_RESET_TIMER1_3 223 +#define EIC7700_RESET_TIMER1_4 224 +#define EIC7700_RESET_TIMER1_5 225 +#define EIC7700_RESET_TIMER1_6 226 +#define EIC7700_RESET_TIMER1_7 227 +#define EIC7700_RESET_TIMER1_N 228 +#define EIC7700_RESET_TIMER2_0 229 +#define EIC7700_RESET_TIMER2_1 230 +#define EIC7700_RESET_TIMER2_2 231 +#define EIC7700_RESET_TIMER2_3 232 +#define EIC7700_RESET_TIMER2_4 233 +#define EIC7700_RESET_TIMER2_5 234 +#define EIC7700_RESET_TIMER2_6 235 +#define EIC7700_RESET_TIMER2_7 236 +#define EIC7700_RESET_TIMER2_N 237 +#define EIC7700_RESET_TIMER3_0 238 +#define EIC7700_RESET_TIMER3_1 239 +#define EIC7700_RESET_TIMER3_2 240 +#define EIC7700_RESET_TIMER3_3 241 +#define EIC7700_RESET_TIMER3_4 242 +#define EIC7700_RESET_TIMER3_5 243 +#define EIC7700_RESET_TIMER3_6 244 +#define EIC7700_RESET_TIMER3_7 245 +#define EIC7700_RESET_TIMER3_N 246 +#define EIC7700_RESET_RTC 247 +#define EIC7700_RESET_MNOC_SNOC_NSP 248 +#define EIC7700_RESET_MNOC_VC 249 +#define EIC7700_RESET_MNOC_CFG 250 +#define EIC7700_RESET_MNOC_HSP 251 +#define EIC7700_RESET_MNOC_GPU 252 +#define EIC7700_RESET_MNOC_DDRC1_P3 253 +#define EIC7700_RESET_MNOC_DDRC0_P3 254 +#define EIC7700_RESET_RNOC_VO 255 +#define EIC7700_RESET_RNOC_VI 256 +#define EIC7700_RESET_RNOC_SNOC_NSP 257 +#define EIC7700_RESET_RNOC_CFG 258 +#define EIC7700_RESET_MNOC_DDRC1_P4 259 +#define EIC7700_RESET_MNOC_DDRC0_P4 260 +#define EIC7700_RESET_CNOC_VO_CFG 261 +#define EIC7700_RESET_CNOC_VI_CFG 262 +#define EIC7700_RESET_CNOC_VC_CFG 263 +#define EIC7700_RESET_CNOC_TCU_CFG 264 +#define EIC7700_RESET_CNOC_PCIE_CFG 265 +#define EIC7700_RESET_CNOC_NPU_CFG 266 +#define EIC7700_RESET_CNOC_LSP_CFG 267 +#define EIC7700_RESET_CNOC_HSP_CFG 268 +#define EIC7700_RESET_CNOC_GPU_CFG 269 +#define EIC7700_RESET_CNOC_DSPT_CFG 270 +#define EIC7700_RESET_CNOC_DDRT1_CFG 271 +#define EIC7700_RESET_CNOC_DDRT0_CFG 272 +#define EIC7700_RESET_CNOC_D2D_CFG 273 +#define EIC7700_RESET_CNOC_CFG 274 +#define EIC7700_RESET_CNOC_CLMM_CFG 275 +#define 
EIC7700_RESET_CNOC_AON_CFG 276 +#define EIC7700_RESET_LNOC_CFG 277 +#define EIC7700_RESET_LNOC_NPU_LLC 278 +#define EIC7700_RESET_LNOC_DDRC1_P0 279 +#define EIC7700_RESET_LNOC_DDRC0_P0 280 + +#endif /* __DT_ESWIN_EIC7700_RESET_H__ */ -- cgit v1.2.3 From 8bffbfdc01dff26f17f8b382266e71d48e63c5e9 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Wed, 22 Oct 2025 15:51:32 +0200 Subject: reset: remove legacy reset lookup code There are no more users of this code. Let's remove the exported symbols and the implementation from reset core. Reviewed-by: Philipp Zabel Signed-off-by: Bartosz Golaszewski [p.zabel@pengutronix.de: folded in 8e6ec20e-8965-4b42-99fc-0462269ff2f1@paulmck-laptop] Signed-off-by: Philipp Zabel --- include/linux/reset-controller.h | 33 --------------------------------- 1 file changed, 33 deletions(-) (limited to 'include') diff --git a/include/linux/reset-controller.h b/include/linux/reset-controller.h index 357df16ede32..46514cb1b9e0 100644 --- a/include/linux/reset-controller.h +++ b/include/linux/reset-controller.h @@ -26,31 +26,6 @@ struct module; struct device_node; struct of_phandle_args; -/** - * struct reset_control_lookup - represents a single lookup entry - * - * @list: internal list of all reset lookup entries - * @provider: name of the reset controller device controlling this reset line - * @index: ID of the reset controller in the reset controller device - * @dev_id: name of the device associated with this reset line - * @con_id: name of the reset line (can be NULL) - */ -struct reset_control_lookup { - struct list_head list; - const char *provider; - unsigned int index; - const char *dev_id; - const char *con_id; -}; - -#define RESET_LOOKUP(_provider, _index, _dev_id, _con_id) \ - { \ - .provider = _provider, \ - .index = _index, \ - .dev_id = _dev_id, \ - .con_id = _con_id, \ - } - /** * struct reset_controller_dev - reset controller entity that might * provide multiple reset controls @@ -90,9 +65,6 @@ void 
reset_controller_unregister(struct reset_controller_dev *rcdev); struct device; int devm_reset_controller_register(struct device *dev, struct reset_controller_dev *rcdev); - -void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries); #else static inline int reset_controller_register(struct reset_controller_dev *rcdev) { @@ -108,11 +80,6 @@ static inline int devm_reset_controller_register(struct device *dev, { return 0; } - -static inline void reset_controller_add_lookup(struct reset_control_lookup *lookup, - unsigned int num_entries) -{ -} #endif #endif -- cgit v1.2.3 From 5334eb9de76c74e24821aae89e111e27398b5add Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Tue, 14 Oct 2025 13:10:28 +0000 Subject: dt-bindings: reset: thead,th1520-reset: Remove non-VO-subsystem resets Registers in control of TH1520_RESET_ID_{NPU,WDT0,WDT1} belong to AP reset controller, not the VO one which is documented as "thead,th1520-reset" and is the only reset controller supported for TH1520 for now. Let's remove the IDs, leaving them to be implemented by AP-subsystem reset controller in the future. 
Fixes: 30e7573babdc ("dt-bindings: reset: Add T-HEAD TH1520 SoC Reset Controller") Signed-off-by: Yao Zi Acked-by: Rob Herring (Arm) Reviewed-by: Drew Fustini Acked-by: Guo Ren Signed-off-by: Philipp Zabel --- include/dt-bindings/reset/thead,th1520-reset.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/reset/thead,th1520-reset.h b/include/dt-bindings/reset/thead,th1520-reset.h index ee799286c175..e51d6314d131 100644 --- a/include/dt-bindings/reset/thead,th1520-reset.h +++ b/include/dt-bindings/reset/thead,th1520-reset.h @@ -9,9 +9,6 @@ #define TH1520_RESET_ID_GPU 0 #define TH1520_RESET_ID_GPU_CLKGEN 1 -#define TH1520_RESET_ID_NPU 2 -#define TH1520_RESET_ID_WDT0 3 -#define TH1520_RESET_ID_WDT1 4 #define TH1520_RESET_ID_DPU_AHB 5 #define TH1520_RESET_ID_DPU_AXI 6 #define TH1520_RESET_ID_DPU_CORE 7 -- cgit v1.2.3 From a35ac6f3bdb135debc8e1ff599d0009bc64dc329 Mon Sep 17 00:00:00 2001 From: Yao Zi Date: Tue, 14 Oct 2025 13:10:29 +0000 Subject: dt-bindings: reset: thead,th1520-reset: Add controllers for more subsys TH1520 SoC is divided into several subsystems, most of them have distinct reset controllers. Let's document reset controllers other than the one for VO subsystem and IDs for their reset signals. 
Signed-off-by: Yao Zi Acked-by: Rob Herring (Arm) Reviewed-by: Drew Fustini Acked-by: Guo Ren Signed-off-by: Philipp Zabel --- include/dt-bindings/reset/thead,th1520-reset.h | 216 +++++++++++++++++++++++++ 1 file changed, 216 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/reset/thead,th1520-reset.h b/include/dt-bindings/reset/thead,th1520-reset.h index e51d6314d131..ba6805b6b12a 100644 --- a/include/dt-bindings/reset/thead,th1520-reset.h +++ b/include/dt-bindings/reset/thead,th1520-reset.h @@ -7,6 +7,200 @@ #ifndef _DT_BINDINGS_TH1520_RESET_H #define _DT_BINDINGS_TH1520_RESET_H +/* AO Subsystem */ +#define TH1520_RESET_ID_SYSTEM 0 +#define TH1520_RESET_ID_RTC_APB 1 +#define TH1520_RESET_ID_RTC_REF 2 +#define TH1520_RESET_ID_AOGPIO_DB 3 +#define TH1520_RESET_ID_AOGPIO_APB 4 +#define TH1520_RESET_ID_AOI2C_APB 5 +#define TH1520_RESET_ID_PVT_APB 6 +#define TH1520_RESET_ID_E902_CORE 7 +#define TH1520_RESET_ID_E902_HAD 8 +#define TH1520_RESET_ID_AOTIMER_APB 9 +#define TH1520_RESET_ID_AOTIMER_CORE 10 +#define TH1520_RESET_ID_AOWDT_APB 11 +#define TH1520_RESET_ID_APSYS 12 +#define TH1520_RESET_ID_NPUSYS 13 +#define TH1520_RESET_ID_DDRSYS 14 +#define TH1520_RESET_ID_AXI_AP2CP 15 +#define TH1520_RESET_ID_AXI_CP2AP 16 +#define TH1520_RESET_ID_AXI_CP2SRAM 17 +#define TH1520_RESET_ID_AUDSYS_CORE 18 +#define TH1520_RESET_ID_AUDSYS_IOPMP 19 +#define TH1520_RESET_ID_AUDSYS 20 +#define TH1520_RESET_ID_DSP0 21 +#define TH1520_RESET_ID_DSP1 22 +#define TH1520_RESET_ID_GPU_MODULE 23 +#define TH1520_RESET_ID_VDEC 24 +#define TH1520_RESET_ID_VENC 25 +#define TH1520_RESET_ID_ADC_APB 26 +#define TH1520_RESET_ID_AUDGPIO_DB 27 +#define TH1520_RESET_ID_AUDGPIO_APB 28 +#define TH1520_RESET_ID_AOUART_IF 29 +#define TH1520_RESET_ID_AOUART_APB 30 +#define TH1520_RESET_ID_SRAM_AXI_P0 31 +#define TH1520_RESET_ID_SRAM_AXI_P1 32 +#define TH1520_RESET_ID_SRAM_AXI_P2 33 +#define TH1520_RESET_ID_SRAM_AXI_P3 34 +#define TH1520_RESET_ID_SRAM_AXI_P4 35 +#define 
TH1520_RESET_ID_SRAM_AXI_CORE 36 +#define TH1520_RESET_ID_SE 37 + +/* AP Subsystem */ +#define TH1520_RESET_ID_BROM 0 +#define TH1520_RESET_ID_C910_TOP 1 +#define TH1520_RESET_ID_NPU 2 +#define TH1520_RESET_ID_WDT0 3 +#define TH1520_RESET_ID_WDT1 4 +#define TH1520_RESET_ID_C910_C0 5 +#define TH1520_RESET_ID_C910_C1 6 +#define TH1520_RESET_ID_C910_C2 7 +#define TH1520_RESET_ID_C910_C3 8 +#define TH1520_RESET_ID_CHIP_DBG_CORE 9 +#define TH1520_RESET_ID_CHIP_DBG_AXI 10 +#define TH1520_RESET_ID_AXI4_CPUSYS2_AXI 11 +#define TH1520_RESET_ID_AXI4_CPUSYS2_APB 12 +#define TH1520_RESET_ID_X2H_CPUSYS 13 +#define TH1520_RESET_ID_AHB2_CPUSYS 14 +#define TH1520_RESET_ID_APB3_CPUSYS 15 +#define TH1520_RESET_ID_MBOX0_APB 16 +#define TH1520_RESET_ID_MBOX1_APB 17 +#define TH1520_RESET_ID_MBOX2_APB 18 +#define TH1520_RESET_ID_MBOX3_APB 19 +#define TH1520_RESET_ID_TIMER0_APB 20 +#define TH1520_RESET_ID_TIMER0_CORE 21 +#define TH1520_RESET_ID_TIMER1_APB 22 +#define TH1520_RESET_ID_TIMER1_CORE 23 +#define TH1520_RESET_ID_PERISYS_AHB 24 +#define TH1520_RESET_ID_PERISYS_APB1 25 +#define TH1520_RESET_ID_PERISYS_APB2 26 +#define TH1520_RESET_ID_GMAC0_APB 27 +#define TH1520_RESET_ID_GMAC0_AHB 28 +#define TH1520_RESET_ID_GMAC0_CLKGEN 29 +#define TH1520_RESET_ID_GMAC0_AXI 30 +#define TH1520_RESET_ID_UART0_APB 31 +#define TH1520_RESET_ID_UART0_IF 32 +#define TH1520_RESET_ID_UART1_APB 33 +#define TH1520_RESET_ID_UART1_IF 34 +#define TH1520_RESET_ID_UART2_APB 35 +#define TH1520_RESET_ID_UART2_IF 36 +#define TH1520_RESET_ID_UART3_APB 37 +#define TH1520_RESET_ID_UART3_IF 38 +#define TH1520_RESET_ID_UART4_APB 39 +#define TH1520_RESET_ID_UART4_IF 40 +#define TH1520_RESET_ID_UART5_APB 41 +#define TH1520_RESET_ID_UART5_IF 42 +#define TH1520_RESET_ID_QSPI0_IF 43 +#define TH1520_RESET_ID_QSPI0_APB 44 +#define TH1520_RESET_ID_QSPI1_IF 45 +#define TH1520_RESET_ID_QSPI1_APB 46 +#define TH1520_RESET_ID_SPI_IF 47 +#define TH1520_RESET_ID_SPI_APB 48 +#define TH1520_RESET_ID_I2C0_APB 49 +#define 
TH1520_RESET_ID_I2C0_CORE 50 +#define TH1520_RESET_ID_I2C1_APB 51 +#define TH1520_RESET_ID_I2C1_CORE 52 +#define TH1520_RESET_ID_I2C2_APB 53 +#define TH1520_RESET_ID_I2C2_CORE 54 +#define TH1520_RESET_ID_I2C3_APB 55 +#define TH1520_RESET_ID_I2C3_CORE 56 +#define TH1520_RESET_ID_I2C4_APB 57 +#define TH1520_RESET_ID_I2C4_CORE 58 +#define TH1520_RESET_ID_I2C5_APB 59 +#define TH1520_RESET_ID_I2C5_CORE 60 +#define TH1520_RESET_ID_GPIO0_DB 61 +#define TH1520_RESET_ID_GPIO0_APB 62 +#define TH1520_RESET_ID_GPIO1_DB 63 +#define TH1520_RESET_ID_GPIO1_APB 64 +#define TH1520_RESET_ID_GPIO2_DB 65 +#define TH1520_RESET_ID_GPIO2_APB 66 +#define TH1520_RESET_ID_PWM_COUNTER 67 +#define TH1520_RESET_ID_PWM_APB 68 +#define TH1520_RESET_ID_PADCTRL0_APB 69 +#define TH1520_RESET_ID_CPU2PERI_X2H 70 +#define TH1520_RESET_ID_CPU2AON_X2H 71 +#define TH1520_RESET_ID_AON2CPU_A2X 72 +#define TH1520_RESET_ID_NPUSYS_AXI 73 +#define TH1520_RESET_ID_NPUSYS_AXI_APB 74 +#define TH1520_RESET_ID_CPU2VP_X2P 75 +#define TH1520_RESET_ID_CPU2VI_X2H 76 +#define TH1520_RESET_ID_BMU_AXI 77 +#define TH1520_RESET_ID_BMU_APB 78 +#define TH1520_RESET_ID_DMAC_CPUSYS_AXI 79 +#define TH1520_RESET_ID_DMAC_CPUSYS_AHB 80 +#define TH1520_RESET_ID_SPINLOCK 81 +#define TH1520_RESET_ID_CFG2TEE 82 +#define TH1520_RESET_ID_DSMART 83 +#define TH1520_RESET_ID_GPIO3_DB 84 +#define TH1520_RESET_ID_GPIO3_APB 85 +#define TH1520_RESET_ID_PERI_I2S 86 +#define TH1520_RESET_ID_PERI_APB3 87 +#define TH1520_RESET_ID_PERI2PERI1_APB 88 +#define TH1520_RESET_ID_VPSYS_APB 89 +#define TH1520_RESET_ID_PERISYS_APB4 90 +#define TH1520_RESET_ID_GMAC1_APB 91 +#define TH1520_RESET_ID_GMAC1_AHB 92 +#define TH1520_RESET_ID_GMAC1_CLKGEN 93 +#define TH1520_RESET_ID_GMAC1_AXI 94 +#define TH1520_RESET_ID_GMAC_AXI 95 +#define TH1520_RESET_ID_GMAC_AXI_APB 96 +#define TH1520_RESET_ID_PADCTRL1_APB 97 +#define TH1520_RESET_ID_VOSYS_AXI 98 +#define TH1520_RESET_ID_VOSYS_AXI_APB 99 +#define TH1520_RESET_ID_VOSYS_AXI_X2X 100 +#define 
TH1520_RESET_ID_MISC2VP_X2X 101 +#define TH1520_RESET_ID_DSPSYS 102 +#define TH1520_RESET_ID_VISYS 103 +#define TH1520_RESET_ID_VOSYS 104 +#define TH1520_RESET_ID_VPSYS 105 + +/* DSP Subsystem */ +#define TH1520_RESET_ID_X2X_DSP1 0 +#define TH1520_RESET_ID_X2X_DSP0 1 +#define TH1520_RESET_ID_X2X_SLAVE_DSP1 2 +#define TH1520_RESET_ID_X2X_SLAVE_DSP0 3 +#define TH1520_RESET_ID_DSP0_CORE 4 +#define TH1520_RESET_ID_DSP0_DEBUG 5 +#define TH1520_RESET_ID_DSP0_APB 6 +#define TH1520_RESET_ID_DSP1_CORE 7 +#define TH1520_RESET_ID_DSP1_DEBUG 8 +#define TH1520_RESET_ID_DSP1_APB 9 +#define TH1520_RESET_ID_DSPSYS_APB 10 +#define TH1520_RESET_ID_AXI4_DSPSYS_SLV 11 +#define TH1520_RESET_ID_AXI4_DSPSYS 12 +#define TH1520_RESET_ID_AXI4_DSP_RS 13 + +/* MISC Subsystem */ +#define TH1520_RESET_ID_EMMC_SDIO_CLKGEN 0 +#define TH1520_RESET_ID_EMMC 1 +#define TH1520_RESET_ID_MISCSYS_AXI 2 +#define TH1520_RESET_ID_MISCSYS_AXI_APB 3 +#define TH1520_RESET_ID_SDIO0 4 +#define TH1520_RESET_ID_SDIO1 5 +#define TH1520_RESET_ID_USB3_APB 6 +#define TH1520_RESET_ID_USB3_PHY 7 +#define TH1520_RESET_ID_USB3_VCC 8 + +/* VI Subsystem */ +#define TH1520_RESET_ID_ISP0 0 +#define TH1520_RESET_ID_ISP1 1 +#define TH1520_RESET_ID_CSI0_APB 2 +#define TH1520_RESET_ID_CSI1_APB 3 +#define TH1520_RESET_ID_CSI2_APB 4 +#define TH1520_RESET_ID_MIPI_FIFO 5 +#define TH1520_RESET_ID_ISP_VENC_APB 6 +#define TH1520_RESET_ID_VIPRE_APB 7 +#define TH1520_RESET_ID_VIPRE_AXI 8 +#define TH1520_RESET_ID_DW200_APB 9 +#define TH1520_RESET_ID_VISYS3_AXI 10 +#define TH1520_RESET_ID_VISYS2_AXI 11 +#define TH1520_RESET_ID_VISYS1_AXI 12 +#define TH1520_RESET_ID_VISYS_AXI 13 +#define TH1520_RESET_ID_VISYS_APB 14 +#define TH1520_RESET_ID_ISP_VENC_AXI 15 + +/* VO Subsystem */ #define TH1520_RESET_ID_GPU 0 #define TH1520_RESET_ID_GPU_CLKGEN 1 #define TH1520_RESET_ID_DPU_AHB 5 @@ -16,5 +210,27 @@ #define TH1520_RESET_ID_DSI1_APB 9 #define TH1520_RESET_ID_HDMI 10 #define TH1520_RESET_ID_HDMI_APB 11 +#define TH1520_RESET_ID_VOAXI 12 +#define 
TH1520_RESET_ID_VOAXI_APB 13 +#define TH1520_RESET_ID_X2H_DPU_AXI 14 +#define TH1520_RESET_ID_X2H_DPU_AHB 15 +#define TH1520_RESET_ID_X2H_DPU1_AXI 16 +#define TH1520_RESET_ID_X2H_DPU1_AHB 17 + +/* VP Subsystem */ +#define TH1520_RESET_ID_VPSYS_AXI_APB 0 +#define TH1520_RESET_ID_VPSYS_AXI 1 +#define TH1520_RESET_ID_FCE_APB 2 +#define TH1520_RESET_ID_FCE_CORE 3 +#define TH1520_RESET_ID_FCE_X2X_MASTER 4 +#define TH1520_RESET_ID_FCE_X2X_SLAVE 5 +#define TH1520_RESET_ID_G2D_APB 6 +#define TH1520_RESET_ID_G2D_ACLK 7 +#define TH1520_RESET_ID_G2D_CORE 8 +#define TH1520_RESET_ID_VDEC_APB 9 +#define TH1520_RESET_ID_VDEC_ACLK 10 +#define TH1520_RESET_ID_VDEC_CORE 11 +#define TH1520_RESET_ID_VENC_APB 12 +#define TH1520_RESET_ID_VENC_CORE 13 #endif /* _DT_BINDINGS_TH1520_RESET_H */ -- cgit v1.2.3 From f3d8b64ee46c9b4b0b82b1a4642027728bac95b8 Mon Sep 17 00:00:00 2001 From: Encrow Thorne Date: Mon, 10 Nov 2025 14:10:37 +0800 Subject: reset: fix BIT macro reference RESET_CONTROL_FLAGS_BIT_* macros use BIT(), but reset.h does not include bits.h. This causes compilation errors when including reset.h standalone. Include bits.h to make reset.h self-contained. Suggested-by: Troy Mitchell Reviewed-by: Troy Mitchell Reviewed-by: Philipp Zabel Signed-off-by: Encrow Thorne Signed-off-by: Philipp Zabel --- include/linux/reset.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/reset.h b/include/linux/reset.h index 840d75d172f6..44f9e3415f92 100644 --- a/include/linux/reset.h +++ b/include/linux/reset.h @@ -2,6 +2,7 @@ #ifndef _LINUX_RESET_H_ #define _LINUX_RESET_H_ +#include <linux/bits.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> -- cgit v1.2.3 From 4edf654be5471659e3260be0a557eaa2ece668ab Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Wed, 12 Nov 2025 16:27:06 +0000 Subject: phy: add new phy_notify_state() api Add a new phy_notify_state() api that notifies and configures a phy for a given state transition.
This is intended to be used by phy drivers which need to do some runtime configuration of parameters that can't be handled by phy_calibrate() or phy_power_{on|off}(). The first usage of this API is in the Samsung UFS phy that needs to issue some register writes when entering and exiting the hibernate link state. Signed-off-by: Peter Griffin Reviewed-by: Neil Armstrong Link: https://patch.msgid.link/20251112-phy-notify-pmstate-v5-1-39df622d8fcb@linaro.org Signed-off-by: Vinod Koul --- include/linux/phy/phy.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/phy/phy.h b/include/linux/phy/phy.h index 13add0c2c407..2af0d01ebb39 100644 --- a/include/linux/phy/phy.h +++ b/include/linux/phy/phy.h @@ -53,6 +53,15 @@ enum phy_media { PHY_MEDIA_DAC, }; +enum phy_ufs_state { + PHY_UFS_HIBERN8_ENTER, + PHY_UFS_HIBERN8_EXIT, +}; + +union phy_notify { + enum phy_ufs_state ufs_state; +}; + /** * union phy_configure_opts - Opaque generic phy configuration * @@ -83,6 +92,7 @@ union phy_configure_opts { * @set_speed: set the speed of the phy (optional) * @reset: resetting the phy * @calibrate: calibrate the phy + * @notify_phystate: notify and configure the phy for a particular state * @release: ops to be performed while the consumer relinquishes the PHY * @owner: the module owner containing the ops */ @@ -132,6 +142,7 @@ struct phy_ops { int (*connect)(struct phy *phy, int port); int (*disconnect)(struct phy *phy, int port); + int (*notify_phystate)(struct phy *phy, union phy_notify state); void (*release)(struct phy *phy); struct module *owner; }; @@ -255,6 +266,7 @@ int phy_reset(struct phy *phy); int phy_calibrate(struct phy *phy); int phy_notify_connect(struct phy *phy, int port); int phy_notify_disconnect(struct phy *phy, int port); +int phy_notify_state(struct phy *phy, union phy_notify state); static inline int phy_get_bus_width(struct phy *phy) { return phy->attrs.bus_width; @@ -412,6 +424,13 @@ static inline int 
phy_notify_disconnect(struct phy *phy, int index) return -ENOSYS; } +static inline int phy_notify_state(struct phy *phy, union phy_notify state) +{ + if (!phy) + return 0; + return -ENOSYS; +} + static inline int phy_configure(struct phy *phy, union phy_configure_opts *opts) { -- cgit v1.2.3 From 01ba82702957225218c54c06ad2c2d468b83f510 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Chundru Date: Sat, 1 Nov 2025 09:29:33 +0530 Subject: PCI: Add .assert_perst() to control PCIe PERST# Controller driver probes first, enables link training and scans the bus. When the PCI bridge is found, its child DT nodes will be scanned and pwrctrl devices will be created if needed. By the time pwrctrl driver probe gets called, link training is already enabled by controller driver. Certain devices like TC9563, which uses the PCI pwrctl framework, need to configure the device before the PCIe link is up. As the controller driver already enables link training as part of its probe, the moment device is powered on, controller and device participate in link training and link can come up immediately and may not have time to configure the device. So we need to stop the link training by using assert_perst() by asserting PERST# and de-asserting PERST# after device is configured. 
Signed-off-by: Krishna Chaitanya Chundru Signed-off-by: Bjorn Helgaas Acked-by: Manivannan Sadhasivam Link: https://patch.msgid.link/20251101-tc9563-v9-2-de3429f7787a@oss.qualcomm.com --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d1fdf81fbe1e..ed5dac663e96 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -829,6 +829,7 @@ struct pci_ops { void __iomem *(*map_bus)(struct pci_bus *bus, unsigned int devfn, int where); int (*read)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val); int (*write)(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val); + int (*assert_perst)(struct pci_bus *bus, bool assert); }; /* -- cgit v1.2.3 From f5cb3ee251b4f9db2761aced191f10579bd7e64e Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 11 Nov 2025 14:06:17 +0000 Subject: ASoC: SDCA: Add companion amp Function Add companion amp into the list of allowed SDCA Functions. More work will be required to fully support companion amp, but this will let parts including companion amp functions boot and it is a good first step to proper support. Signed-off-by: Charles Keepax Reviewed-by: Richard Fitzgerald Tested-by: Richard Fitzgerald Link: https://patch.msgid.link/20251111140617.2997454-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 99cb978f7099..c97861508a15 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -64,6 +64,7 @@ struct sdca_function_desc; * @SDCA_FUNCTION_TYPE_RJ: Retaskable jack. * @SDCA_FUNCTION_TYPE_SIMPLE_JACK: Subset of UAJ. * @SDCA_FUNCTION_TYPE_HID: Human Interface Device, for e.g. buttons. + * @SDCA_FUNCTION_TYPE_COMPANION_AMP: Sources audio from another amp. 
* @SDCA_FUNCTION_TYPE_IMP_DEF: Implementation-defined function. * * SDCA Function Types from SDCA specification v1.0a Section 5.1.2 @@ -83,6 +84,7 @@ enum sdca_function_type { SDCA_FUNCTION_TYPE_RJ = 0x07, SDCA_FUNCTION_TYPE_SIMPLE_JACK = 0x08, SDCA_FUNCTION_TYPE_HID = 0x0A, + SDCA_FUNCTION_TYPE_COMPANION_AMP = 0x0B, SDCA_FUNCTION_TYPE_IMP_DEF = 0x1F, }; @@ -96,6 +98,7 @@ enum sdca_function_type { #define SDCA_FUNCTION_TYPE_RJ_NAME "RJ" #define SDCA_FUNCTION_TYPE_SIMPLE_NAME "SimpleJack" #define SDCA_FUNCTION_TYPE_HID_NAME "HID" +#define SDCA_FUNCTION_TYPE_COMPANION_AMP_NAME "CompanionAmp" #define SDCA_FUNCTION_TYPE_IMP_DEF_NAME "ImplementationDefined" /** -- cgit v1.2.3 From d9d0be59be2580f2c5e4b7217aafb980e8c371cf Mon Sep 17 00:00:00 2001 From: Martijn de Gouw Date: Mon, 17 Nov 2025 21:22:14 +0100 Subject: regulator: pca9450: Add support for setting debounce settings Make the different debounce timers configurable from the devicetree. Depending on the board design, these have to be set different than the default register values. 
Signed-off-by: Martijn de Gouw Link: https://patch.msgid.link/20251117202215.1936139-2-martijn.de.gouw@prodrive-technologies.com Signed-off-by: Mark Brown --- include/linux/regulator/pca9450.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/regulator/pca9450.h b/include/linux/regulator/pca9450.h index 85b4fecc10d8..0df8b3c48082 100644 --- a/include/linux/regulator/pca9450.h +++ b/include/linux/regulator/pca9450.h @@ -223,12 +223,44 @@ enum { #define IRQ_THERM_105 0x02 #define IRQ_THERM_125 0x01 +/* PCA9450_REG_PWRCTRL bits */ +#define T_ON_DEB_MASK 0xC0 +#define T_ON_DEB_120US (0 << 6) +#define T_ON_DEB_20MS (1 << 6) +#define T_ON_DEB_100MS (2 << 6) +#define T_ON_DEB_750MS (3 << 6) +#define T_OFF_DEB_MASK 0x20 +#define T_OFF_DEB_120US (0 << 5) +#define T_OFF_DEB_2MS (1 << 5) +#define T_ON_STEP_MASK 0x18 +#define T_ON_STEP_1MS (0 << 3) +#define T_ON_STEP_2MS (1 << 3) +#define T_ON_STEP_4MS (2 << 3) +#define T_ON_STEP_8MS (3 << 3) +#define T_OFF_STEP_MASK 0x06 +#define T_OFF_STEP_2MS (0 << 1) +#define T_OFF_STEP_4MS (1 << 1) +#define T_OFF_STEP_8MS (2 << 1) +#define T_OFF_STEP_16MS (3 << 1) +#define T_RESTART_MASK 0x01 +#define T_RESTART_250MS 0 +#define T_RESTART_500MS 1 + /* PCA9450_REG_RESET_CTRL bits */ #define WDOG_B_CFG_MASK 0xC0 #define WDOG_B_CFG_NONE 0x00 #define WDOG_B_CFG_WARM 0x40 #define WDOG_B_CFG_COLD_LDO12 0x80 #define WDOG_B_CFG_COLD 0xC0 +#define T_PMIC_RST_DEB_MASK 0x07 +#define T_PMIC_RST_DEB_10MS 0x00 +#define T_PMIC_RST_DEB_50MS 0x01 +#define T_PMIC_RST_DEB_100MS 0x02 +#define T_PMIC_RST_DEB_500MS 0x03 +#define T_PMIC_RST_DEB_1S 0x04 +#define T_PMIC_RST_DEB_2S 0x05 +#define T_PMIC_RST_DEB_4S 0x06 +#define T_PMIC_RST_DEB_8S 0x07 /* PCA9450_REG_CONFIG2 bits */ #define I2C_LT_MASK 0x03 -- cgit v1.2.3 From d85b56af22f371409cbf667bab26f938e6528d2e Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 15 Oct 2025 17:56:30 +0200 Subject: efi: Fix trailing whitespace in 
header file Resolve an issue with the coding style. Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 0b9eb3d2ff97..60e994096e20 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -290,7 +290,7 @@ typedef efi_status_t efi_get_variable_t (efi_char16_t *name, efi_guid_t *vendor, unsigned long *data_size, void *data); typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor); -typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, +typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); -- cgit v1.2.3 From 17029cdd8f9d0182a6499e0b7bfc6391e8463091 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 15 Oct 2025 17:56:33 +0200 Subject: efi/libstub: gop: Add support for reading EDID Add support for EFI_EDID_DISCOVERED_PROTOCOL and EFI_EDID_ACTIVE_PROTOCOL as defined in UEFI 2.8, sec 12.9. Define GUIDs and data structures in the respective header files. In the GOP setup function, read the EDID of the primary GOP device. First try EFI_EDID_ACTIVE_PROTOCOL, which supports user-specified EDID data. Or else try EFI_EDID_DISCOVERED_PROTOCOL, which returns the display device's native EDID. If no EDID could be retrieved, clear the storage. Rename efi_setup_gop() to efi_setup_graphics() to reflect the changes. Let callers pass an optional instance of struct edid_data, if they are interested. While screen_info and edid_info come from the same device handle, they should be considered independent data. The former refers to the graphics mode, the latter refers to the display device. GOP devices might not provide both. 
Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Signed-off-by: Ard Biesheuvel --- include/linux/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/efi.h b/include/linux/efi.h index 60e994096e20..a01f3fe20dab 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -373,6 +373,8 @@ void efi_native_runtime_setup(void); #define EFI_DEVICE_PATH_TO_TEXT_PROTOCOL_GUID EFI_GUID(0x8b843e20, 0x8132, 0x4852, 0x90, 0xcc, 0x55, 0x1a, 0x4e, 0x4a, 0x7f, 0x1c) #define EFI_DEVICE_PATH_FROM_TEXT_PROTOCOL_GUID EFI_GUID(0x05c99a21, 0xc70f, 0x4ad2, 0x8a, 0x5f, 0x35, 0xdf, 0x33, 0x43, 0xf5, 0x1e) #define EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID EFI_GUID(0x9042a9de, 0x23dc, 0x4a38, 0x96, 0xfb, 0x7a, 0xde, 0xd0, 0x80, 0x51, 0x6a) +#define EFI_EDID_DISCOVERED_PROTOCOL_GUID EFI_GUID(0x1c0c34f6, 0xd380, 0x41fa, 0xa0, 0x49, 0x8a, 0xd0, 0x6c, 0x1a, 0x66, 0xaa) +#define EFI_EDID_ACTIVE_PROTOCOL_GUID EFI_GUID(0xbd8c1056, 0x9f36, 0x44ec, 0x92, 0xa8, 0xa6, 0x33, 0x7f, 0x81, 0x79, 0x86) #define EFI_PCI_IO_PROTOCOL_GUID EFI_GUID(0x4cf5b200, 0x68b8, 0x4ca5, 0x9e, 0xec, 0xb2, 0x3e, 0x3f, 0x50, 0x02, 0x9a) #define EFI_FILE_INFO_ID EFI_GUID(0x09576e92, 0x6d3f, 0x11d2, 0x8e, 0x39, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_SYSTEM_RESOURCE_TABLE_GUID EFI_GUID(0xb122a263, 0x3661, 0x4f68, 0x99, 0x29, 0x78, 0xf8, 0xb0, 0xd6, 0x21, 0x80) -- cgit v1.2.3 From 4d24145a7833c14a6521dfab57c5f10076a0110f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 11 Nov 2025 15:49:45 +0100 Subject: devres: Remove unused devm_free_percpu() Remove unused devm_free_percpu(). By the way, it was never used in the drivers/ from day 1. 
Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20251111145046.997309-2-andriy.shevchenko@linux.intel.com Signed-off-by: Danilo Krummrich --- include/linux/device.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index b031ff71a5bd..0c6377f6631c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -298,7 +298,6 @@ void device_remove_bin_file(struct device *dev, void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align); -void devm_free_percpu(struct device *dev, void __percpu *pdata); struct device_dma_parameters { /* -- cgit v1.2.3 From 42adb2d4ef24d2834cbd3bb96a6660826ae763da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Nov 2025 13:04:06 -0800 Subject: fs: Add the __data_racy annotation to backing_dev_info.ra_pages Some but not all .ra_pages changes happen while block layer I/O is paused with blk_mq_freeze_queue(). Filesystems may read .ra_pages even while block layer I/O is paused, e.g. from inside their .fadvise callback. Annotating all .ra_pages reads with READ_ONCE() would be cumbersome. Hence, add the __data_racy annotation to the .ra_pages member variable. 
Cc: Alexander Viro Cc: Christian Brauner Cc: Nilay Shroff Signed-off-by: Bart Van Assche Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c5c9d89c73ed..30f4bd9ff7c8 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -168,7 +168,9 @@ struct backing_dev_info { u64 id; struct rb_node rb_node; /* keyed by ->id */ struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ + /* max readahead in PAGE_SIZE units */ + unsigned long __data_racy ra_pages; + unsigned long io_pages; /* max allowed IO size */ struct kref refcnt; /* Reference counter for the structure */ -- cgit v1.2.3 From 935a20d1bebf6236076785fac3ff81e3931834e9 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 14 Nov 2025 13:04:07 -0800 Subject: block: Remove queue freezing from several sysfs store callbacks Freezing the request queue from inside sysfs store callbacks may cause a deadlock in combination with the dm-multipath driver and the queue_if_no_path option. Additionally, freezing the request queue slows down system boot on systems where sysfs attributes are set synchronously. Fix this by removing the blk_mq_freeze_queue() / blk_mq_unfreeze_queue() calls from the store callbacks that do not strictly need these callbacks. Add the __data_racy annotation to request_queue.rq_timeout to suppress KCSAN data race reports about the rq_timeout reads. This patch may cause a small delay in applying the new settings. For all the attributes affected by this patch, I/O will complete correctly whether the old or the new value of the attribute is used. 
This patch affects the following sysfs attributes: * io_poll_delay * io_timeout * nomerges * read_ahead_kb * rq_affinity Here is an example of a deadlock triggered by running test srp/002 if this patch is not applied: task:multipathd Call Trace: __schedule+0x8c1/0x1bf0 schedule+0xdd/0x270 schedule_preempt_disabled+0x1c/0x30 __mutex_lock+0xb89/0x1650 mutex_lock_nested+0x1f/0x30 dm_table_set_restrictions+0x823/0xdf0 __bind+0x166/0x590 dm_swap_table+0x2a7/0x490 do_resume+0x1b1/0x610 dev_suspend+0x55/0x1a0 ctl_ioctl+0x3a5/0x7e0 dm_ctl_ioctl+0x12/0x20 __x64_sys_ioctl+0x127/0x1a0 x64_sys_call+0xe2b/0x17d0 do_syscall_64+0x96/0x3a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 task:(udev-worker) Call Trace: __schedule+0x8c1/0x1bf0 schedule+0xdd/0x270 blk_mq_freeze_queue_wait+0xf2/0x140 blk_mq_freeze_queue_nomemsave+0x23/0x30 queue_ra_store+0x14e/0x290 queue_attr_store+0x23e/0x2c0 sysfs_kf_write+0xde/0x140 kernfs_fop_write_iter+0x3b2/0x630 vfs_write+0x4fd/0x1390 ksys_write+0xfd/0x230 __x64_sys_write+0x76/0xc0 x64_sys_call+0x276/0x17d0 do_syscall_64+0x96/0x3a0 entry_SYSCALL_64_after_hwframe+0x4b/0x53 Cc: Christoph Hellwig Cc: Ming Lei Cc: Nilay Shroff Cc: Martin Wilck Cc: Benjamin Marzinski Cc: stable@vger.kernel.org Fixes: af2814149883 ("block: freeze the queue in queue_attr_store") Signed-off-by: Bart Van Assche Reviewed-by: Nilay Shroff Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2fff8a80dbd2..cb4ba09959ee 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -495,7 +495,7 @@ struct request_queue { */ unsigned long queue_flags; - unsigned int rq_timeout; + unsigned int __data_racy rq_timeout; unsigned int queue_depth; -- cgit v1.2.3 From b190eaea57803da00a4318ba12359625337be9e8 Mon Sep 17 00:00:00 2001 From: Taniya Das Date: Tue, 18 Nov 2025 12:47:08 +0530 Subject: dt-bindings: clock: qcom: Add SM8750 video clock 
controller Add compatible string for SM8750 video clock controller and the bindings for SM8750 Qualcomm SoC. Signed-off-by: Taniya Das Reviewed-by: Rob Herring (Arm) Link: https://lore.kernel.org/r/20251118-sm8750-videocc-v2-v4-4-049882a70c9f@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,sm8750-videocc.h | 40 +++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 include/dt-bindings/clock/qcom,sm8750-videocc.h (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,sm8750-videocc.h b/include/dt-bindings/clock/qcom,sm8750-videocc.h new file mode 100644 index 000000000000..f3bfa2ba5160 --- /dev/null +++ b/include/dt-bindings/clock/qcom,sm8750-videocc.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries. + */ + +#ifndef _DT_BINDINGS_CLK_QCOM_VIDEO_CC_SM8750_H +#define _DT_BINDINGS_CLK_QCOM_VIDEO_CC_SM8750_H + +/* VIDEO_CC clocks */ +#define VIDEO_CC_AHB_CLK 0 +#define VIDEO_CC_AHB_CLK_SRC 1 +#define VIDEO_CC_MVS0_CLK 2 +#define VIDEO_CC_MVS0_CLK_SRC 3 +#define VIDEO_CC_MVS0_DIV_CLK_SRC 4 +#define VIDEO_CC_MVS0_FREERUN_CLK 5 +#define VIDEO_CC_MVS0_SHIFT_CLK 6 +#define VIDEO_CC_MVS0C_CLK 7 +#define VIDEO_CC_MVS0C_DIV2_DIV_CLK_SRC 8 +#define VIDEO_CC_MVS0C_FREERUN_CLK 9 +#define VIDEO_CC_MVS0C_SHIFT_CLK 10 +#define VIDEO_CC_PLL0 11 +#define VIDEO_CC_SLEEP_CLK 12 +#define VIDEO_CC_SLEEP_CLK_SRC 13 +#define VIDEO_CC_XO_CLK 14 +#define VIDEO_CC_XO_CLK_SRC 15 + +/* VIDEO_CC power domains */ +#define VIDEO_CC_MVS0_GDSC 0 +#define VIDEO_CC_MVS0C_GDSC 1 + +/* VIDEO_CC resets */ +#define VIDEO_CC_INTERFACE_BCR 0 +#define VIDEO_CC_MVS0_BCR 1 +#define VIDEO_CC_MVS0C_CLK_ARES 2 +#define VIDEO_CC_MVS0C_BCR 3 +#define VIDEO_CC_MVS0_FREERUN_CLK_ARES 4 +#define VIDEO_CC_MVS0C_FREERUN_CLK_ARES 5 +#define VIDEO_CC_XO_CLK_ARES 6 + +#endif -- cgit v1.2.3 From c84b824d3a8f14bedec8108cb8061da761180f49 Mon Sep 17 00:00:00 
2001 From: Konrad Dybcio Date: Tue, 18 Nov 2025 18:33:11 +0100 Subject: dt-bindings: clock: qcom: x1e80100-dispcc: Add USB4 router link resets The router link clock branches also feature some reset logic, which is required to properly power sequence the hardware for DP tunneling over USB4. Describe these missing resets. Signed-off-by: Konrad Dybcio Link: https://lore.kernel.org/r/20251118-topic-usb4_x1e_dispcc-v1-1-14c68d842c71@oss.qualcomm.com Signed-off-by: Bjorn Andersson --- include/dt-bindings/clock/qcom,x1e80100-dispcc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/qcom,x1e80100-dispcc.h b/include/dt-bindings/clock/qcom,x1e80100-dispcc.h index d4a83e4fd0d1..49b3a9e5ce4a 100644 --- a/include/dt-bindings/clock/qcom,x1e80100-dispcc.h +++ b/include/dt-bindings/clock/qcom,x1e80100-dispcc.h @@ -90,6 +90,9 @@ #define DISP_CC_MDSS_CORE_BCR 0 #define DISP_CC_MDSS_CORE_INT2_BCR 1 #define DISP_CC_MDSS_RSCC_BCR 2 +#define DISP_CC_MDSS_DPTX0_USB_ROUTER_LINK_INTF_CLK_ARES 3 +#define DISP_CC_MDSS_DPTX1_USB_ROUTER_LINK_INTF_CLK_ARES 4 +#define DISP_CC_MDSS_DPTX2_USB_ROUTER_LINK_INTF_CLK_ARES 5 /* DISP_CC GDSCR */ #define MDSS_GDSC 0 -- cgit v1.2.3 From 0e854e55356908386605714e66f98c3985d9e266 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 14 Nov 2025 12:13:23 -0800 Subject: bpf: Always charge/uncharge memory when allocating/unlinking storage elements Since commit a96a44aba556 ("bpf: bpf_sk_storage: Fix invalid wait context lockdep report"), {charge,uncharge}_mem are always true when allocating a bpf_local_storage_elem or unlinking a bpf_local_storage_elem from local storage, so drop these arguments. No functional change. 
Signed-off-by: Amery Hung Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20251114201329.3275875-2-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 782f58feea35..3663eabcc3ff 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -184,7 +184,7 @@ void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, - bool charge_mem, bool swap_uptrs, gfp_t gfp_flags); + bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, struct bpf_local_storage_map *smap, -- cgit v1.2.3 From e76a33e1c7186526c2c133af73ea70da9275e1ba Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 14 Nov 2025 12:13:24 -0800 Subject: bpf: Remove smap argument from bpf_selem_free() Since selem already saves a pointer to smap, use it instead of an additional argument in bpf_selem_free(). This requires moving the SDATA(selem)->smap assignment from bpf_selem_link_map() to bpf_selem_alloc() since bpf_selem_free() may be called without the selem being linked to smap in bpf_local_storage_update(). 
Signed-off-by: Amery Hung Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20251114201329.3275875-3-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 3663eabcc3ff..4ab137e75f33 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -187,7 +187,6 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool swap_uptrs, gfp_t gfp_flags); void bpf_selem_free(struct bpf_local_storage_elem *selem, - struct bpf_local_storage_map *smap, bool reuse_now); int -- cgit v1.2.3 From 39a460c4253e4a437b6b372f462c0c043026784d Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 14 Nov 2025 12:13:25 -0800 Subject: bpf: Save memory allocation info in bpf_local_storage Save the memory allocation method used for bpf_local_storage in the struct explicitly so that we don't need to go through the hassle to find out the info. When a later patch replaces BPF memory allocator with kmalloc_nolock(), bpf_local_storage_free() will no longer need smap->storage_ma to return the memory and completely remove the dependency on smap in bpf_local_storage_free(). 
Signed-off-by: Amery Hung Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20251114201329.3275875-4-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 4ab137e75f33..7fef0cec8340 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -97,6 +97,7 @@ struct bpf_local_storage { */ struct rcu_head rcu; raw_spinlock_t lock; /* Protect adding/removing from the "list" */ + bool bpf_ma; }; /* U16_MAX is much more than enough for sk local storage -- cgit v1.2.3 From f484f4a3e058b5641670ebaeb301c06589848521 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Fri, 14 Nov 2025 12:13:26 -0800 Subject: bpf: Replace bpf memory allocator with kmalloc_nolock() in local storage Replace bpf memory allocator with kmalloc_nolock() to reduce memory wastage due to preallocation. In bpf_selem_free(), an selem now needs to wait for an RCU grace period before being freed when reuse_now == true. Therefore, rcu_barrier() should always be called in bpf_local_storage_map_free(). In bpf_local_storage_free(), since smap->storage_ma is no longer needed to return the memory, the function is now independent from smap. Remove the outdated comment in bpf_local_storage_alloc(). We already free selem after an RCU grace period in bpf_local_storage_update() when bpf_local_storage_alloc() failed the cmpxchg since commit c0d63f309186 ("bpf: Add bpf_selem_free()"). 
Signed-off-by: Amery Hung Reviewed-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20251114201329.3275875-5-ameryhung@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 7fef0cec8340..66432248cd81 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -53,9 +53,7 @@ struct bpf_local_storage_map { u32 bucket_log; u16 elem_size; u16 cache_idx; - struct bpf_mem_alloc selem_ma; - struct bpf_mem_alloc storage_ma; - bool bpf_ma; + bool use_kmalloc_nolock; }; struct bpf_local_storage_data { @@ -97,7 +95,7 @@ struct bpf_local_storage { */ struct rcu_head rcu; raw_spinlock_t lock; /* Protect adding/removing from the "list" */ - bool bpf_ma; + bool use_kmalloc_nolock; }; /* U16_MAX is much more than enough for sk local storage @@ -131,7 +129,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); struct bpf_map * bpf_local_storage_map_alloc(union bpf_attr *attr, struct bpf_local_storage_cache *cache, - bool bpf_ma); + bool use_kmalloc_nolock); void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, -- cgit v1.2.3 From fbb9933666e31f84c62e9620e9ec4d220ee31ab4 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Mon, 17 Nov 2025 23:42:08 +0200 Subject: net/mlx5: Abort new commands if all command slots are stalled In case of a FW issue, FW might be not responding to FW commands, causing kernel lockout for a long period of time, e.g. rtnl_lock held while ethtool is trying to collect stats waiting for FW to respond to multiple commands, when all of them will timeout. While there's no immediate indication of the FW lockout, we can safely assume that something is wrong when all command slots are busy and in a timeout state and no FW completion was received on any of them. 
In such case, start immediately failing new commands. Signed-off-by: Saeed Mahameed Reviewed-by: Moshe Shemesh Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763415729-1238421-5-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 046396269ccf..7aec53371cf0 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -819,6 +819,7 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context); enum { MLX5_CMD_ENT_STATE_PENDING_COMP, + MLX5_CMD_ENT_STATE_TIMEDOUT, }; struct mlx5_cmd_work_ent { -- cgit v1.2.3 From 922a6f34c1756d2b0c35d9b2d915b8af19e85965 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 18 Nov 2025 10:46:31 +0800 Subject: autofs: dont trigger mount if it cant succeed If a mount namespace contains autofs mounts, and they are propagation private, and there is no namespace specific automount daemon to handle possible automounting then attempted path resolution will loop until MAXSYMLINKS is reached before failing causing quite a bit of noise in the log. Add a check for this in autofs ->d_automount() so that the VFS can immediately return an error in this case. Since the mount is propagation private an EPERM return seems most appropriate. 
Suggested by: Christian Brauner Signed-off-by: Ian Kent Link: https://patch.msgid.link/20251118024631.10854-2-raven@themaw.net Signed-off-by: Christian Brauner --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..a5c2077ce6ed 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3269,6 +3269,7 @@ extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); +u64 vfsmount_to_propagation_flags(struct vfsmount *mnt); extern char *file_path(struct file *, char *, int); -- cgit v1.2.3 From 0a75f3d90e7ab9cd182327fca4b4e3bce379afe5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 11 Nov 2025 15:49:46 +0100 Subject: devres: Move devm_alloc_percpu() and related to devres.h Move devm_alloc_percpu() and related to devres.h where it belongs. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20251111145046.997309-3-andriy.shevchenko@linux.intel.com [ Fix minor typo in commit message. - Danilo ] Signed-off-by: Danilo Krummrich --- include/linux/device.h | 18 ------------------ include/linux/device/devres.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index 0c6377f6631c..0be95294b6e6 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -281,24 +281,6 @@ int __must_check device_create_bin_file(struct device *dev, void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); -/** - * devm_alloc_percpu - Resource-managed alloc_percpu - * @dev: Device to allocate per-cpu memory for - * @type: Type to allocate per-cpu memory for - * - * Managed alloc_percpu. Per-cpu memory allocated with this function is - * automatically freed on driver detach. 
- * - * RETURNS: - * Pointer to allocated memory on success, NULL on failure. - */ -#define devm_alloc_percpu(dev, type) \ - ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \ - __alignof__(type))) - -void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, - size_t align); - struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about diff --git a/include/linux/device/devres.h b/include/linux/device/devres.h index 8c5f57e0d613..9c1e3d643d69 100644 --- a/include/linux/device/devres.h +++ b/include/linux/device/devres.h @@ -9,6 +9,7 @@ #include #include #include +#include struct device; struct device_node; @@ -96,6 +97,22 @@ devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap); char * __printf(3, 4) __malloc devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...); +/** + * devm_alloc_percpu - Resource-managed alloc_percpu + * @dev: Device to allocate per-cpu memory for + * @type: Type to allocate per-cpu memory for + * + * Managed alloc_percpu. Per-cpu memory allocated with this function is + * automatically freed on driver detach. + * + * RETURNS: + * Pointer to allocated memory on success, NULL on failure. 
+ */ +#define devm_alloc_percpu(dev, type) \ + ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), __alignof__(type))) + +void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align); + unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); void devm_free_pages(struct device *dev, unsigned long addr); -- cgit v1.2.3 From a0c83150eea5807dbedf786f55cd49b14af118a8 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Wed, 12 Nov 2025 09:10:10 +0530 Subject: platform/x86/intel: Introduce Intel Elkhart Lake PSE I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel Elkhart Lake Programmable Service Engine (PSE) includes two PCI devices that expose two different capabilities of GPIO and Timed I/O as a single PCI function through shared MMIO with below layout. GPIO: 0x0000 - 0x1000 TIO: 0x1000 - 0x2000 This driver enumerates the PCI parent device and creates auxiliary child devices for these capabilities. The actual functionalities are provided by their respective auxiliary drivers. Signed-off-by: Raag Jadav Acked-by: Ilpo Järvinen Link: https://lore.kernel.org/r/20251112034040.457801-2-raag.jadav@intel.com Signed-off-by: Bartosz Golaszewski --- include/linux/ehl_pse_io_aux.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 include/linux/ehl_pse_io_aux.h (limited to 'include') diff --git a/include/linux/ehl_pse_io_aux.h b/include/linux/ehl_pse_io_aux.h new file mode 100644 index 000000000000..afb8587ee5fb --- /dev/null +++ b/include/linux/ehl_pse_io_aux.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Intel Elkhart Lake PSE I/O Auxiliary Device + * + * Copyright (c) 2025 Intel Corporation. 
+ * + * Author: Raag Jadav + */ + +#ifndef _EHL_PSE_IO_AUX_H_ +#define _EHL_PSE_IO_AUX_H_ + +#include + +#define EHL_PSE_IO_NAME "ehl_pse_io" +#define EHL_PSE_GPIO_NAME "gpio" +#define EHL_PSE_TIO_NAME "pps_tio" + +struct ehl_pse_io_data { + struct resource mem; + int irq; +}; + +#endif /* _EHL_PSE_IO_AUX_H_ */ -- cgit v1.2.3 From c200892b46ba3df3dd210b7117a463ec283600c3 Mon Sep 17 00:00:00 2001 From: Coiby Xu Date: Wed, 19 Nov 2025 22:03:25 +0800 Subject: ima: Access decompressed kernel module to verify appended signature Currently, when in-kernel module decompression (CONFIG_MODULE_DECOMPRESS) is enabled, IMA has no way to verify the appended module signature as it can't decompress the module. Define a new kernel_read_file_id enumerate READING_MODULE_COMPRESSED so IMA can calculate the compressed kernel module data hash on READING_MODULE_COMPRESSED and defer appraising/measuring it until on READING_MODULE when the module has been decompressed. Before enabling in-kernel module decompression, a kernel module in initramfs can still be loaded with ima_policy=secure_boot. So adjust the kernel module rule in secure_boot policy to allow either an IMA signature OR an appended signature i.e. to use "appraise func=MODULE_CHECK appraise_type=imasig|modsig". 
Reported-by: Karel Srot Suggested-by: Mimi Zohar Suggested-by: Paul Moore Signed-off-by: Coiby Xu Signed-off-by: Mimi Zohar --- include/linux/kernel_read_file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/kernel_read_file.h b/include/linux/kernel_read_file.h index 90451e2e12bd..d613a7b4dd35 100644 --- a/include/linux/kernel_read_file.h +++ b/include/linux/kernel_read_file.h @@ -14,6 +14,7 @@ id(KEXEC_INITRAMFS, kexec-initramfs) \ id(POLICY, security-policy) \ id(X509_CERTIFICATE, x509-certificate) \ + id(MODULE_COMPRESSED, kernel-module-compressed) \ id(MAX_ID, ) #define __fid_enumify(ENUM, dummy) READING_ ## ENUM, -- cgit v1.2.3 From 79301c7d605a10efea35af08167e0a362d8dffb1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Nov 2025 16:54:02 +0800 Subject: mm: add spurious fault fixing support for huge pmd The page faults may be spurious because of the racy access to the page table. For example, a non-populated virtual page is accessed on 2 CPUs simultaneously, thus the page faults are triggered on both CPUs. However, it's possible that one CPU (say CPU A) cannot find the reason for the page fault if the other CPU (say CPU B) has changed the page table before the PTE is checked on CPU A. Most of the time, the spurious page faults can be ignored safely. However, if the page fault is for the write access, it's possible that a stale read-only TLB entry exists in the local CPU and needs to be flushed on some architectures. This is called the spurious page fault fixing. In the current kernel, there is spurious fault fixing support for pte, but not for huge pmd because no architectures need it. But in the next patch in the series, we will change the write protection fault handling logic on arm64, so that some stale huge pmd entries may remain in the TLB. These entries need to be flushed via the huge pmd spurious fault fixing mechanism. 
Signed-off-by: Huang Ying Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Zi Yan Cc: Will Deacon Cc: Andrew Morton Cc: Vlastimil Babka Cc: Baolin Wang Cc: Ryan Roberts Cc: Yang Shi Cc: Christoph Lameter (Ampere) Cc: Dev Jain Cc: Barry Song Cc: Anshuman Khandual Cc: Kefeng Wang Cc: Kevin Brodsky Cc: Yin Fengwei Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Catalin Marinas --- include/linux/huge_mm.h | 2 +- include/linux/pgtable.h | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f327d62fc985..887a632ce7a0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -11,7 +11,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); -void huge_pmd_set_accessed(struct vm_fault *vmf); +bool huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..ee3148ef87f6 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1232,6 +1232,10 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address) #endif +#ifndef flush_tlb_fix_spurious_fault_pmd +#define flush_tlb_fix_spurious_fault_pmd(vma, address, pmdp) do { } while (0) +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. 
Although no -- cgit v1.2.3 From 4acbfb6c116be5989d5a0e38a48deca2d5b8bb92 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 22 Sep 2025 10:21:06 +0800 Subject: PM: wakeup: Add out-of-band system wakeup support for devices Some devices can wake up the system from suspend even when their power domains are turned off. This is possible because their system-wakeup logic resides in an always-on power domain - indicating that they support out-of-band system wakeup. Currently, PM domain core doesn't power off such devices if they are marked as system wakeup sources. To better represent devices with out-of-band wakeup capability, this patch introduces a new flag out_band_wakeup in 'struct dev_pm_info'. Two helper APIs are added: - device_set_out_band_wakeup() - to mark a device as having out-of-band wakeup capability. - device_out_band_wakeup() - to query the flag. This allows the PM core and drivers to distinguish between regular and out-of-band wakeup sources, enabling more accurate power management decisions. 
Signed-off-by: Peng Fan Reviewed-by: Dhruva Gole Signed-off-by: Ulf Hansson --- include/linux/pm.h | 1 + include/linux/pm_wakeup.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/pm.h b/include/linux/pm.h index cc7b2dc28574..5b28a4f2e87e 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -684,6 +684,7 @@ struct dev_pm_info { bool smart_suspend:1; /* Owned by the PM core */ bool must_resume:1; /* Owned by the PM core */ bool may_skip_resume:1; /* Set by subsystems */ + bool out_band_wakeup:1; bool strict_midlayer:1; #else bool should_wakeup:1; diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index c838b4a30f87..41e8f344a205 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -94,6 +94,16 @@ static inline void device_set_wakeup_path(struct device *dev) dev->power.wakeup_path = true; } +static inline void device_set_out_band_wakeup(struct device *dev) +{ + dev->power.out_band_wakeup = true; +} + +static inline bool device_out_band_wakeup(struct device *dev) +{ + return dev->power.out_band_wakeup; +} + /* drivers/base/power/wakeup.c */ extern struct wakeup_source *wakeup_source_register(struct device *dev, const char *name); @@ -162,6 +172,13 @@ static inline bool device_wakeup_path(struct device *dev) static inline void device_set_wakeup_path(struct device *dev) {} +static inline void device_set_out_band_wakeup(struct device *dev) {} + +static inline bool device_out_band_wakeup(struct device *dev) +{ + return false; +} + static inline void __pm_stay_awake(struct wakeup_source *ws) {} static inline void pm_stay_awake(struct device *dev) {} -- cgit v1.2.3 From 854825367a1d28b3b6c757134460d0fe29a0b4a6 Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 22 Sep 2025 10:21:09 +0800 Subject: usb: chipidea: ci_hdrc_imx: Set out of band wakeup for i.MX95 i.MX95 USB2 inside HSIOMIX could still wakeup Linux, even if HSIOMIX power domain(Digital logic) is off. 
There is still always-on logic that has the wakeup capability, i.e. an out-of-band wakeup capability. So use device_set_out_band_wakeup for i.MX95 to make sure usb2 could wakeup system even if HSIOMIX power domain is in off state. Tested-by: Xu Yang Reviewed-by: Xu Yang Signed-off-by: Peng Fan Acked-by: Peter Chen Signed-off-by: Ulf Hansson --- include/linux/usb/chipidea.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/usb/chipidea.h b/include/linux/usb/chipidea.h index e17ebeee24e3..c6451191d2de 100644 --- a/include/linux/usb/chipidea.h +++ b/include/linux/usb/chipidea.h @@ -66,6 +66,7 @@ struct ci_hdrc_platform_data { #define CI_HDRC_HAS_PORTSC_PEC_MISSED BIT(17) #define CI_HDRC_FORCE_VBUS_ACTIVE_ALWAYS BIT(18) #define CI_HDRC_HAS_SHORT_PKT_LIMIT BIT(19) +#define CI_HDRC_OUT_BAND_WAKEUP BIT(20) enum usb_dr_mode dr_mode; #define CI_HDRC_CONTROLLER_RESET_EVENT 0 #define CI_HDRC_CONTROLLER_STOPPED_EVENT 1 -- cgit v1.2.3 From ccde6525183c5489de293cf91a441585fff3c847 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 5 Nov 2025 10:54:07 +0100 Subject: smp: Introduce a helper function to check for pending IPIs When governors used during cpuidle try to find the most optimal idle state for a CPU or a group of CPUs, they are known to quite often fail. One reason for this is, that they are not taking into account whether there has been an IPI scheduled for any of the CPUs that are affected by the selected idle state. To enable pending IPIs to be taken into account for cpuidle decisions, introduce a new helper function, cpus_peek_for_pending_ipi(). 
Suggested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Signed-off-by: Ulf Hansson --- include/linux/smp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 18e9c918325e..91d0ecf3b8d3 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -168,6 +168,7 @@ int smp_call_function_any(const struct cpumask *mask, void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); +bool cpus_peek_for_pending_ipi(const struct cpumask *mask); /* * Generic and arch helpers @@ -216,6 +217,10 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } static inline void wake_up_all_idle_cpus(void) { } +static inline bool cpus_peek_for_pending_ipi(const struct cpumask *mask) +{ + return false; +} #define setup_max_cpus 0 -- cgit v1.2.3 From 796e29b857aed89f83f70f2c199585c45db5dc0f Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:31 +0000 Subject: ACPI / PPTT: Add a helper to fill a cpumask from a processor container The ACPI MPAM table uses the UID of a processor container specified in the PPTT to indicate the subset of CPUs and cache topology that can access each MPAM System Component (MSC). This information is not directly useful to the kernel. The equivalent cpumask is needed instead. Add a helper to find the processor container by its id, then walk the possible CPUs to fill a cpumask with the CPUs that have this processor container as a parent. 
CC: Dave Martin Reviewed-by: Sudeep Holla Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Hanjun Guo Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..4752ebd48132 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1541,6 +1541,7 @@ int find_acpi_cpu_topology(unsigned int cpu, int level); int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); +void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1562,6 +1563,8 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) { return -EINVAL; } +static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, + cpumask_t *cpus) { } #endif void acpi_arch_init(void); -- cgit v1.2.3 From 41a7bb39fede8ecc053c261b86cdfadea45b7b10 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:34 +0000 Subject: ACPI / PPTT: Find cache level by cache-id The MPAM table identifies caches by id. The MPAM driver also wants to know the cache level to determine if the platform is of the shape that can be managed via resctrl. Cacheinfo has this information, but only for CPUs that are online. Waiting for all CPUs to come online is a problem for platforms where CPUs are brought online late by user-space. Add a helper that walks every possible cache, until it finds the one identified by cache-id, then return the level. 
Signed-off-by: James Morse Signed-off-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 4752ebd48132..be074bdfd4d1 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1542,6 +1542,7 @@ int find_acpi_cpu_topology_cluster(unsigned int cpu); int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); +int find_acpi_cache_level_from_id(u32 cache_id); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1565,6 +1566,10 @@ static inline int find_acpi_cpu_topology_hetero_id(unsigned int cpu) } static inline void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus) { } +static inline int find_acpi_cache_level_from_id(u32 cache_id) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); -- cgit v1.2.3 From a39a723a6f1ed9a1602ccf8dd56392402afa7339 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:35 +0000 Subject: ACPI / PPTT: Add a helper to fill a cpumask from a cache_id MPAM identifies CPUs by the cache_id in the PPTT cache structure. The driver needs to know which CPUs are associated with the cache. The CPUs may not all be online, so cacheinfo does not have the information. Add a helper to pull this information out of the PPTT. 
CC: Rohit Mathew Reviewed-by: Gavin Shan Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Reviewed-by: Jonathan Cameron Reviewed-by: Jeremy Linton Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index be074bdfd4d1..a9dbacabdf89 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -1543,6 +1543,7 @@ int find_acpi_cpu_topology_package(unsigned int cpu); int find_acpi_cpu_topology_hetero_id(unsigned int cpu); void acpi_pptt_get_cpus_from_container(u32 acpi_cpu_id, cpumask_t *cpus); int find_acpi_cache_level_from_id(u32 cache_id); +int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, cpumask_t *cpus); #else static inline int acpi_pptt_cpu_is_thread(unsigned int cpu) { @@ -1570,6 +1571,11 @@ static inline int find_acpi_cache_level_from_id(u32 cache_id) { return -ENOENT; } +static inline int acpi_pptt_get_cpumask_from_cache_id(u32 cache_id, + cpumask_t *cpus) +{ + return -ENOENT; +} #endif void acpi_arch_init(void); -- cgit v1.2.3 From f5915600cc4ca0338a37d5a8a4032e25d939156b Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:22:37 +0000 Subject: platform: Define platform_device_put cleanup handler Define a cleanup helper for use with __free to destroy platform devices automatically when the pointer goes out of scope. This is only intended to be used in error cases and so should be used with return_ptr() or no_free_ptr() directly to avoid the automatic destruction on success. A first use of this is introduced in a subsequent commit. 
Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Fenghua Yu Reviewed-by: Shaopeng Tan Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/platform_device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index 074754c23d33..23a30ada2d4c 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -232,6 +232,7 @@ extern int platform_device_add_data(struct platform_device *pdev, extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); +DEFINE_FREE(platform_device_put, struct platform_device *, if (_T) platform_device_put(_T)) struct platform_driver { int (*probe)(struct platform_device *); -- cgit v1.2.3 From 96f4a4d53e6660d9b62e8d739388267fbb660e9f Mon Sep 17 00:00:00 2001 From: Ben Horgan Date: Wed, 19 Nov 2025 12:22:38 +0000 Subject: ACPI: Define acpi_put_table cleanup handler and acpi_get_table_pointer() helper Define a cleanup helper for use with __free to release the acpi table when the pointer goes out of scope. Also, introduce the helper acpi_get_table_pointer() to simplify a commonly used pattern involving acpi_get_table(). These are first used in a subsequent commit. 
Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Shaopeng Tan Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index a9dbacabdf89..ac8797f95236 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -8,6 +8,7 @@ #ifndef _LINUX_ACPI_H #define _LINUX_ACPI_H +#include #include #include /* for struct resource */ #include @@ -221,6 +222,17 @@ void acpi_reserve_initial_tables (void); void acpi_table_init_complete (void); int acpi_table_init (void); +static inline struct acpi_table_header *acpi_get_table_pointer(char *signature, u32 instance) +{ + struct acpi_table_header *table; + int status = acpi_get_table(signature, instance, &table); + + if (ACPI_FAILURE(status)) + return ERR_PTR(-ENOENT); + return table; +} +DEFINE_FREE(acpi_put_table, struct acpi_table_header *, if (!IS_ERR_OR_NULL(_T)) acpi_put_table(_T)) + int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init_or_acpilib acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, -- cgit v1.2.3 From 115c5325beae7199219ab7c12ec2a2af8dea6c3c Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:39 +0000 Subject: ACPI / MPAM: Parse the MPAM table Add code to parse the arm64 specific MPAM table, looking up the cache level from the PPTT and feeding the end result into the MPAM driver. This happens in two stages. Platform devices are created first for the MSC devices. Once the driver probes it calls acpi_mpam_parse_resources() to discover the RIS entries the MSC contains. For now the MPAM hook mpam_ris_create() is stubbed out, but will update the MPAM driver with optional discovered data about the RIS entries. 
CC: Carl Worth Link: https://developer.arm.com/documentation/den0065/3-0bet/?lang=en Reviewed-by: Lorenzo Pieralisi Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/arm_mpam.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 include/linux/arm_mpam.h (limited to 'include') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h new file mode 100644 index 000000000000..4b7f335181e0 --- /dev/null +++ b/include/linux/arm_mpam.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2025 Arm Ltd. */ + +#ifndef __LINUX_ARM_MPAM_H +#define __LINUX_ARM_MPAM_H + +#include +#include + +struct mpam_msc; + +enum mpam_msc_iface { + MPAM_IFACE_MMIO, /* a real MPAM MSC */ + MPAM_IFACE_PCC, /* a fake MPAM MSC */ +}; + +enum mpam_class_types { + MPAM_CLASS_CACHE, /* Caches, e.g. L2, L3 */ + MPAM_CLASS_MEMORY, /* Main memory */ + MPAM_CLASS_UNKNOWN, /* Everything else, e.g. 
SMMU */ +}; + +#define MPAM_CLASS_ID_DEFAULT 255 + +#ifdef CONFIG_ACPI_MPAM +int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc); + +int acpi_mpam_count_msc(void); +#else +static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, + struct acpi_mpam_msc_node *tbl_msc) +{ + return -EINVAL; +} + +static inline int acpi_mpam_count_msc(void) { return -EINVAL; } +#endif + +static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, + int component_id) +{ + return -EINVAL; +} + +#endif /* __LINUX_ARM_MPAM_H */ -- cgit v1.2.3 From 01fb4b8224726aa0f2170b63e4685cf0eec85d8d Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:41 +0000 Subject: arm_mpam: Add the class and component structures for firmware described ris An MSC is a container of resources, each identified by their RIS index. Some RIS are described by firmware to provide their position in the system. Others are discovered when the driver probes the hardware. To configure a resource it needs to be found by its class, e.g. 'L2'. There are two kinds of grouping, a class is a set of components, which are visible to user-space as there are likely to be multiple instances of the L2 cache. (e.g. one per cluster or package) Add support for creating and destroying structures to allow a hierarchy of resources to be created. 
Reviewed-by: Gavin Shan Reviewed-by: Jonathan Cameron Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: James Morse Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/arm_mpam.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 4b7f335181e0..13a8ac5c2cbd 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -37,11 +37,16 @@ static inline int acpi_mpam_parse_resources(struct mpam_msc *msc, static inline int acpi_mpam_count_msc(void) { return -EINVAL; } #endif +#ifdef CONFIG_ARM64_MPAM_DRIVER +int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, + enum mpam_class_types type, u8 class_id, int component_id); +#else static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, enum mpam_class_types type, u8 class_id, int component_id) { return -EINVAL; } +#endif #endif /* __LINUX_ARM_MPAM_H */ -- cgit v1.2.3 From bd221f9f82afb616887e0b88b43fbb937479d744 Mon Sep 17 00:00:00 2001 From: James Morse Date: Wed, 19 Nov 2025 12:22:44 +0000 Subject: arm_mpam: Probe hardware to find the supported partid/pmg values CPUs can generate traffic with a range of PARTID and PMG values, but each MSC may also have its own maximum size for these fields. Before MPAM can be used, the driver needs to probe each RIS on each MSC, to find the system-wide smallest value that can be used. The limits from requestors (e.g. CPUs) also need taking into account. While doing this, RIS entries that firmware didn't describe are created under MPAM_CLASS_UNKNOWN. This adds the low level MSC write accessors. While we're here, implement the mpam_register_requestor() call for the arch code to register the CPU limits. Future callers of this will tell us about the SMMU and ITS. 
Signed-off-by: James Morse Reviewed-by: Jonathan Cameron Reviewed-by: Ben Horgan Reviewed-by: Gavin Shan Reviewed-by: Shaopeng Tan Reviewed-by: Fenghua Yu Tested-by: Fenghua Yu Tested-by: Shaopeng Tan Tested-by: Peter Newman Tested-by: Carl Worth Tested-by: Gavin Shan Tested-by: Zeng Heng Tested-by: Hanjun Guo Signed-off-by: Ben Horgan Signed-off-by: Catalin Marinas --- include/linux/arm_mpam.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/linux/arm_mpam.h b/include/linux/arm_mpam.h index 13a8ac5c2cbd..7f00c5285a32 100644 --- a/include/linux/arm_mpam.h +++ b/include/linux/arm_mpam.h @@ -49,4 +49,18 @@ static inline int mpam_ris_create(struct mpam_msc *msc, u8 ris_idx, } #endif +/** + * mpam_register_requestor() - Register a requestor with the MPAM driver + * @partid_max: The maximum PARTID value the requestor can generate. + * @pmg_max: The maximum PMG value the requestor can generate. + * + * Registers a requestor with the MPAM driver to ensure the chosen system-wide + * minimum PARTID and PMG values will allow the requestors features to be used. + * + * Returns an error if the registration is too late, and a larger PARTID/PMG + * value has been advertised to user-space. In this case the requestor should + * not use its MPAM features. Returns 0 on success. + */ +int mpam_register_requestor(u16 partid_max, u8 pmg_max); + #endif /* __LINUX_ARM_MPAM_H */ -- cgit v1.2.3 From 934fa943b53795339486cc0026b3ab7ad39dc600 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Tue, 18 Nov 2025 03:11:08 -0800 Subject: net: mana: Handle SKB if TX SGEs exceed hardware limit The MANA hardware supports a maximum of 30 scatter-gather entries (SGEs) per TX WQE. Exceeding this limit can cause TX failures. Add ndo_features_check() callback to validate SKB layout before transmission. For GSO SKBs that would exceed the hardware SGE limit, clear NETIF_F_GSO_MASK to enforce software segmentation in the stack. 
Add a fallback in mana_start_xmit() to linearize non-GSO SKBs that still exceed the SGE limit. Also, add an ethtool counter for linearized SKBs. Co-developed-by: Dipayaan Roy Signed-off-by: Dipayaan Roy Signed-off-by: Aditya Garg Reviewed-by: Eric Dumazet Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1763464269-10431-2-git-send-email-gargaditya@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/net/mana/gdma.h | 8 +++++++- include/net/mana/mana.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 2e4f2f3175e5..a4cf307859f8 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -486,6 +486,8 @@ struct gdma_wqe { #define INLINE_OOB_SMALL_SIZE 8 #define INLINE_OOB_LARGE_SIZE 24 +#define MANA_MAX_TX_WQE_SGL_ENTRIES 30 + #define MAX_TX_WQE_SIZE 512 #define MAX_RX_WQE_SIZE 256 @@ -592,6 +594,9 @@ enum { #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) #define GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE BIT(6) +/* Driver supports linearizing the skb when num_sge exceeds hardware limit */ +#define GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE BIT(20) + /* Driver can send HWC periodically to query stats */ #define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21) @@ -605,7 +610,8 @@ enum { GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \ - GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY) + GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \ + GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE) #define GDMA_DRV_CAP_FLAGS2 0 diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index d37f4cea0ac3..fb28b3cac067 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -377,6 +377,7 @@ struct mana_ethtool_stats { u64 wake_queue; u64 tx_cqe_err; u64 tx_cqe_unknown_type; + u64 tx_linear_pkt_cnt; u64 rx_coalesced_err; u64 rx_cqe_unknown_type; }; -- cgit v1.2.3 From 
45120304e84171fd215c1b57b15b285446d15106 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Tue, 18 Nov 2025 03:11:09 -0800 Subject: net: mana: Drop TX skb on post_work_request failure and unmap resources Drop TX packets when posting the work request fails and ensure DMA mappings are always cleaned up. Signed-off-by: Aditya Garg Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1763464269-10431-3-git-send-email-gargaditya@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/net/mana/mana.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index fb28b3cac067..d7e089c6b694 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -593,6 +593,7 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); +void mana_unmap_skb(struct sk_buff *skb, struct mana_port_context *apc); extern const struct ethtool_ops mana_ethtool_ops; extern struct dentry *mana_debugfs_root; -- cgit v1.2.3 From 3fee828789b1cf294a8fc83ad8a37f644c174fae Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Sun, 16 Nov 2025 22:45:36 +0200 Subject: net/mlx5: Move the esw mode notifier chain outside the devlink lock The esw mode change notifier chain is initialized/cleaned up in mlx5_init_one() / mlx5_uninit_one() with the devlink lock held. Move the notifier head from the eswitch struct into mlx5_priv directly, and initialize it outside the critical section. This will allow notifier registration to happen earlier in the init procedure in subsequent patches. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763325940-1231508-3-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 7aec53371cf0..9a4a5112a59e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -599,6 +599,7 @@ struct mlx5_priv { struct mlx5_flow_steering *steering; struct mlx5_mpfs *mpfs; + struct blocking_notifier_head esw_n_head; struct mlx5_eswitch *eswitch; struct mlx5_core_sriov sriov; struct mlx5_lag *lag; -- cgit v1.2.3 From d3a356db853bc2dfb51034eacafd41aca7dd4c37 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Sun, 16 Nov 2025 22:45:37 +0200 Subject: net/mlx5: Move the vhca event notifier outside of the devlink lock The vhca event notifier consists of an atomic notifier for vhca state changes (used for SF events), multiple workqueues and a blocking notifier chain for delivering the vhca state change events for further processing. This patch moves the vhca notifier head outside of mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() / mlx5_mdev_uninit() functions. This allows called notifiers to grab the PF devlink lock which was previously impossible because it would create a circular lock dependency. mlx5_vhca_event_stop() is now called earlier in the cleanup phase and flushes the workqueues to ensure that after the call, there are no pending events. This simplifies the cleanup flow for vhca event consumers. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763325940-1231508-4-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9a4a5112a59e..88afb2788dc9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -488,7 +488,6 @@ struct mlx5_devcom_dev; struct mlx5_fw_reset; struct mlx5_eq_table; struct mlx5_irq_table; -struct mlx5_vhca_state_notifier; struct mlx5_sf_dev_table; struct mlx5_sf_hw_table; struct mlx5_sf_table; @@ -615,7 +614,8 @@ struct mlx5_priv { struct mlx5_bfreg_data bfregs; struct mlx5_sq_bfreg bfreg; #ifdef CONFIG_MLX5_SF - struct mlx5_vhca_state_notifier *vhca_state_notifier; + struct mlx5_nb vhca_state_nb; + struct blocking_notifier_head vhca_state_n_head; struct mlx5_sf_dev_table *sf_dev_table; struct mlx5_core_dev *parent_mdev; #endif -- cgit v1.2.3 From e63c9c5f0a4802deea81a48c2c40d0af56153e8a Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Sun, 16 Nov 2025 22:45:38 +0200 Subject: net/mlx5: Move the SF HW table notifier outside the devlink lock Move the SF HW table notifier registration/unregistration outside of mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() / mlx5_mdev_uninit() functions. This is only done for non-SFs, since SFs do not have a SF HW table themselves. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763325940-1231508-5-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 88afb2788dc9..d6c5bcebdaca 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -620,6 +620,7 @@ struct mlx5_priv { struct mlx5_core_dev *parent_mdev; #endif #ifdef CONFIG_MLX5_SF_MANAGER + struct notifier_block sf_hw_table_vhca_nb; struct mlx5_sf_hw_table *sf_hw_table; struct mlx5_sf_table *sf_table; #endif -- cgit v1.2.3 From d4a0acbd94c2a93bf308a9fde9ab6719f5d98c7a Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Sun, 16 Nov 2025 22:45:39 +0200 Subject: net/mlx5: Move the SF table notifiers outside the devlink lock Move the SF table notifiers registration/unregistration outside of mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() / mlx5_mdev_uninit() functions. This is only done for non-SFs, since SFs do not have a SF table themselves and thus don't need notifiers. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763325940-1231508-6-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d6c5bcebdaca..6af62047a614 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -622,6 +622,9 @@ struct mlx5_priv { #ifdef CONFIG_MLX5_SF_MANAGER struct notifier_block sf_hw_table_vhca_nb; struct mlx5_sf_hw_table *sf_hw_table; + struct notifier_block sf_table_esw_nb; + struct notifier_block sf_table_vhca_nb; + struct notifier_block sf_table_mdev_nb; struct mlx5_sf_table *sf_table; #endif struct blocking_notifier_head lag_nh; -- cgit v1.2.3 From 64ad6470c882fcaecfa4a1da96ea94de7ca0dc80 Mon Sep 17 00:00:00 2001 From: Cosmin Ratiu Date: Sun, 16 Nov 2025 22:45:40 +0200 Subject: net/mlx5: Move SF dev table notifier registration outside the PF devlink lock This completes the previous patches by moving notifier registration for SF dev tables outside the devlink locked critical section in mlx5_init_one() / mlx5_uninit_one() and into the mlx5_mdev_init() / mlx5_mdev_uninit() functions. This is only done for non-SFs, since SFs do not have a SF HW table themselves. After this patch, notifiers can grab the PF devlink lock (soon to be necessary) without creating a locking cycle. 
Signed-off-by: Cosmin Ratiu Reviewed-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/1763325940-1231508-7-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/linux/mlx5/driver.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6af62047a614..1c54aa6f74fb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -616,6 +616,7 @@ struct mlx5_priv { #ifdef CONFIG_MLX5_SF struct mlx5_nb vhca_state_nb; struct blocking_notifier_head vhca_state_n_head; + struct notifier_block sf_dev_nb; struct mlx5_sf_dev_table *sf_dev_table; struct mlx5_core_dev *parent_mdev; #endif -- cgit v1.2.3 From a77f0ad44fde89874654ba48f461209fb0382107 Mon Sep 17 00:00:00 2001 From: Pagadala Yesu Anjaneyulu Date: Wed, 12 Nov 2025 11:10:23 +0200 Subject: wifi: cfg80211: Add support for 6GHz AP role not relevant AP type Add IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT and map it to IEEE80211_REG_LPI_AP for safe regulatory compliance when AP role classification is not applicable. Use LPI as safe fallback to prevent power limit violations. 
Signed-off-by: Pagadala Yesu Anjaneyulu Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20251112110828.856283677cc7.I36138a34847c3b4e680974bf347dde844448f3bc@changeid Signed-off-by: Johannes Berg --- include/linux/ieee80211-he.h | 1 + include/net/cfg80211.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ieee80211-he.h b/include/linux/ieee80211-he.h index 904d50db5bb8..a08c446fbb04 100644 --- a/include/linux/ieee80211-he.h +++ b/include/linux/ieee80211-he.h @@ -548,6 +548,7 @@ static inline bool ieee80211_he_capa_size_ok(const u8 *data, u8 len) #define IEEE80211_6GHZ_CTRL_REG_VLP_AP 2 #define IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP 3 #define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD 4 +#define IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT 7 #define IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP 8 /** diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 625cb2c78361..3d3ed1932262 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -10148,6 +10148,7 @@ cfg80211_6ghz_power_type(u8 control, u32 client_flags) switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { case IEEE80211_6GHZ_CTRL_REG_LPI_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: + case IEEE80211_6GHZ_CTRL_REG_AP_ROLE_NOT_RELEVANT: return IEEE80211_REG_LPI_AP; case IEEE80211_6GHZ_CTRL_REG_SP_AP: case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: -- cgit v1.2.3 From ee19b52c31b3b111f140c1affd88eca1ed11edd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Draszik?= Date: Fri, 14 Nov 2025 14:10:59 +0000 Subject: mfd: sec: Use chained IRQs for s2mpg10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On S2MPG10 (and similar like S2MPG11), top-level interrupt status and mask registers exist which need to be unmasked to get the PMIC interrupts. 
This additional status doesn't seem to exist on other PMICs in the S2MP* family, and the S2MPG10 driver is manually dealing with masking and unmasking currently. The correct approach here is to register this hierarchy as chained interrupts, though, without any additional manual steps. Doing so will also simplify addition of other, similar, PMICs (like S2MPG11) in the future. Update the driver to do just that. Signed-off-by: André Draszik Link: https://patch.msgid.link/20251114-s2mpg10-chained-irq-v1-1-34ddfa49c4cd@linaro.org Signed-off-by: Lee Jones --- include/linux/mfd/samsung/irq.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mfd/samsung/irq.h b/include/linux/mfd/samsung/irq.h index b4805cbd949b..8402a5f8e18a 100644 --- a/include/linux/mfd/samsung/irq.h +++ b/include/linux/mfd/samsung/irq.h @@ -57,6 +57,12 @@ enum s2mpa01_irq { #define S2MPA01_IRQ_B24_TSD_MASK (1 << 4) #define S2MPA01_IRQ_B35_TSD_MASK (1 << 5) +enum s2mpg10_common_irq { + /* Top-level (common) block */ + S2MPG10_COMMON_IRQ_PMIC, + S2MPG10_COMMON_IRQ_UNUSED, +}; + enum s2mpg10_irq { /* PMIC */ S2MPG10_IRQ_PWRONF, -- cgit v1.2.3 From 4255545a28f75fb6082b6f91d1e7ada28383ab22 Mon Sep 17 00:00:00 2001 From: Chien Wong Date: Thu, 13 Nov 2025 22:05:08 +0800 Subject: wifi: mac80211: add generic MMIE struct defines The added struct is needed when writing generic handler for both CMAC-128 and CMAC-256. 
Signed-off-by: Chien Wong Link: https://patch.msgid.link/20251113140511.48658-3-m@xv97.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6d4bc80caf96..d55d8ea3a8be 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1207,7 +1207,7 @@ struct ieee80211_mgmt { #define IEEE80211_MIN_ACTION_SIZE offsetof(struct ieee80211_mgmt, u.action.u) -/* Management MIC information element (IEEE 802.11w) */ +/* Management MIC information element (IEEE 802.11w) for CMAC */ struct ieee80211_mmie { u8 element_id; u8 length; @@ -1225,6 +1225,15 @@ struct ieee80211_mmie_16 { u8 mic[16]; } __packed; +/* Management MIC information element (IEEE 802.11w) for all variants */ +struct ieee80211_mmie_var { + u8 element_id; + u8 length; + __le16 key_id; + u8 sequence_number[6]; + u8 mic[]; /* 8 or 16 bytes */ +} __packed; + struct ieee80211_vendor_ie { u8 element_id; u8 len; @@ -1889,6 +1898,9 @@ enum ieee80211_radio_measurement_actioncode { #define IEEE80211_GCMP_HDR_LEN 8 #define IEEE80211_GCMP_MIC_LEN 16 #define IEEE80211_GCMP_PN_LEN 6 +#define IEEE80211_CMAC_128_MIC_LEN 8 +#define IEEE80211_CMAC_256_MIC_LEN 16 +#define IEEE80211_GMAC_MIC_LEN 16 #define FILS_NONCE_LEN 16 #define FILS_MAX_KEK_LEN 64 -- cgit v1.2.3 From 77d7dc8bef482e987036bc204136bbda552d95cd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:45 +0100 Subject: sched/mmcid: Revert the complex CID management The CID management is a complex beast, which affects both scheduling and task migration. The compaction mechanism forces random tasks of a process into task work on exit to user space causing latency spikes. Revert to the initial simple bitmap allocation mechanics. They are known to have scalability issues, but reverting allows the replacement functionality to be built up gradually in a reviewable way.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.068197830@linutronix.de --- include/linux/mm_types.h | 53 ++---------------------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 90e5790c318f..63b8c1209e7b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -922,13 +922,9 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif -#ifdef CONFIG_SCHED_MM_CID struct mm_cid { - u64 time; - int cid; - int recent_cid; + unsigned int cid; }; -#endif /* * Opaque type representing current mm_struct flag state. Must be accessed via @@ -1000,12 +996,6 @@ struct mm_struct { * runqueue locks. */ struct mm_cid __percpu *pcpu_cid; - /* - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). - * - * When the next mm_cid scan is due (in jiffies). - */ - unsigned long mm_cid_next_scan; /** * @nr_cpus_allowed: Number of CPUs allowed for mm. * @@ -1013,14 +1003,6 @@ struct mm_struct { * threads allowed CPUs. */ unsigned int nr_cpus_allowed; - /** - * @max_nr_cid: Maximum number of allowed concurrency - * IDs allocated. - * - * Track the highest number of allowed concurrency IDs - * allocated for the mm. - */ - atomic_t max_nr_cid; /** * @cpus_allowed_lock: Lock protecting mm cpus_allowed. * @@ -1371,35 +1353,7 @@ static inline void vma_iter_init(struct vma_iterator *vmi, #ifdef CONFIG_SCHED_MM_CID -enum mm_cid_state { - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. 
*/ - MM_CID_LAZY_PUT = (1U << 31), -}; - -static inline bool mm_cid_is_unset(int cid) -{ - return cid == MM_CID_UNSET; -} - -static inline bool mm_cid_is_lazy_put(int cid) -{ - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); -} - -static inline bool mm_cid_is_valid(int cid) -{ - return !(cid & MM_CID_LAZY_PUT); -} - -static inline int mm_cid_set_lazy_put(int cid) -{ - return cid | MM_CID_LAZY_PUT; -} - -static inline int mm_cid_clear_lazy_put(int cid) -{ - return cid & ~MM_CID_LAZY_PUT; -} +#define MM_CID_UNSET (~0U) /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. @@ -1432,11 +1386,8 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); pcpu_cid->cid = MM_CID_UNSET; - pcpu_cid->recent_cid = MM_CID_UNSET; - pcpu_cid->time = 0; } mm->nr_cpus_allowed = p->nr_cpus_allowed; - atomic_set(&mm->max_nr_cid, 0); raw_spin_lock_init(&mm->cpus_allowed_lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); -- cgit v1.2.3 From 8cea569ca785060b8c5cc7800713ddc3b1548a94 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:47 +0100 Subject: sched/mmcid: Use proper data structures Having a lot of CID functionality specific members in struct task_struct and struct mm_struct is not really making the code easier to read. Encapsulate the CID specific parts in data structures and keep them separate from the stuff they are embedded in. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.131573768@linutronix.de --- include/linux/mm_types.h | 56 ++++++++++++---------------------------------- include/linux/rseq_types.h | 42 ++++++++++++++++++++++++++++++++++ include/linux/sched.h | 11 ++------- 3 files changed, 58 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 63b8c1209e7b..e4818e932a1d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -922,10 +923,6 @@ struct vm_area_struct { #define vma_policy(vma) NULL #endif -struct mm_cid { - unsigned int cid; -}; - /* * Opaque type representing current mm_struct flag state. Must be accessed via * mm_flags_xxx() helper functions. @@ -987,30 +984,9 @@ struct mm_struct { */ atomic_t mm_users; -#ifdef CONFIG_SCHED_MM_CID - /** - * @pcpu_cid: Per-cpu current cid. - * - * Keep track of the currently allocated mm_cid for each cpu. - * The per-cpu mm_cid values are serialized by their respective - * runqueue locks. - */ - struct mm_cid __percpu *pcpu_cid; - /** - * @nr_cpus_allowed: Number of CPUs allowed for mm. - * - * Number of CPUs allowed in the union of all mm's - * threads allowed CPUs. - */ - unsigned int nr_cpus_allowed; - /** - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. - * - * Provide mutual exclusion for mm cpus_allowed and - * mm nr_cpus_allowed updates. - */ - raw_spinlock_t cpus_allowed_lock; -#endif + /* MM CID related storage */ + struct mm_mm_cid mm_cid; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* size of all page tables */ #endif @@ -1352,9 +1328,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi, } #ifdef CONFIG_SCHED_MM_CID - -#define MM_CID_UNSET (~0U) - /* * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 
*/ @@ -1383,20 +1356,20 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) int i; for_each_possible_cpu(i) { - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); + struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i); - pcpu_cid->cid = MM_CID_UNSET; + pcpu->cid = MM_CID_UNSET; } - mm->nr_cpus_allowed = p->nr_cpus_allowed; - raw_spin_lock_init(&mm->cpus_allowed_lock); + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; + raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); cpumask_clear(mm_cidmask(mm)); } static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { - mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); - if (!mm->pcpu_cid) + mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); + if (!mm->mm_cid.pcpu) return -ENOMEM; mm_init_cid(mm, p); return 0; @@ -1405,8 +1378,8 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct * static inline void mm_destroy_cid(struct mm_struct *mm) { - free_percpu(mm->pcpu_cid); - mm->pcpu_cid = NULL; + free_percpu(mm->mm_cid.pcpu); + mm->mm_cid.pcpu = NULL; } static inline unsigned int mm_cid_size(void) @@ -1421,10 +1394,9 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas if (!mm) return; /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. 
*/ - raw_spin_lock(&mm->cpus_allowed_lock); + guard(raw_spinlock)(&mm->mm_cid.lock); cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); - raw_spin_unlock(&mm->cpus_allowed_lock); + WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); } #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 9c7a34154de8..e444dd267c7a 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -90,4 +90,46 @@ struct rseq_data { struct rseq_data { }; #endif /* !CONFIG_RSEQ */ +#ifdef CONFIG_SCHED_MM_CID + +#define MM_CID_UNSET (~0U) + +/** + * struct sched_mm_cid - Storage for per task MM CID data + * @active: MM CID is active for the task + * @cid: The CID associated to the task + * @last_cid: The last CID associated to the task + */ +struct sched_mm_cid { + unsigned int active; + unsigned int cid; + unsigned int last_cid; +}; + +/** + * struct mm_cid_pcpu - Storage for per CPU MM_CID data + * @cid: The CID associated to the CPU + */ +struct mm_cid_pcpu { + unsigned int cid; +}; + +/** + * struct mm_mm_cid - Storage for per MM CID data + * @pcpu: Per CPU storage for CIDs associated to a CPU + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map + * is growth only. + * @lock: Spinlock to protect all fields except @pcpu. It also protects + * the MM cid cpumask and the MM cidmask bitmap. 
+ */ +struct mm_mm_cid { + struct mm_cid_pcpu __percpu *pcpu; + unsigned int nr_cpus_allowed; + raw_spinlock_t lock; +}; +#else /* CONFIG_SCHED_MM_CID */ +struct mm_mm_cid { }; +struct sched_mm_cid { }; +#endif /* !CONFIG_SCHED_MM_CID */ + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e47abc8685d7..64f080d6ed6e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1407,14 +1407,7 @@ struct task_struct { #endif /* CONFIG_NUMA_BALANCING */ struct rseq_data rseq; - -#ifdef CONFIG_SCHED_MM_CID - int mm_cid; /* Current cid in mm */ - int last_mm_cid; /* Most recent cid in mm */ - int migrate_from_cpu; - int mm_cid_active; /* Whether cid bitmap is active */ - struct callback_head cid_work; -#endif + struct sched_mm_cid mm_cid; struct tlbflush_unmap_batch tlb_ubc; @@ -2308,7 +2301,7 @@ void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { - return t->mm_cid; + return t->mm_cid.cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } -- cgit v1.2.3 From be4463fa2c7185823d2989562162d578b45a89ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:49 +0100 Subject: sched/mmcid: Cacheline align MM CID storage Both the per CPU storage and the data in mm_struct are heavily used in context switch. As they can end up next to other frequently modified data, they are subject to false sharing. Make them cache line aligned. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.194111661@linutronix.de --- include/linux/rseq_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index e444dd267c7a..d7e8071b626a 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -112,7 +112,7 @@ struct sched_mm_cid { */ struct mm_cid_pcpu { unsigned int cid; -}; +}____cacheline_aligned_in_smp; /** * struct mm_mm_cid - Storage for per MM CID data @@ -126,7 +126,7 @@ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; unsigned int nr_cpus_allowed; raw_spinlock_t lock; -}; +}____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; struct sched_mm_cid { }; -- cgit v1.2.3 From b08ef5fc8fa01ae5285bef5ff783bbb425d1fb08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:53 +0100 Subject: sched/mmcid: Move scheduler code out of global header This is only used in the scheduler core code, so there is no point to have it in a global header. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mathieu Desnoyers Acked-by: Yury Norov (NVIDIA) Link: https://patch.msgid.link/20251119172549.321259077@linutronix.de --- include/linux/mm_types.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e4818e932a1d..67a7bdf772f7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1387,27 +1387,14 @@ static inline unsigned int mm_cid_size(void) return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). 
*/ } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) -{ - struct cpumask *mm_allowed = mm_cpus_allowed(mm); - - if (!mm) - return; - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ - guard(raw_spinlock)(&mm->mm_cid.lock); - cpumask_or(mm_allowed, mm_allowed, cpumask); - WRITE_ONCE(mm->mm_cid.nr_cpus_allowed, cpumask_weight(mm_allowed)); -} #else /* CONFIG_SCHED_MM_CID */ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } static inline void mm_destroy_cid(struct mm_struct *mm) { } - static inline unsigned int mm_cid_size(void) { return 0; } -static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } #endif /* CONFIG_SCHED_MM_CID */ struct mmu_gather; -- cgit v1.2.3 From 437cb3ded25038d5280d21de489ce78c745118d5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:26:57 +0100 Subject: cpumask: Introduce cpumask_weighted_or() CID management OR's two cpumasks and then calculates the weight on the result. That's inefficient as that has to walk the same stuff twice. As this is done with runqueue lock held, there is a real benefit of speeding this up. Depending on the system this results in 10-20% less cycles spent with runqueue lock held for a 4K cpumask. Provide cpumask_weighted_or() and the corresponding bitmap functions which return the weight of the OR result right away. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yury Norov (NVIDIA) Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.448263340@linutronix.de --- include/linux/bitmap.h | 15 +++++++++++++++ include/linux/cpumask.h | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 595217b7a6e7..b0395e4ccf90 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -45,6 +45,7 @@ struct device; * bitmap_copy(dst, src, nbits) *dst = *src * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 + * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) * bitmap_complement(dst, src, nbits) *dst = ~(*src) @@ -165,6 +166,8 @@ bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); +unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, unsigned int nbits); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, unsigned int nbits); bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, @@ -337,6 +340,18 @@ void bitmap_or(unsigned long *dst, const unsigned long *src1, __bitmap_or(dst, src1, src2, nbits); } +static __always_inline +unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, + const unsigned long *src2, unsigned int nbits) +{ + if (small_const_nbits(nbits)) { + *dst = *src1 | *src2; + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); + } else { + return 
__bitmap_weighted_or(dst, src1, src2, nbits); + } +} + static __always_inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, unsigned int nbits) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce6..feba06eb0a42 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -728,6 +728,22 @@ void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, cpumask_bits(src2p), small_cpumask_bits); } +/** + * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result + * @dstp: the cpumask result + * @src1p: the first input + * @src2p: the second input + * + * Return: The number of bits set in the resulting cpumask @dstp + */ +static __always_inline +unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, + const struct cpumask *src2p) +{ + return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), + cpumask_bits(src2p), small_cpumask_bits); +} + /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result -- cgit v1.2.3 From b11890683380a36b8488229f818d5e76e8204587 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 19 Nov 2025 15:13:14 +0100 Subject: ata: libata-scsi: Fix system suspend for a security locked drive Commit cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling") fixed ata_to_sense_error() to properly generate sense key ABORTED COMMAND (without any additional sense code), instead of the previous bogus sense key ILLEGAL REQUEST with the additional sense code UNALIGNED WRITE COMMAND, for a failed command. However, this broke suspend for Security locked drives (drives that have Security enabled, and have not been Security unlocked by boot firmware). The reason for this is that the SCSI disk driver, for the Synchronize Cache command only, treats any sense data with sense key ILLEGAL REQUEST as a successful command (regardless of ASC / ASCQ). 
After commit cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling") the code that treats any sense data with sense key ILLEGAL REQUEST as a successful command is no longer applicable, so the command fails, which causes the system suspend to be aborted: sd 1:0:0:0: PM: dpm_run_callback(): scsi_bus_suspend returns -5 sd 1:0:0:0: PM: failed to suspend async: error -5 PM: Some devices failed to suspend, or early wake event detected To make suspend work once again, for a Security locked device only, return sense data LOGICAL UNIT ACCESS NOT AUTHORIZED, the actual sense data which a real SCSI device would have returned if locked. The SCSI disk driver treats this sense data as a successful command. Cc: stable@vger.kernel.org Reported-by: Ilia Baryshnikov Closes: https://bugzilla.kernel.org/show_bug.cgi?id=220704 Fixes: cf3fc037623c ("ata: libata-scsi: Fix ata_to_sense_error() status handling") Reviewed-by: Hannes Reinecke Reviewed-by: Martin K. Petersen Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- include/linux/ata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/ata.h b/include/linux/ata.h index 792e10a09787..c9013e472aa3 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -566,6 +566,7 @@ struct ata_bmdma_prd { #define ata_id_has_ncq(id) ((id)[ATA_ID_SATA_CAPABILITY] & (1 << 8)) #define ata_id_queue_depth(id) (((id)[ATA_ID_QUEUE_DEPTH] & 0x1f) + 1) #define ata_id_removable(id) ((id)[ATA_ID_CONFIG] & (1 << 7)) +#define ata_id_is_locked(id) (((id)[ATA_ID_DLF] & 0x7) == 0x7) #define ata_id_has_atapi_AN(id) \ ((((id)[ATA_ID_SATA_CAPABILITY] != 0x0000) && \ ((id)[ATA_ID_SATA_CAPABILITY] != 0xffff)) && \ -- cgit v1.2.3 From 78cfd833bc04c0398ca4cfc64704350aebe4d4c2 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 20 Nov 2025 13:06:39 +0000 Subject: firmware: cs_dsp: Factor out common debugfs string read cs_dsp_debugfs_wmfw_read() and cs_dsp_debugfs_bin_read() were identical 
except for which struct member they printed. Move all this duplicated code into a common function cs_dsp_debugfs_string_read(). The check for dsp->booted has been removed because this is redundant. The two strings are set when the DSP is booted and cleared when the DSP is powered-down. Access to the string char * must be protected by the pwr_lock mutex. The string is passed into cs_dsp_debugfs_string_read() as a pointer to the char * so that the mutex lock can also be factored out into cs_dsp_debugfs_string_read(). wmfw_file_name and bin_file_name members of struct cs_dsp have been changed to const char *. It makes for a better API to pass a const pointer into cs_dsp_debugfs_string_read(). Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251120130640.1169780-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index a66eb7624730..69959032f8f5 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -188,8 +188,8 @@ struct cs_dsp { #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_root; - char *wmfw_file_name; - char *bin_file_name; + const char *wmfw_file_name; + const char *bin_file_name; #endif }; -- cgit v1.2.3 From d5089fffe1db04a802b028c2ef4875be1ed452a3 Mon Sep 17 00:00:00 2001 From: Baojun Xu Date: Mon, 17 Nov 2025 18:21:53 +0800 Subject: ASoC: tas2781: Add tas2568/2574/5806m/5806md/5830 support TAS5806M, TAS5806MD, TAS5830 has on-chip DSP without current/voltage feedback, and in same family with TAS58XX. TAS2568, TAS2574 is in family with TAS257X. 
Signed-off-by: Baojun Xu Link: https://patch.msgid.link/20251117102153.30644-2-baojun.xu@ti.com Signed-off-by: Mark Brown --- include/sound/tas2781.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/sound/tas2781.h b/include/sound/tas2781.h index c3b4c43dd2bf..711142cb9918 100644 --- a/include/sound/tas2781.h +++ b/include/sound/tas2781.h @@ -117,15 +117,20 @@ enum audio_device { TAS2120, TAS2320, TAS2563, + TAS2568, TAS2570, TAS2572, + TAS2574, TAS2781, TAS5802, + TAS5806M, + TAS5806MD, TAS5815, TAS5822, TAS5825, TAS5827, TAS5828, + TAS5830, TAS_OTHERS, }; -- cgit v1.2.3 From 6f87b41303d3c4280a57b4f7360022a0951b43dd Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 18 Nov 2025 11:04:03 +0100 Subject: string: fix kerneldoc formatting in strends() strends() kernel doc should have used `@str:` format for arguments instead of `@str -`. Fixes: 197b3f3c70d6 ("string: provide strends()") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20251118134748.40f03b9c@canb.auug.org.au/ Link: https://lore.kernel.org/r/20251118-strends-follow-up-v1-1-d3f8ef750f59@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/string.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 929d05d1247c..69e9256592f8 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -564,8 +564,8 @@ static inline bool strstarts(const char *str, const char *prefix) /** * strends - Check if a string ends with another string. - * @str - NULL-terminated string to check against @suffix - * @suffix - NULL-terminated string defining the suffix to look for in @str + * @str: NULL-terminated string to check against @suffix + * @suffix: NULL-terminated string defining the suffix to look for in @str * * Returns: * True if @str ends with @suffix. False in all other cases. 
-- cgit v1.2.3 From 8278cb72c60399f6dc6300c409879fb4c7291513 Mon Sep 17 00:00:00 2001 From: Yuntao Wang Date: Sat, 15 Nov 2025 21:47:46 +0800 Subject: of/fdt: Consolidate duplicate code into helper functions Currently, there are many pieces of nearly identical code scattered across different places. Consolidate the duplicate code into helper functions to improve maintainability and reduce the likelihood of errors. Signed-off-by: Yuntao Wang Link: https://patch.msgid.link/20251115134753.179931-2-yuntao.wang@linux.dev Signed-off-by: Rob Herring (Arm) --- include/linux/of_fdt.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index b8d6c0c20876..51dadbaa3d63 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -55,6 +55,15 @@ extern int of_get_flat_dt_subnode_by_name(unsigned long node, const char *uname); extern const void *of_get_flat_dt_prop(unsigned long node, const char *name, int *size); + +extern const __be32 *of_flat_dt_get_addr_size_prop(unsigned long node, + const char *name, + int *entries); +extern bool of_flat_dt_get_addr_size(unsigned long node, const char *name, + u64 *addr, u64 *size); +extern void of_flat_dt_read_addr_size(const __be32 *prop, int entry_index, + u64 *addr, u64 *size); + extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern unsigned long of_get_flat_dt_root(void); extern uint32_t of_get_flat_dt_phandle(unsigned long node); -- cgit v1.2.3 From d1cadd4bfc2802c6f73b1739dbceef7513afc591 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 19 Nov 2025 22:41:28 +0000 Subject: nodemask: use min() instead of min_t() min_t(unsigned int, a, b) casts an 'unsigned long' to 'unsigned int'. Use min(a, b) instead as it promotes any 'unsigned int' to 'unsigned long' and so cannot discard significant bits. In this case the 'unsigned long' value is small enough that the result is ok. Detected by an extra check added to min_t(). 
Signed-off-by: David Laight Signed-off-by: Yury Norov (NVIDIA) --- include/linux/nodemask.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 7ad1f5c7407e..bd38648c998d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -245,18 +245,18 @@ static __always_inline int __nodes_weight(const nodemask_t *srcp, unsigned int n } /* FIXME: better would be to fix all architectures to never return - > MAX_NUMNODES, then the silly min_ts could be dropped. */ + > MAX_NUMNODES, then the silly min()s could be dropped. */ #define first_node(src) __first_node(&(src)) static __always_inline unsigned int __first_node(const nodemask_t *srcp) { - return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); + return min(MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp) { - return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); + return min(MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* @@ -293,8 +293,7 @@ static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node) #define first_unset_node(mask) __first_unset_node(&(mask)) static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp) { - return min_t(unsigned int, MAX_NUMNODES, - find_first_zero_bit(maskp->bits, MAX_NUMNODES)); + return min(MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) -- cgit v1.2.3 From d7cdbbc93c564902169e854e78716a7b5e6cb241 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Thu, 20 Nov 2025 14:23:58 +0100 Subject: software node: allow referencing firmware nodes At the moment software nodes can only reference other software nodes. 
This is a limitation for devices created, for instance, on the auxiliary bus with a dynamic software node attached which cannot reference devices the firmware node of which is "real" (as an OF node or otherwise). Make it possible for a software node to reference all firmware nodes in addition to static software nodes. To that end: add a second pointer to struct software_node_ref_args of type struct fwnode_handle. The core swnode code will first check the swnode pointer and if it's NULL, it will assume the fwnode pointer should be set. Software node graphs remain the same, as in: the remote endpoints still have to be software nodes. Acked-by: Linus Walleij Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Acked-by: Greg Kroah-Hartman Signed-off-by: Bartosz Golaszewski Reviewed-by: Charles Keepax Tested-by: Charles Keepax Signed-off-by: Philipp Zabel --- include/linux/property.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/property.h b/include/linux/property.h index 50b26589dd70..272bfbdea7bf 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -355,19 +355,26 @@ struct software_node; /** * struct software_node_ref_args - Reference property with additional arguments - * @node: Reference to a software node + * @swnode: Reference to a software node + * @fwnode: Alternative reference to a firmware node handle * @nargs: Number of elements in @args array * @args: Integer arguments */ struct software_node_ref_args { - const struct software_node *node; + const struct software_node *swnode; + struct fwnode_handle *fwnode; unsigned int nargs; u64 args[NR_FWNODE_REFERENCE_ARGS]; }; #define SOFTWARE_NODE_REFERENCE(_ref_, ...) 
\ (const struct software_node_ref_args) { \ - .node = _ref_, \ + .swnode = _Generic(_ref_, \ + const struct software_node *: _ref_, \ + default: NULL), \ + .fwnode = _Generic(_ref_, \ + struct fwnode_handle *: _ref_, \ + default: NULL), \ .nargs = COUNT_ARGS(__VA_ARGS__), \ .args = { __VA_ARGS__ }, \ } -- cgit v1.2.3 From cf6ec18ea6e12569b83af2709d0bd0cc09da198f Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 20 Nov 2025 13:44:34 +0000 Subject: ASoC: soc.h: Add SOC_ENUM_EXT_ACC() to allow setting access flags Add a macro SOC_ENUM_EXT_ACC() to allow the access permission flags to be set. This is the same as SOC_ENUM_EXT() but with an extra argument for the access flags. This will be used by the cs35l56.c driver to create a read-only volatile enum. It's preferable to avoid custom control macros in codec drivers. Code maintenance is easier if all control macros are defined together in soc.h. This commit only creates this one macro that is actually going to be used. There's no point cluttering soc.h with unused macros - that just adds a maintenance burden. People can add equivalents for the other macros if they need them. 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251120134437.1179191-2-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index 37dc6f6fc63f..b1b6b6a497da 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -319,6 +319,13 @@ struct platform_device; #define SOC_VALUE_ENUM_EXT(xname, xenum, xhandler_get, xhandler_put) \ SOC_ENUM_EXT(xname, xenum, xhandler_get, xhandler_put) +#define SOC_ENUM_EXT_ACC(xname, xenum, xhandler_get, xhandler_put, xaccess) \ +{ .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ + .access = xaccess, \ + .info = snd_soc_info_enum_double, \ + .get = xhandler_get, .put = xhandler_put, \ + .private_value = (unsigned long)&xenum } + #define SND_SOC_BYTES(xname, xbase, xregs) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ .info = snd_soc_bytes_info, .get = snd_soc_bytes_get, \ -- cgit v1.2.3 From d7a82707f19c7a11ce42dd46cb22ca34a58cc9b0 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Thu, 20 Nov 2025 13:44:35 +0000 Subject: ASoC: soc.h: Add SND_SOC_BYTES_E_ACC() to allow setting access flags Add a macro SND_SOC_BYTES_E_ACC() to allow the access permission flags to be set. This is the same as SND_SOC_BYTES_E() but with an extra argument for the access flags. This will be used by the cs35l56.c driver to create a read-only volatile byte control. It's preferable to avoid custom control macros in codec drivers. Code maintenance is easier if all control macros are defined together in soc.h. This commit only creates this one macro that is actually going to be used. There's no point cluttering soc.h with unused macros - that just adds a maintenance burden. People can add equivalents for the other macros if they need them. 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251120134437.1179191-3-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/soc.h b/include/sound/soc.h index b1b6b6a497da..aa0fe6b80293 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -338,6 +338,13 @@ struct platform_device; .put = xhandler_put, .private_value = \ ((unsigned long)&(struct soc_bytes) \ {.base = xbase, .num_regs = xregs }) } +#define SND_SOC_BYTES_E_ACC(xname, xbase, xregs, xhandler_get, xhandler_put, xaccess) \ +{ .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ + .access = xaccess, \ + .info = snd_soc_bytes_info, .get = xhandler_get, \ + .put = xhandler_put, .private_value = \ + ((unsigned long)&(struct soc_bytes) \ + {.base = xbase, .num_regs = xregs }) } #define SND_SOC_BYTES_MASK(xname, xbase, xregs, xmask) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, \ -- cgit v1.2.3 From 1f382215119a0bc165e766e5bc424b3d3e8dae35 Mon Sep 17 00:00:00 2001 From: Pingfan Liu Date: Wed, 19 Nov 2025 17:55:24 +0800 Subject: cgroup/cpuset: Introduce cpuset_cpus_allowed_locked() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cpuset_cpus_allowed() uses a reader lock that is sleepable under RT, which means it cannot be called inside raw_spin_lock_t context. Introduce a new cpuset_cpus_allowed_locked() helper that performs the same function as cpuset_cpus_allowed() except that the caller must have acquired the cpuset_mutex so that no further locking will be needed. 
Suggested-by: Waiman Long Signed-off-by: Pingfan Liu Cc: Waiman Long Cc: Tejun Heo Cc: Johannes Weiner Cc: Michal Koutný Cc: linux-kernel@vger.kernel.org To: cgroups@vger.kernel.org Reviewed-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2ddb256187b5..a98d3330385c 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -74,6 +74,7 @@ extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); +extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); @@ -195,10 +196,16 @@ static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } +static inline void cpuset_cpus_allowed_locked(struct task_struct *p, + struct cpumask *mask) +{ + cpumask_copy(mask, task_cpu_possible_mask(p)); +} + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { - cpumask_copy(mask, task_cpu_possible_mask(p)); + cpuset_cpus_allowed_locked(p, mask); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) -- cgit v1.2.3 From 3efee7362dbf896072af1c1aaeaf9fd6e235c591 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 20 Nov 2025 15:56:57 +0000 Subject: ASoC: SDCA: Add stubs for FDL helper functions In the case the SDCA IRQ is built in but FDL support is not stub functions are required for the FDL helpers to avoid build failures. 
The FDL IRQs likely shouldn't get triggered in this case, however they would still be a part of the build. Fixes: 71f7990a34cd ("ASoC: SDCA: Add FDL library for XU entities") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511200419.SbU6YvjE-lkp@intel.com/ Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251120155657.2181751-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_fdl.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_fdl.h b/include/sound/sdca_fdl.h index f4ba809cb203..fbaf4b384c8a 100644 --- a/include/sound/sdca_fdl.h +++ b/include/sound/sdca_fdl.h @@ -64,6 +64,8 @@ struct fdl_state { SDCA_CTL_XU_FDLD_ACK_TRANSFER | \ SDCA_CTL_XU_FDLD_NEEDS_SET) +#if IS_ENABLED(CONFIG_SND_SOC_SDCA_FDL) + int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt); int sdca_fdl_process(struct sdca_interrupt *interrupt); int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function, @@ -72,4 +74,32 @@ int sdca_fdl_sync(struct device *dev, struct sdca_function_data *function, int sdca_reset_function(struct device *dev, struct sdca_function_data *function, struct regmap *regmap); +#else + +static inline int sdca_fdl_alloc_state(struct sdca_interrupt *interrupt) +{ + return 0; +} + +static inline int sdca_fdl_process(struct sdca_interrupt *interrupt) +{ + return 0; +} + +static inline int sdca_fdl_sync(struct device *dev, + struct sdca_function_data *function, + struct sdca_interrupt_info *info) +{ + return 0; +} + +static inline int sdca_reset_function(struct device *dev, + struct sdca_function_data *function, + struct regmap *regmap) +{ + return 0; +} + +#endif // CONFIG_SND_SOC_SDCA_FDL + #endif // __SDCA_FDL_H__ -- cgit v1.2.3 From 5fe65824b74c0414f105f0535437108cd6c31cc7 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 20 Nov 2025 15:30:12 +0000 Subject: ASoC: SDCA: Add missing forward declaration in 
header The structure sdca_function_desc contains a fwnode_handle which is undefined if the user doesn't pull in an appropriate header. Add a forward declaration to avoid this. Fixes: 996bf834d0b6 ("ASoC: SDCA: Add code to parse Function information") Tested-by: Bard Liao Reviewed-by: Maciej Strozek Reviewed-by: Peter Ujfalusi Tested-by: Richard Fitzgerald Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251120153023.2105663-4-ckeepax@opensource.cirrus.com Reviewed-by: Vinod Koul Signed-off-by: Mark Brown --- include/sound/sdca.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/sdca.h b/include/sound/sdca.h index d38cdbfeb35f..d58d60221277 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -13,6 +13,7 @@ #include struct acpi_table_swft; +struct fwnode_handle; struct sdw_slave; #define SDCA_MAX_FUNCTION_COUNT 8 -- cgit v1.2.3 From 5acf17b6df5e759bfb8bc0a75fadcbb3e363a17b Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 20 Nov 2025 15:30:19 +0000 Subject: ASoC: SDCA: Add helper to write initialization writes Add a helper function to write out the SDCA blind initialization writes. 
Acked-by: Vinod Koul Tested-by: Bard Liao Reviewed-by: Maciej Strozek Reviewed-by: Peter Ujfalusi Tested-by: Richard Fitzgerald Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251120153023.2105663-11-ckeepax@opensource.cirrus.com Reviewed-by: Vinod Koul Signed-off-by: Mark Brown --- include/sound/sdca_regmap.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_regmap.h b/include/sound/sdca_regmap.h index b2e3c2ad2bb8..792540a530fc 100644 --- a/include/sound/sdca_regmap.h +++ b/include/sound/sdca_regmap.h @@ -27,5 +27,7 @@ int sdca_regmap_populate_constants(struct device *dev, struct sdca_function_data int sdca_regmap_write_defaults(struct device *dev, struct regmap *regmap, struct sdca_function_data *function); +int sdca_regmap_write_init(struct device *dev, struct regmap *regmap, + struct sdca_function_data *function); #endif // __SDCA_REGMAP_H__ -- cgit v1.2.3 From 4496d1c65bad7a3a32d2e09aaf3c54bc562c3fcc Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Thu, 20 Nov 2025 15:30:20 +0000 Subject: ASoC: SDCA: add function devices Use the auxiliary bus to register/unregister subdevices for each function. Each function will be handled with a separate driver, matched using a name. If a vendor wants to override a specific function driver, they could use a custom name to match with a custom function driver. 
Signed-off-by: Pierre-Louis Bossart Tested-by: Bard Liao Reviewed-by: Maciej Strozek Reviewed-by: Peter Ujfalusi Tested-by: Richard Fitzgerald Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251120153023.2105663-12-ckeepax@opensource.cirrus.com Reviewed-by: Vinod Koul Signed-off-by: Mark Brown --- include/sound/sdca.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/sound/sdca.h b/include/sound/sdca.h index d58d60221277..67ff3c88705d 100644 --- a/include/sound/sdca.h +++ b/include/sound/sdca.h @@ -15,18 +15,21 @@ struct acpi_table_swft; struct fwnode_handle; struct sdw_slave; +struct sdca_dev; #define SDCA_MAX_FUNCTION_COUNT 8 /** * struct sdca_function_desc - short descriptor for an SDCA Function * @node: firmware node for the Function. + * @func_dev: pointer to SDCA function device. * @name: Human-readable string. * @type: Function topology type. * @adr: ACPI address (used for SDCA register access). */ struct sdca_function_desc { struct fwnode_handle *node; + struct sdca_dev *func_dev; const char *name; u32 type; u8 adr; @@ -59,6 +62,8 @@ void sdca_lookup_functions(struct sdw_slave *slave); void sdca_lookup_swft(struct sdw_slave *slave); void sdca_lookup_interface_revision(struct sdw_slave *slave); bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_quirk quirk); +int sdca_dev_register_functions(struct sdw_slave *slave); +void sdca_dev_unregister_functions(struct sdw_slave *slave); #else @@ -69,6 +74,14 @@ static inline bool sdca_device_quirk_match(struct sdw_slave *slave, enum sdca_qu { return false; } + +static inline int sdca_dev_register_functions(struct sdw_slave *slave) +{ + return 0; +} + +static inline void sdca_dev_unregister_functions(struct sdw_slave *slave) {} + #endif #endif -- cgit v1.2.3 From 2d877d0659cb69cc0677ee2805e9521966d70ac5 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 20 Nov 2025 15:30:21 +0000 Subject: ASoC: SDCA: Add basic SDCA class driver Add a device 
level driver as the entry point for the class driver. Additional auxiliary drivers will be registered to support each function within the device. This driver will register those function drivers and provide the device level functionality, such as monitoring bus attach/detach, the device level register map, and the root for the IRQ handling. Co-developed-by: Maciej Strozek Tested-by: Bard Liao Reviewed-by: Maciej Strozek Reviewed-by: Peter Ujfalusi Tested-by: Richard Fitzgerald Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251120153023.2105663-13-ckeepax@opensource.cirrus.com Reviewed-by: Vinod Koul Signed-off-by: Mark Brown --- include/linux/soundwire/sdw_registers.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/soundwire/sdw_registers.h b/include/linux/soundwire/sdw_registers.h index 0a5939285583..cae8a0a5a9b0 100644 --- a/include/linux/soundwire/sdw_registers.h +++ b/include/linux/soundwire/sdw_registers.h @@ -355,4 +355,6 @@ /* Check the reserved and fixed bits in address */ #define SDW_SDCA_VALID_CTL(reg) (((reg) & (GENMASK(31, 25) | BIT(18) | BIT(13))) == BIT(30)) +#define SDW_SDCA_MAX_REGISTER 0x47FFFFFF + #endif /* __SDW_REGISTERS_H */ -- cgit v1.2.3 From f58ef9d1d1355b15443719df95081f193067ab88 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:20 +0200 Subject: PCI/P2PDMA: Separate the mmap() support from the core logic Currently the P2PDMA code requires a pgmap and a struct page to function. This was serving three important purposes: - DMA API compatibility, where scatterlist required a struct page as input - Life cycle management, the percpu_ref is used to prevent UAF during device hot unplug - A way to get the P2P provider data through the pci_p2pdma_pagemap The DMA API now has a new flow, and has gained phys_addr_t support, so it no longer needs struct pages to perform P2P mapping.
Lifecycle management can be delegated to the user; DMABUF, for instance, has a suitable invalidation protocol that does not require struct page. Finding the P2P provider data can also be managed by the caller without the need to look it up from the phys_addr. Split the P2PDMA code into two layers. The optional upper layer, effectively, provides a way to mmap() P2P memory into a VMA by providing struct page, pgmap, a genalloc and sysfs. The lower layer provides the actual P2P infrastructure and is wrapped up in a new struct p2pdma_provider. Rework the mmap layer to use the new p2pdma_provider-based APIs. Drivers that do not want to put P2P memory into VMAs can allocate a struct p2pdma_provider after probe() starts and free it before remove() completes. When DMA mapping, the driver must convey the struct p2pdma_provider to the DMA mapping code along with a phys_addr of the MMIO BAR slice to map. The driver must ensure that no DMA mapping outlives the lifetime of the struct p2pdma_provider. The intended target of this new API layer is DMABUF. There is usually only a single p2pdma_provider for a DMABUF exporter. Most drivers can establish the p2pdma_provider during probe, access the single instance during DMABUF attach and use that to drive the DMA mapping. DMABUF provides an invalidation mechanism that can guarantee all DMA is halted and the DMA mappings are undone prior to destroying the struct p2pdma_provider. This ensures there is no UAF through DMABUFs that are lingering past driver removal. The new p2pdma_provider layer cannot be used to create P2P memory that can be mapped into VMAs, be used with pin_user_pages(), O_DIRECT, and so on. These use cases must still use the mmap() layer. The p2pdma_provider layer is principally for DMABUF-like use cases where DMABUF natively manages the life cycle and access instead of vmas/pin_user_pages()/struct page.
In addition, remove the bus_off field from pci_p2pdma_map_state since it duplicates information already available in the pgmap structure. The bus_offset is only used in one location (pci_p2pdma_bus_addr_map) and is always identical to pgmap->bus_offset. Signed-off-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-1-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 951f81a38f3a..1400f3ad4299 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -16,6 +16,16 @@ struct block_device; struct scatterlist; +/** + * struct p2pdma_provider + * + * A p2pdma provider is a range of MMIO address space available to the CPU. + */ +struct p2pdma_provider { + struct device *owner; + u64 bus_offset; +}; + #ifdef CONFIG_PCI_P2PDMA int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); @@ -139,11 +149,11 @@ enum pci_p2pdma_map_type { }; struct pci_p2pdma_map_state { - struct dev_pagemap *pgmap; + struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; - u64 bus_off; }; + /* helper for pci_p2pdma_state(), do not use directly */ void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page); @@ -162,8 +172,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, struct page *page) { if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { - if (state->pgmap != page_pgmap(page)) - __pci_p2pdma_update_state(state, dev, page); + __pci_p2pdma_update_state(state, dev, page); return state->map; } return PCI_P2PDMA_MAP_NONE; @@ -181,7 +190,7 @@ static inline dma_addr_t pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t 
paddr) { WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->bus_off; + return paddr + state->mem->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ -- cgit v1.2.3 From d4504262f745e48c1739c8b864f779b4b0f9de80 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:21 +0200 Subject: PCI/P2PDMA: Simplify bus address mapping API Update the pci_p2pdma_bus_addr_map() function to take a direct pointer to the p2pdma_provider structure instead of the pci_p2pdma_map_state. This simplifies the API by removing the need for callers to extract the provider from the state structure. The change updates all callers across the kernel (block layer, IOMMU, DMA direct, and HMM) to pass the provider pointer directly, making the code more explicit and reducing unnecessary indirection. This also removes the runtime warning check since callers now have direct control over which provider they use. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-2-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 1400f3ad4299..9516ef97b17a 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -181,16 +181,15 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev, /** * pci_p2pdma_bus_addr_map - Translate a physical address to a bus address * for a PCI_P2PDMA_MAP_BUS_ADDR transfer. - * @state: P2P state structure + * @provider: P2P provider structure * @paddr: physical address to map * * Map a physically contiguous PCI_P2PDMA_MAP_BUS_ADDR transfer. 
*/ static inline dma_addr_t -pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr) +pci_p2pdma_bus_addr_map(struct p2pdma_provider *provider, phys_addr_t paddr) { - WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR); - return paddr + state->mem->bus_offset; + return paddr + provider->bus_offset; } #endif /* _LINUX_PCI_P2P_H */ -- cgit v1.2.3 From 372d6d1b8ae3cdfe6b0638a0a848c6865ec94567 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:22 +0200 Subject: PCI/P2PDMA: Refactor to separate core P2P functionality from memory allocation Refactor the PCI P2PDMA subsystem to separate the core peer-to-peer DMA functionality from the optional memory allocation layer. This creates a two-tier architecture: The core layer provides P2P mapping functionality for physical addresses based on PCI device MMIO BARs and integrates with the DMA API for mapping operations. This layer is required for all P2PDMA users. The optional upper layer provides memory allocation capabilities including gen_pool allocator, struct page support, and sysfs interface for user space access. This separation allows subsystems like DMABUF to use only the core P2P mapping functionality without the overhead of memory allocation features they don't need. The core functionality is now available through the new pcim_p2pdma_provider() function that returns a p2pdma_provider structure. 
Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-3-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 9516ef97b17a..15471252817b 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -27,6 +27,8 @@ struct p2pdma_provider { }; #ifdef CONFIG_PCI_P2PDMA +int pcim_p2pdma_init(struct pci_dev *pdev); +struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset); int pci_p2pdma_distance_many(struct pci_dev *provider, struct device **clients, @@ -44,6 +46,15 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); #else /* CONFIG_PCI_P2PDMA */ +static inline int pcim_p2pdma_init(struct pci_dev *pdev) +{ + return -EOPNOTSUPP; +} +static inline struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, + int bar) +{ + return NULL; +} static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) { -- cgit v1.2.3 From 395698bd2cd7639b85784a4a8f5ddb7a581e353c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:23 +0200 Subject: PCI/P2PDMA: Provide an access to pci_p2pdma_map_type() function Provide an access to pci_p2pdma_map_type() function to allow subsystems to determine the appropriate mapping type for P2PDMA transfers between a provider and target device. The pci_p2pdma_map_type() function is the core P2P layer version of the existing public, but struct page focused, pci_p2pdma_state() function. It returns the same result. 
It is required to use the p2p subsystem from drivers that don't use the struct page layer. Like __pci_p2pdma_update_state() it is not an exported function. The idea is that only subsystem code will implement mapping helpers for taking in phys_addr_t lists, this is deliberately not made accessible to every driver to prevent abuse. Following patches will use this function to implement a shared DMA mapping helper for DMABUF. Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-4-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index 15471252817b..517e121d2598 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -26,6 +26,45 @@ struct p2pdma_provider { u64 bus_offset; }; +enum pci_p2pdma_map_type { + /* + * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before + * the mapping type has been calculated. Exported routines for the API + * will never return this value. + */ + PCI_P2PDMA_MAP_UNKNOWN = 0, + + /* + * Not a PCI P2PDMA transfer. + */ + PCI_P2PDMA_MAP_NONE, + + /* + * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will + * traverse the host bridge and the host bridge is not in the + * allowlist. DMA Mapping routines should return an error when + * this is returned. + */ + PCI_P2PDMA_MAP_NOT_SUPPORTED, + + /* + * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to + * each other directly through a PCI switch and the transaction will + * not traverse the host bridge. Such a mapping should program + * the DMA engine with PCI bus addresses. 
+ */ + PCI_P2PDMA_MAP_BUS_ADDR, + + /* + * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk + * to each other, but the transaction traverses a host bridge on the + * allowlist. In this case, a normal mapping either with CPU physical + * addresses (in the case of dma-direct) or IOVA addresses (in the + * case of IOMMUs) should be used to program the DMA engine. + */ + PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, +}; + #ifdef CONFIG_PCI_P2PDMA int pcim_p2pdma_init(struct pci_dev *pdev); struct p2pdma_provider *pcim_p2pdma_provider(struct pci_dev *pdev, int bar); @@ -45,6 +84,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider, + struct device *dev); #else /* CONFIG_PCI_P2PDMA */ static inline int pcim_p2pdma_init(struct pci_dev *pdev) { @@ -106,6 +147,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } +static inline enum pci_p2pdma_map_type +pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev) +{ + return PCI_P2PDMA_MAP_NOT_SUPPORTED; +} #endif /* CONFIG_PCI_P2PDMA */ @@ -120,45 +166,6 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client) return pci_p2pmem_find_many(&client, 1); } -enum pci_p2pdma_map_type { - /* - * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before - * the mapping type has been calculated. Exported routines for the API - * will never return this value. - */ - PCI_P2PDMA_MAP_UNKNOWN = 0, - - /* - * Not a PCI P2PDMA transfer. - */ - PCI_P2PDMA_MAP_NONE, - - /* - * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will - * traverse the host bridge and the host bridge is not in the - * allowlist. DMA Mapping routines should return an error when - * this is returned. 
- */ - PCI_P2PDMA_MAP_NOT_SUPPORTED, - - /* - * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to - * each other directly through a PCI switch and the transaction will - * not traverse the host bridge. Such a mapping should program - * the DMA engine with PCI bus addresses. - */ - PCI_P2PDMA_MAP_BUS_ADDR, - - /* - * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk - * to each other, but the transaction traverses a host bridge on the - * allowlist. In this case, a normal mapping either with CPU physical - * addresses (in the case of dma-direct) or IOVA addresses (in the - * case of IOMMUs) should be used to program the DMA engine. - */ - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE, -}; - struct pci_p2pdma_map_state { struct p2pdma_provider *mem; enum pci_p2pdma_map_type map; -- cgit v1.2.3 From 3aa31a8bb11e47c0ff2b306988d1756b810c1c3c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:25 +0200 Subject: dma-buf: provide phys_vec to scatter-gather mapping routine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add dma_buf_phys_vec_to_sgt() and dma_buf_free_sgt() helpers to convert an array of MMIO physical address ranges into scatter-gather tables with proper DMA mapping. These common functions are a starting point and support any PCI drivers creating mappings from their BAR's MMIO addresses. VFIO is one case, as shortly will be RDMA. We can review existing DRM drivers to refactor them separately. We hope this will evolve into routines to help common DRM that include mixed CPU and MMIO mappings. Compared to the dma_map_resource() abuse this implementation handles the complicated PCI P2P scenarios properly, especially when an IOMMU is enabled: - Direct bus address mapping without IOVA allocation for PCI_P2PDMA_MAP_BUS_ADDR, using pci_p2pdma_bus_addr_map(). This happens if the IOMMU is enabled but the PCIe switch ACS flags allow transactions to avoid the host bridge. 
Further, this handles the slightly obscure case of MMIO with a phys_addr_t that is different from the physical BAR programming (bus offset). The phys_addr_t is converted to a dma_addr_t and accommodates this effect. This enables certain real systems to work, especially on ARM platforms. - Mapping through host bridge with IOVA allocation and DMA_ATTR_MMIO attribute for MMIO memory regions (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE). This happens when the IOMMU is enabled and the ACS flags are forcing all traffic to the IOMMU - i.e. for virtualization systems. - Cases where P2P is not supported through the host bridge/CPU. The P2P subsystem is the proper place to detect this and block it. Helper functions fill_sg_entry() and calc_sg_nents() handle the scatter-gather table construction, splitting large regions into UINT_MAX-sized chunks to fit within sg->length field limits. Since the physical address based DMA API forbids use of the CPU list of the scatterlist, this will produce a mangled scatterlist that has a fully zero-length and NULL'd CPU list. The list is 0 length, all the struct page pointers are NULL and zero sized. This is stronger and more robust than the existing mangle_sg_table() technique. It is a future project to migrate DMABUF as a subsystem away from using scatterlist for this data structure.
Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Christian König Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-6-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/dma-buf-mapping.h | 17 +++++++++++++++++ include/linux/dma-buf.h | 11 +++++++++++ 2 files changed, 28 insertions(+) create mode 100644 include/linux/dma-buf-mapping.h (limited to 'include') diff --git a/include/linux/dma-buf-mapping.h b/include/linux/dma-buf-mapping.h new file mode 100644 index 000000000000..a3c0ce2d3a42 --- /dev/null +++ b/include/linux/dma-buf-mapping.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * DMA BUF Mapping Helpers + * + */ +#ifndef __DMA_BUF_MAPPING_H__ +#define __DMA_BUF_MAPPING_H__ +#include + +struct sg_table *dma_buf_phys_vec_to_sgt(struct dma_buf_attachment *attach, + struct p2pdma_provider *provider, + struct dma_buf_phys_vec *phys_vec, + size_t nr_ranges, size_t size, + enum dma_data_direction dir); +void dma_buf_free_sgt(struct dma_buf_attachment *attach, struct sg_table *sgt, + enum dma_data_direction dir); +#endif diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index d58e329ac0e7..0bc492090237 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -22,6 +22,7 @@ #include #include #include +#include struct device; struct dma_buf; @@ -530,6 +531,16 @@ struct dma_buf_export_info { void *priv; }; +/** + * struct dma_buf_phys_vec - describe continuous chunk of memory + * @paddr: physical address of that chunk + * @len: Length of this chunk + */ +struct dma_buf_phys_vec { + phys_addr_t paddr; + size_t len; +}; + /** * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters * @name: export-info name -- cgit v1.2.3 From 64a5dedcff801072154a806102d731ecdf0e7552 Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 20 Nov 2025 11:28:26 +0200 Subject: 
vfio: Export vfio device get and put registration helpers These helpers are useful for managing additional references taken on the device from other associated VFIO modules. Original-patch-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Tested-by: Alex Mastro Tested-by: Nicolin Chen Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-7-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/vfio.h b/include/linux/vfio.h index eb563f538dee..217ba4ef1752 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -297,6 +297,8 @@ static inline void vfio_put_device(struct vfio_device *device) int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +void vfio_device_put_registration(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); -- cgit v1.2.3 From 8312cab5ff4702389a86129051eba6ea046a71a1 Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Thu, 20 Nov 2025 15:56:47 +0100 Subject: timers/migration: Rename 'online' bit to 'available' The timer migration hierarchy excludes offline CPUs via the tmigr_is_not_available function, which is essentially checking the online bit for the CPU. Rename the online bit to available and all references in function names and tracepoint to generalise the concept of available CPUs. 
Signed-off-by: Gabriele Monaco Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Reviewed-by: Thomas Gleixner Link: https://patch.msgid.link/20251120145653.296659-2-gmonaco@redhat.com --- include/trace/events/timer_migration.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h index 47db5eaf2f9a..61171b13c687 100644 --- a/include/trace/events/timer_migration.h +++ b/include/trace/events/timer_migration.h @@ -173,14 +173,14 @@ DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_active, TP_ARGS(tmc) ); -DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_online, +DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_available, TP_PROTO(struct tmigr_cpu *tmc), TP_ARGS(tmc) ); -DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_offline, +DEFINE_EVENT(tmigr_cpugroup, tmigr_cpu_unavailable, TP_PROTO(struct tmigr_cpu *tmc), -- cgit v1.2.3 From b56651007fc018effe695a68d48caa6970b23094 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 20 Nov 2025 15:56:52 +0100 Subject: cpumask: Add initialiser to use cleanup helpers Now we can simplify code that allocates cpumasks for local needs. Automatic variables have to be initialized at declaration, or at least before any possibility for the logic to return, so that the compiler doesn't try to call the associated destructor function on a random stack value. Because cpumask_var_t, depending on the CPUMASK_OFFSTACK config, is either a pointer or an array, we have to have a macro for initialization. So define a CPUMASK_VAR_NULL macro, which allows initialising a struct cpumask pointer with NULL when CPUMASK_OFFSTACK is enabled, and is effectively a no-op when CPUMASK_OFFSTACK is disabled (initialisation optimised out with -O2).
Signed-off-by: Yury Norov Signed-off-by: Gabriele Monaco Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://patch.msgid.link/20251120145653.296659-7-gmonaco@redhat.com --- include/linux/cpumask.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce6..68be522449ec 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -1005,6 +1005,7 @@ static __always_inline unsigned int cpumask_size(void) #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly +#define CPUMASK_VAR_NULL NULL bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); @@ -1051,6 +1052,7 @@ static __always_inline bool cpumask_available(cpumask_var_t mask) #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly +#define CPUMASK_VAR_NULL {} static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { -- cgit v1.2.3 From 7dec062cfcf27808dbb70a0b231d1a698792743d Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Thu, 20 Nov 2025 15:56:53 +0100 Subject: timers/migration: Exclude isolated cpus from hierarchy The timer migration mechanism allows active CPUs to pull timers from idle ones to improve the overall idle time. This is however undesired when CPU intensive workloads run on isolated cores, as the algorithm would move the timers from housekeeping to isolated cores, negatively affecting the isolation. 
Exclude isolated cores from the timer migration algorithm and extend the concept of unavailable cores, currently used for offline ones, to isolated ones: * A core is unavailable if isolated or offline; * A core is available if non-isolated and online; A core is considered unavailable as isolated if it belongs to: * the isolcpus (domain) list * an isolated cpuset Except if it is: * in the nohz_full list (already idle for the hierarchy) * the nohz timekeeper core (must be available to handle global timers) CPUs are added to the hierarchy during late boot, excluding isolated ones; the hierarchy is also adapted when the cpuset isolation changes. Due to how the timer migration algorithm works, any CPU that is part of the hierarchy can have its global timers pulled by remote CPUs and has to pull remote timers; only skipping the pulling of remote timers would break the logic. For this reason, prevent isolated CPUs from pulling remote global timers, but also the other way around: any global timer started on an isolated CPU will run there. This does not break the concept of isolation (global timers don't come from outside the CPU) and, if considered inappropriate, can usually be mitigated with other isolation techniques (e.g. IRQ pinning). This effect was noticed on a 128-core machine running oslat on the isolated cores (1-31,33-63,65-95,97-127). The tool monopolises CPUs, and the CPU with the lowest count in a timer migration hierarchy (here 1 and 65) appears as always active and continuously pulls global timers from the housekeeping CPUs. This ends up moving driver work (e.g. delayed work) to isolated CPUs and causes latency spikes: before the change: # oslat -c 1-31,33-63,65-95,97-127 -D 62s ... Maximum: 1203 10 3 4 ... 5 (us) after the change: # oslat -c 1-31,33-63,65-95,97-127 -D 62s ... Maximum: 10 4 3 4 3 ... 5 (us) The same behaviour was observed on a machine with as few as 20 cores / 40 threads with isolcpus set to: 1-9,11-39 with rtla-osnoise-top.
Signed-off-by: Gabriele Monaco Signed-off-by: Thomas Gleixner Tested-by: John B. Wyatt IV Reviewed-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://patch.msgid.link/20251120145653.296659-8-gmonaco@redhat.com --- include/linux/timer.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/timer.h b/include/linux/timer.h index 0414d9e6b4fc..62e1cea71125 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -188,4 +188,13 @@ int timers_dead_cpu(unsigned int cpu); #define timers_dead_cpu NULL #endif +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask); +#else +static inline int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + return 0; +} +#endif + #endif -- cgit v1.2.3 From d0d9a9629f505ac70e1ffd172e092ff71f5d989a Mon Sep 17 00:00:00 2001 From: Elaine Zhang Date: Tue, 11 Nov 2025 10:57:35 +0800 Subject: dt-bindings: clock, reset: Add support for rv1126b Add clock and reset ID defines for rv1126b. Also add documentation for the rv1126b CRU core. 
Signed-off-by: Elaine Zhang Acked-by: Conor Dooley Link: https://patch.msgid.link/20251111025738.869847-3-zhangqing@rock-chips.com Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rockchip,rv1126b-cru.h | 392 ++++++++++++++++++++++ include/dt-bindings/reset/rockchip,rv1126b-cru.h | 405 +++++++++++++++++++++++ 2 files changed, 797 insertions(+) create mode 100644 include/dt-bindings/clock/rockchip,rv1126b-cru.h create mode 100644 include/dt-bindings/reset/rockchip,rv1126b-cru.h (limited to 'include') diff --git a/include/dt-bindings/clock/rockchip,rv1126b-cru.h b/include/dt-bindings/clock/rockchip,rv1126b-cru.h new file mode 100644 index 000000000000..721d50a1419f --- /dev/null +++ b/include/dt-bindings/clock/rockchip,rv1126b-cru.h @@ -0,0 +1,392 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (c) 2025 Rockchip Electronics Co., Ltd. + * Author: Elaine Zhang + */ + +#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RV1126B_H +#define _DT_BINDINGS_CLK_ROCKCHIP_RV1126B_H + +/* pll clocks */ +#define PLL_GPLL 0 +#define PLL_CPLL 1 +#define PLL_AUPLL 2 +#define ARMCLK 3 +#define SCLK_DDR 4 + +/* clk (clocks) */ +#define CLK_CPLL_DIV20 5 +#define CLK_CPLL_DIV10 6 +#define CLK_CPLL_DIV8 7 +#define CLK_GPLL_DIV8 8 +#define CLK_GPLL_DIV6 9 +#define CLK_GPLL_DIV4 10 +#define CLK_CPLL_DIV3 11 +#define CLK_GPLL_DIV3 12 +#define CLK_CPLL_DIV2 13 +#define CLK_GPLL_DIV2 14 +#define CLK_CM_FRAC0 15 +#define CLK_CM_FRAC1 16 +#define CLK_CM_FRAC2 17 +#define CLK_UART_FRAC0 18 +#define CLK_UART_FRAC1 19 +#define CLK_AUDIO_FRAC0 20 +#define CLK_AUDIO_FRAC1 21 +#define CLK_AUDIO_INT0 22 +#define CLK_AUDIO_INT1 23 +#define SCLK_UART0_SRC 24 +#define SCLK_UART1 25 +#define SCLK_UART2 26 +#define SCLK_UART3 27 +#define SCLK_UART4 28 +#define SCLK_UART5 29 +#define SCLK_UART6 30 +#define SCLK_UART7 31 +#define MCLK_SAI0 32 +#define MCLK_SAI1 33 +#define MCLK_SAI2 34 +#define MCLK_PDM 35 +#define CLKOUT_PDM 36 +#define MCLK_ASRC0 37 +#define MCLK_ASRC1 38 +#define 
MCLK_ASRC2 39 +#define MCLK_ASRC3 40 +#define CLK_ASRC0 41 +#define CLK_ASRC1 42 +#define CLK_CORE_PLL 43 +#define CLK_NPU_PLL 44 +#define CLK_VEPU_PLL 45 +#define CLK_ISP_PLL 46 +#define CLK_AISP_PLL 47 +#define CLK_SARADC0_SRC 48 +#define CLK_SARADC1_SRC 49 +#define CLK_SARADC2_SRC 50 +#define HCLK_NPU_ROOT 51 +#define PCLK_NPU_ROOT 52 +#define ACLK_VEPU_ROOT 53 +#define HCLK_VEPU_ROOT 54 +#define PCLK_VEPU_ROOT 55 +#define CLK_CORE_RGA_SRC 56 +#define ACLK_GMAC_ROOT 57 +#define ACLK_VI_ROOT 58 +#define HCLK_VI_ROOT 59 +#define PCLK_VI_ROOT 60 +#define DCLK_VICAP_ROOT 61 +#define CLK_SYS_DSMC_ROOT 62 +#define ACLK_VDO_ROOT 63 +#define ACLK_RKVDEC_ROOT 64 +#define HCLK_VDO_ROOT 65 +#define PCLK_VDO_ROOT 66 +#define DCLK_OOC_SRC 67 +#define DCLK_VOP 68 +#define DCLK_DECOM_SRC 69 +#define PCLK_DDR_ROOT 70 +#define ACLK_SYSMEM_SRC 71 +#define ACLK_TOP_ROOT 72 +#define ACLK_BUS_ROOT 73 +#define HCLK_BUS_ROOT 74 +#define PCLK_BUS_ROOT 75 +#define CCLK_SDMMC0 76 +#define CCLK_SDMMC1 77 +#define CCLK_EMMC 78 +#define SCLK_2X_FSPI0 79 +#define CLK_GMAC_PTP_REF_SRC 80 +#define CLK_GMAC_125M 81 +#define CLK_TIMER_ROOT 82 +#define TCLK_WDT_NS_SRC 83 +#define TCLK_WDT_S_SRC 84 +#define TCLK_WDT_HPMCU 85 +#define CLK_CAN0 86 +#define CLK_CAN1 87 +#define PCLK_PERI_ROOT 88 +#define ACLK_PERI_ROOT 89 +#define CLK_I2C_BUS_SRC 90 +#define CLK_SPI0 91 +#define CLK_SPI1 92 +#define BUSCLK_PMU_SRC 93 +#define CLK_PWM0 94 +#define CLK_PWM2 95 +#define CLK_PWM3 96 +#define CLK_PKA_RKCE_SRC 97 +#define ACLK_RKCE_SRC 98 +#define ACLK_VCP_ROOT 99 +#define HCLK_VCP_ROOT 100 +#define PCLK_VCP_ROOT 101 +#define CLK_CORE_FEC_SRC 102 +#define CLK_CORE_AVSP_SRC 103 +#define CLK_50M_GMAC_IOBUF_VI 104 +#define PCLK_TOP_ROOT 105 +#define CLK_MIPI0_OUT2IO 106 +#define CLK_MIPI1_OUT2IO 107 +#define CLK_MIPI2_OUT2IO 108 +#define CLK_MIPI3_OUT2IO 109 +#define CLK_CIF_OUT2IO 110 +#define CLK_MAC_OUT2IO 111 +#define MCLK_SAI0_OUT2IO 112 +#define MCLK_SAI1_OUT2IO 113 +#define MCLK_SAI2_OUT2IO 114 
+#define CLK_CM_FRAC0_SRC 115 +#define CLK_CM_FRAC1_SRC 116 +#define CLK_CM_FRAC2_SRC 117 +#define CLK_UART_FRAC0_SRC 118 +#define CLK_UART_FRAC1_SRC 119 +#define CLK_AUDIO_FRAC0_SRC 120 +#define CLK_AUDIO_FRAC1_SRC 121 +#define ACLK_NPU_ROOT 122 +#define HCLK_RKNN 123 +#define ACLK_RKNN 124 +#define PCLK_GPIO3 125 +#define DBCLK_GPIO3 126 +#define PCLK_IOC_VCCIO3 127 +#define PCLK_SARADC0 128 +#define CLK_SARADC0 129 +#define HCLK_SDMMC1 130 +#define HCLK_VEPU 131 +#define ACLK_VEPU 132 +#define CLK_CORE_VEPU 133 +#define HCLK_FEC 134 +#define ACLK_FEC 135 +#define CLK_CORE_FEC 136 +#define HCLK_AVSP 137 +#define ACLK_AVSP 138 +#define BUSCLK_PMU1_ROOT 139 +#define HCLK_AISP 140 +#define ACLK_AISP 141 +#define CLK_CORE_AISP 142 +#define CLK_CORE_ISP_ROOT 143 +#define PCLK_DSMC 144 +#define ACLK_DSMC 145 +#define HCLK_CAN0 146 +#define HCLK_CAN1 147 +#define PCLK_GPIO2 148 +#define DBCLK_GPIO2 149 +#define PCLK_GPIO4 150 +#define DBCLK_GPIO4 151 +#define PCLK_GPIO5 152 +#define DBCLK_GPIO5 153 +#define PCLK_GPIO6 154 +#define DBCLK_GPIO6 155 +#define PCLK_GPIO7 156 +#define DBCLK_GPIO7 157 +#define PCLK_IOC_VCCIO2 158 +#define PCLK_IOC_VCCIO4 159 +#define PCLK_IOC_VCCIO5 160 +#define PCLK_IOC_VCCIO6 161 +#define PCLK_IOC_VCCIO7 162 +#define HCLK_ISP 163 +#define ACLK_ISP 164 +#define CLK_CORE_ISP 165 +#define HCLK_VICAP 166 +#define ACLK_VICAP 167 +#define DCLK_VICAP 168 +#define ISP0CLK_VICAP 169 +#define HCLK_VPSS 170 +#define ACLK_VPSS 171 +#define CLK_CORE_VPSS 172 +#define PCLK_CSI2HOST0 173 +#define DCLK_CSI2HOST0 174 +#define PCLK_CSI2HOST1 175 +#define DCLK_CSI2HOST1 176 +#define PCLK_CSI2HOST2 177 +#define DCLK_CSI2HOST2 178 +#define PCLK_CSI2HOST3 179 +#define DCLK_CSI2HOST3 180 +#define HCLK_SDMMC0 181 +#define ACLK_GMAC 182 +#define PCLK_GMAC 183 +#define CLK_GMAC_PTP_REF 184 +#define PCLK_CSIPHY0 185 +#define PCLK_CSIPHY1 186 +#define PCLK_MACPHY 187 +#define PCLK_SARADC1 188 +#define CLK_SARADC1 189 +#define PCLK_SARADC2 190 +#define CLK_SARADC2 191 
+#define ACLK_RKVDEC 192 +#define HCLK_RKVDEC 193 +#define CLK_HEVC_CA_RKVDEC 194 +#define ACLK_VOP 195 +#define HCLK_VOP 196 +#define HCLK_RKJPEG 197 +#define ACLK_RKJPEG 198 +#define ACLK_RKMMU_DECOM 199 +#define HCLK_RKMMU_DECOM 200 +#define DCLK_DECOM 201 +#define ACLK_DECOM 202 +#define PCLK_DECOM 203 +#define PCLK_MIPI_DSI 204 +#define PCLK_DSIPHY 205 +#define ACLK_OOC 206 +#define ACLK_SYSMEM 207 +#define PCLK_DDRC 208 +#define PCLK_DDRMON 209 +#define CLK_TIMER_DDRMON 210 +#define PCLK_DFICTRL 211 +#define PCLK_DDRPHY 212 +#define PCLK_DMA2DDR 213 +#define CLK_RCOSC_SRC 214 +#define BUSCLK_PMU_MUX 215 +#define BUSCLK_PMU_ROOT 216 +#define PCLK_PMU 217 +#define CLK_XIN_RC_DIV 218 +#define CLK_32K 219 +#define PCLK_PMU_GPIO0 220 +#define DBCLK_PMU_GPIO0 221 +#define PCLK_PMU_HP_TIMER 222 +#define CLK_PMU_HP_TIMER 223 +#define CLK_PMU_32K_HP_TIMER 224 +#define PCLK_PWM1 225 +#define CLK_PWM1 226 +#define CLK_OSC_PWM1 227 +#define CLK_RC_PWM1 228 +#define CLK_FREQ_PWM1 229 +#define CLK_COUNTER_PWM1 230 +#define PCLK_I2C2 231 +#define CLK_I2C2 232 +#define PCLK_UART0 233 +#define SCLK_UART0 234 +#define PCLK_RCOSC_CTRL 235 +#define CLK_OSC_RCOSC_CTRL 236 +#define CLK_REF_RCOSC_CTRL 237 +#define PCLK_IOC_PMUIO0 238 +#define CLK_REFOUT 239 +#define CLK_PREROLL 240 +#define CLK_PREROLL_32K 241 +#define HCLK_PMU_SRAM 242 +#define PCLK_WDT_LPMCU 243 +#define TCLK_WDT_LPMCU 244 +#define CLK_LPMCU 245 +#define CLK_LPMCU_RTC 246 +#define PCLK_LPMCU_MAILBOX 247 +#define HCLK_OOC 248 +#define PCLK_SPI2AHB 249 +#define HCLK_SPI2AHB 250 +#define HCLK_FSPI1 251 +#define HCLK_XIP_FSPI1 252 +#define SCLK_1X_FSPI1 253 +#define PCLK_IOC_PMUIO1 254 +#define PCLK_AUDIO_ADC_PMU 255 +#define MCLK_AUDIO_ADC_PMU 256 +#define MCLK_AUDIO_ADC_DIV4_PMU 257 +#define MCLK_LPSAI 258 +#define ACLK_GIC400 259 +#define PCLK_WDT_NS 260 +#define TCLK_WDT_NS 261 +#define PCLK_WDT_HPMCU 262 +#define HCLK_CACHE 263 +#define PCLK_HPMCU_MAILBOX 264 +#define PCLK_HPMCU_INTMUX 265 +#define CLK_HPMCU 266 
+#define CLK_HPMCU_RTC 267 +#define PCLK_RKDMA 268 +#define ACLK_RKDMA 269 +#define PCLK_DCF 270 +#define ACLK_DCF 271 +#define HCLK_RGA 272 +#define ACLK_RGA 273 +#define CLK_CORE_RGA 274 +#define PCLK_TIMER 275 +#define CLK_TIMER0 276 +#define CLK_TIMER1 277 +#define CLK_TIMER2 278 +#define CLK_TIMER3 279 +#define CLK_TIMER4 280 +#define CLK_TIMER5 281 +#define PCLK_I2C0 282 +#define CLK_I2C0 283 +#define PCLK_I2C1 284 +#define CLK_I2C1 285 +#define PCLK_I2C3 286 +#define CLK_I2C3 287 +#define PCLK_I2C4 288 +#define CLK_I2C4 289 +#define PCLK_I2C5 290 +#define CLK_I2C5 291 +#define PCLK_SPI0 292 +#define PCLK_SPI1 293 +#define PCLK_PWM0 294 +#define CLK_OSC_PWM0 295 +#define CLK_RC_PWM0 296 +#define PCLK_PWM2 297 +#define CLK_OSC_PWM2 298 +#define CLK_RC_PWM2 299 +#define PCLK_PWM3 300 +#define CLK_OSC_PWM3 301 +#define CLK_RC_PWM3 302 +#define PCLK_UART1 303 +#define PCLK_UART2 304 +#define PCLK_UART3 305 +#define PCLK_UART4 306 +#define PCLK_UART5 307 +#define PCLK_UART6 308 +#define PCLK_UART7 309 +#define PCLK_TSADC 310 +#define CLK_TSADC 311 +#define HCLK_SAI0 312 +#define HCLK_SAI1 313 +#define HCLK_SAI2 314 +#define HCLK_RKDSM 315 +#define MCLK_RKDSM 316 +#define HCLK_PDM 317 +#define HCLK_ASRC0 318 +#define HCLK_ASRC1 319 +#define PCLK_AUDIO_ADC_BUS 320 +#define MCLK_AUDIO_ADC_BUS 321 +#define MCLK_AUDIO_ADC_DIV4_BUS 322 +#define PCLK_RKCE 323 +#define HCLK_NS_RKCE 324 +#define PCLK_OTPC_NS 325 +#define CLK_SBPI_OTPC_NS 326 +#define CLK_USER_OTPC_NS 327 +#define CLK_OTPC_ARB 328 +#define PCLK_OTP_MASK 329 +#define CLK_TSADC_PHYCTRL 330 +#define LRCK_SRC_ASRC0 331 +#define LRCK_DST_ASRC0 332 +#define LRCK_SRC_ASRC1 333 +#define LRCK_DST_ASRC1 334 +#define PCLK_KEY_READER 335 +#define ACLK_NSRKCE 336 +#define CLK_PKA_NSRKCE 337 +#define PCLK_RTC_ROOT 338 +#define PCLK_GPIO1 339 +#define DBCLK_GPIO1 340 +#define PCLK_IOC_VCCIO1 341 +#define ACLK_USB3OTG 342 +#define CLK_REF_USB3OTG 343 +#define CLK_SUSPEND_USB3OTG 344 +#define HCLK_USB2HOST 345 +#define 
HCLK_ARB_USB2HOST 346 +#define PCLK_RTC_TEST 347 +#define HCLK_EMMC 348 +#define HCLK_FSPI0 349 +#define HCLK_XIP_FSPI0 350 +#define PCLK_PIPEPHY 351 +#define PCLK_USB2PHY 352 +#define CLK_REF_PIPEPHY_CPLL_SRC 353 +#define CLK_REF_PIPEPHY 354 +#define HCLK_VPSL 355 +#define ACLK_VPSL 356 +#define CLK_CORE_VPSL 357 +#define CLK_MACPHY 358 +#define HCLK_RKRNG_NS 359 +#define HCLK_RKRNG_S_NS 360 +#define CLK_AISP_PLL_SRC 361 + +/* secure clks */ +#define CLK_USER_OTPC_S 362 +#define CLK_SBPI_OTPC_S 363 +#define PCLK_OTPC_S 364 +#define PCLK_KEY_READER_S 365 +#define HCLK_KL_RKCE_S 366 +#define HCLK_RKCE_S 367 +#define PCLK_WDT_S 368 +#define TCLK_WDT_S 369 +#define CLK_STIMER0 370 +#define CLK_STIMER1 371 +#define PLK_STIMER 372 +#define HCLK_RKRNG_S 373 +#define CLK_PKA_RKCE_S 374 +#define ACLK_RKCE_S 375 + +#endif diff --git a/include/dt-bindings/reset/rockchip,rv1126b-cru.h b/include/dt-bindings/reset/rockchip,rv1126b-cru.h new file mode 100644 index 000000000000..a7712db319d0 --- /dev/null +++ b/include/dt-bindings/reset/rockchip,rv1126b-cru.h @@ -0,0 +1,405 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR MIT) */ +/* + * Copyright (c) 2025 Rockchip Electronics Co., Ltd. 
+ * Author: Elaine Zhang + */ + +#ifndef _DT_BINDINGS_RESET_ROCKCHIP_RV1126B_H +#define _DT_BINDINGS_RESET_ROCKCHIP_RV1126B_H + +/* ==========================list all of reset fields id=========================== */ +/* TOPCRU-->SOFTRST_CON00 */ + +/* TOPCRU-->SOFTRST_CON15 */ +#define SRST_P_CRU 0 +#define SRST_P_CRU_BIU 1 + +/* BUSCRU-->SOFTRST_CON00 */ +#define SRST_A_TOP_BIU 2 +#define SRST_A_RKCE_BIU 3 +#define SRST_A_BUS_BIU 4 +#define SRST_H_BUS_BIU 5 +#define SRST_P_BUS_BIU 6 +#define SRST_P_CRU_BUS 7 +#define SRST_P_SYS_GRF 8 +#define SRST_H_BOOTROM 9 +#define SRST_A_GIC400 10 +#define SRST_A_SPINLOCK 11 +#define SRST_P_WDT_NS 12 +#define SRST_T_WDT_NS 13 + +/* BUSCRU-->SOFTRST_CON01 */ +#define SRST_P_WDT_HPMCU 14 +#define SRST_T_WDT_HPMCU 15 +#define SRST_H_CACHE 16 +#define SRST_P_HPMCU_MAILBOX 17 +#define SRST_P_HPMCU_INTMUX 18 +#define SRST_HPMCU_FULL_CLUSTER 19 +#define SRST_HPMCU_PWUP 20 +#define SRST_HPMCU_ONLY_CORE 21 +#define SRST_T_HPMCU_JTAG 22 +#define SRST_P_RKDMA 23 +#define SRST_A_RKDMA 24 + +/* BUSCRU-->SOFTRST_CON02 */ +#define SRST_P_DCF 25 +#define SRST_A_DCF 26 +#define SRST_H_RGA 27 +#define SRST_A_RGA 28 +#define SRST_CORE_RGA 29 +#define SRST_P_TIMER 30 +#define SRST_TIMER0 31 +#define SRST_TIMER1 32 +#define SRST_TIMER2 33 +#define SRST_TIMER3 34 +#define SRST_TIMER4 35 +#define SRST_TIMER5 36 +#define SRST_A_RKCE 37 +#define SRST_PKA_RKCE 38 +#define SRST_H_RKRNG_S 39 +#define SRST_H_RKRNG_NS 40 + +/* BUSCRU-->SOFTRST_CON03 */ +#define SRST_P_I2C0 41 +#define SRST_I2C0 42 +#define SRST_P_I2C1 43 +#define SRST_I2C1 44 +#define SRST_P_I2C3 45 +#define SRST_I2C3 46 +#define SRST_P_I2C4 47 +#define SRST_I2C4 48 +#define SRST_P_I2C5 49 +#define SRST_I2C5 50 +#define SRST_P_SPI0 51 +#define SRST_SPI0 52 +#define SRST_P_SPI1 53 +#define SRST_SPI1 54 + +/* BUSCRU-->SOFTRST_CON04 */ +#define SRST_P_PWM0 55 +#define SRST_PWM0 56 +#define SRST_P_PWM2 57 +#define SRST_PWM2 58 +#define SRST_P_PWM3 59 +#define SRST_PWM3 60 + +/* 
BUSCRU-->SOFTRST_CON05 */ +#define SRST_P_UART1 61 +#define SRST_S_UART1 62 +#define SRST_P_UART2 63 +#define SRST_S_UART2 64 +#define SRST_P_UART3 65 +#define SRST_S_UART3 66 +#define SRST_P_UART4 67 +#define SRST_S_UART4 68 +#define SRST_P_UART5 69 +#define SRST_S_UART5 70 +#define SRST_P_UART6 71 +#define SRST_S_UART6 72 +#define SRST_P_UART7 73 +#define SRST_S_UART7 74 + +/* BUSCRU-->SOFTRST_CON06 */ +#define SRST_P_TSADC 75 +#define SRST_TSADC 76 +#define SRST_H_SAI0 77 +#define SRST_M_SAI0 78 +#define SRST_H_SAI1 79 +#define SRST_M_SAI1 80 +#define SRST_H_SAI2 81 +#define SRST_M_SAI2 82 +#define SRST_H_RKDSM 83 +#define SRST_M_RKDSM 84 +#define SRST_H_PDM 85 +#define SRST_M_PDM 86 +#define SRST_PDM 87 + +/* BUSCRU-->SOFTRST_CON07 */ +#define SRST_H_ASRC0 88 +#define SRST_ASRC0 89 +#define SRST_H_ASRC1 90 +#define SRST_ASRC1 91 +#define SRST_P_AUDIO_ADC_BUS 92 +#define SRST_M_AUDIO_ADC_BUS 93 +#define SRST_P_RKCE 94 +#define SRST_H_NS_RKCE 95 +#define SRST_P_OTPC_NS 96 +#define SRST_SBPI_OTPC_NS 97 +#define SRST_USER_OTPC_NS 98 +#define SRST_OTPC_ARB 99 +#define SRST_P_OTP_MASK 100 + +/* PERICRU-->SOFTRST_CON00 */ +#define SRST_A_PERI_BIU 101 +#define SRST_P_PERI_BIU 102 +#define SRST_P_RTC_BIU 103 +#define SRST_P_CRU_PERI 104 +#define SRST_P_PERI_GRF 105 +#define SRST_P_GPIO1 106 +#define SRST_DB_GPIO1 107 +#define SRST_P_IOC_VCCIO1 108 +#define SRST_A_USB3OTG 109 +#define SRST_H_USB2HOST 110 +#define SRST_H_ARB_USB2HOST 111 +#define SRST_P_RTC_TEST 112 + +/* PERICRU-->SOFTRST_CON01 */ +#define SRST_H_EMMC 113 +#define SRST_H_FSPI0 114 +#define SRST_H_XIP_FSPI0 115 +#define SRST_S_2X_FSPI0 116 +#define SRST_UTMI_USB2HOST 117 +#define SRST_REF_PIPEPHY 118 +#define SRST_P_PIPEPHY 119 +#define SRST_P_PIPEPHY_GRF 120 +#define SRST_P_USB2PHY 121 +#define SRST_POR_USB2PHY 122 +#define SRST_OTG_USB2PHY 123 +#define SRST_HOST_USB2PHY 124 + +/* CORECRU-->SOFTRST_CON00 */ +#define SRST_REF_PVTPLL_CORE 125 +#define SRST_NCOREPORESET0 126 +#define SRST_NCORESET0 127 
+#define SRST_NCOREPORESET1 128 +#define SRST_NCORESET1 129 +#define SRST_NCOREPORESET2 130 +#define SRST_NCORESET2 131 +#define SRST_NCOREPORESET3 132 +#define SRST_NCORESET3 133 +#define SRST_NDBGRESET 134 +#define SRST_NL2RESET 135 + +/* CORECRU-->SOFTRST_CON01 */ +#define SRST_A_CORE_BIU 136 +#define SRST_P_CORE_BIU 137 +#define SRST_H_CORE_BIU 138 +#define SRST_P_DBG 139 +#define SRST_POT_DBG 140 +#define SRST_NT_DBG 141 +#define SRST_P_CORE_PVTPLL 142 +#define SRST_P_CRU_CORE 143 +#define SRST_P_CORE_GRF 144 +#define SRST_P_DFT2APB 145 + +/* PMUCRU-->SOFTRST_CON00 */ +#define SRST_H_PMU_BIU 146 +#define SRST_P_PMU_GPIO0 147 +#define SRST_DB_PMU_GPIO0 148 +#define SRST_P_PMU_HP_TIMER 149 +#define SRST_PMU_HP_TIMER 150 +#define SRST_PMU_32K_HP_TIMER 151 + +/* PMUCRU-->SOFTRST_CON01 */ +#define SRST_P_PWM1 152 +#define SRST_PWM1 153 +#define SRST_P_I2C2 154 +#define SRST_I2C2 155 +#define SRST_P_UART0 156 +#define SRST_S_UART0 157 + +/* PMUCRU-->SOFTRST_CON02 */ +#define SRST_P_RCOSC_CTRL 158 +#define SRST_REF_RCOSC_CTRL 159 +#define SRST_P_IOC_PMUIO0 160 +#define SRST_P_CRU_PMU 161 +#define SRST_P_PMU_GRF 162 +#define SRST_PREROLL 163 +#define SRST_PREROLL_32K 164 +#define SRST_H_PMU_SRAM 165 + +/* PMUCRU-->SOFTRST_CON03 */ +#define SRST_P_WDT_LPMCU 166 +#define SRST_T_WDT_LPMCU 167 +#define SRST_LPMCU_FULL_CLUSTER 168 +#define SRST_LPMCU_PWUP 169 +#define SRST_LPMCU_ONLY_CORE 170 +#define SRST_T_LPMCU_JTAG 171 +#define SRST_P_LPMCU_MAILBOX 172 + +/* PMU1CRU-->SOFTRST_CON00 */ +#define SRST_P_SPI2AHB 173 +#define SRST_H_SPI2AHB 174 +#define SRST_H_FSPI1 175 +#define SRST_H_XIP_FSPI1 176 +#define SRST_S_1X_FSPI1 177 +#define SRST_P_IOC_PMUIO1 178 +#define SRST_P_CRU_PMU1 179 +#define SRST_P_AUDIO_ADC_PMU 180 +#define SRST_M_AUDIO_ADC_PMU 181 +#define SRST_H_PMU1_BIU 182 + +/* PMU1CRU-->SOFTRST_CON01 */ +#define SRST_P_LPDMA 183 +#define SRST_A_LPDMA 184 +#define SRST_H_LPSAI 185 +#define SRST_M_LPSAI 186 +#define SRST_P_AOA_TDD 187 +#define SRST_P_AOA_FE 188 
+#define SRST_P_AOA_AAD 189 +#define SRST_P_AOA_APB 190 +#define SRST_P_AOA_SRAM 191 + +/* DDRCRU-->SOFTRST_CON00 */ +#define SRST_P_DDR_BIU 192 +#define SRST_P_DDRC 193 +#define SRST_P_DDRMON 194 +#define SRST_TIMER_DDRMON 195 +#define SRST_P_DFICTRL 196 +#define SRST_P_DDR_GRF 197 +#define SRST_P_CRU_DDR 198 +#define SRST_P_DDRPHY 199 +#define SRST_P_DMA2DDR 200 + +/* SUBDDRCRU-->SOFTRST_CON00 */ +#define SRST_A_SYSMEM_BIU 201 +#define SRST_A_SYSMEM 202 +#define SRST_A_DDR_BIU 203 +#define SRST_A_DDRSCH0_CPU 204 +#define SRST_A_DDRSCH1_NPU 205 +#define SRST_A_DDRSCH2_POE 206 +#define SRST_A_DDRSCH3_VI 207 +#define SRST_CORE_DDRC 208 +#define SRST_DDRMON 209 +#define SRST_DFICTRL 210 +#define SRST_RS 211 +#define SRST_A_DMA2DDR 212 +#define SRST_DDRPHY 213 + +/* VICRU-->SOFTRST_CON00 */ +#define SRST_REF_PVTPLL_ISP 214 +#define SRST_A_GMAC_BIU 215 +#define SRST_A_VI_BIU 216 +#define SRST_H_VI_BIU 217 +#define SRST_P_VI_BIU 218 +#define SRST_P_CRU_VI 219 +#define SRST_P_VI_GRF 220 +#define SRST_P_VI_PVTPLL 221 +#define SRST_P_DSMC 222 +#define SRST_A_DSMC 223 +#define SRST_H_CAN0 224 +#define SRST_CAN0 225 +#define SRST_H_CAN1 226 +#define SRST_CAN1 227 + +/* VICRU-->SOFTRST_CON01 */ +#define SRST_P_GPIO2 228 +#define SRST_DB_GPIO2 229 +#define SRST_P_GPIO4 230 +#define SRST_DB_GPIO4 231 +#define SRST_P_GPIO5 232 +#define SRST_DB_GPIO5 233 +#define SRST_P_GPIO6 234 +#define SRST_DB_GPIO6 235 +#define SRST_P_GPIO7 236 +#define SRST_DB_GPIO7 237 +#define SRST_P_IOC_VCCIO2 238 +#define SRST_P_IOC_VCCIO4 239 +#define SRST_P_IOC_VCCIO5 240 +#define SRST_P_IOC_VCCIO6 241 +#define SRST_P_IOC_VCCIO7 242 + +/* VICRU-->SOFTRST_CON02 */ +#define SRST_CORE_ISP 243 +#define SRST_H_VICAP 244 +#define SRST_A_VICAP 245 +#define SRST_D_VICAP 246 +#define SRST_ISP0_VICAP 247 +#define SRST_CORE_VPSS 248 +#define SRST_CORE_VPSL 249 +#define SRST_P_CSI2HOST0 250 +#define SRST_P_CSI2HOST1 251 +#define SRST_P_CSI2HOST2 252 +#define SRST_P_CSI2HOST3 253 +#define SRST_H_SDMMC0 254 +#define 
SRST_A_GMAC 255 +#define SRST_P_CSIPHY0 256 +#define SRST_P_CSIPHY1 257 + +/* VICRU-->SOFTRST_CON03 */ +#define SRST_P_MACPHY 258 +#define SRST_MACPHY 259 +#define SRST_P_SARADC1 260 +#define SRST_SARADC1 261 +#define SRST_P_SARADC2 262 +#define SRST_SARADC2 263 + +/* VEPUCRU-->SOFTRST_CON00 */ +#define SRST_REF_PVTPLL_VEPU 264 +#define SRST_A_VEPU_BIU 265 +#define SRST_H_VEPU_BIU 266 +#define SRST_P_VEPU_BIU 267 +#define SRST_P_CRU_VEPU 268 +#define SRST_P_VEPU_GRF 269 +#define SRST_P_GPIO3 270 +#define SRST_DB_GPIO3 271 +#define SRST_P_IOC_VCCIO3 272 +#define SRST_P_SARADC0 273 +#define SRST_SARADC0 274 +#define SRST_H_SDMMC1 275 + +/* VEPUCRU-->SOFTRST_CON01 */ +#define SRST_P_VEPU_PVTPLL 276 +#define SRST_H_VEPU 277 +#define SRST_A_VEPU 278 +#define SRST_CORE_VEPU 279 + +/* NPUCRU-->SOFTRST_CON00 */ +#define SRST_REF_PVTPLL_NPU 280 +#define SRST_A_NPU_BIU 281 +#define SRST_H_NPU_BIU 282 +#define SRST_P_NPU_BIU 283 +#define SRST_P_CRU_NPU 284 +#define SRST_P_NPU_GRF 285 +#define SRST_P_NPU_PVTPLL 286 +#define SRST_H_RKNN 287 +#define SRST_A_RKNN 288 + +/* VDOCRU-->SOFTRST_CON00 */ +#define SRST_A_RKVDEC_BIU 289 +#define SRST_A_VDO_BIU 290 +#define SRST_H_VDO_BIU 291 +#define SRST_P_VDO_BIU 292 +#define SRST_P_CRU_VDO 293 +#define SRST_P_VDO_GRF 294 +#define SRST_A_RKVDEC 295 +#define SRST_H_RKVDEC 296 +#define SRST_HEVC_CA_RKVDEC 297 +#define SRST_A_VOP 298 +#define SRST_H_VOP 299 +#define SRST_D_VOP 300 +#define SRST_A_OOC 301 +#define SRST_H_OOC 302 +#define SRST_D_OOC 303 + +/* VDOCRU-->SOFTRST_CON01 */ +#define SRST_H_RKJPEG 304 +#define SRST_A_RKJPEG 305 +#define SRST_A_RKMMU_DECOM 306 +#define SRST_H_RKMMU_DECOM 307 +#define SRST_D_DECOM 308 +#define SRST_A_DECOM 309 +#define SRST_P_DECOM 310 +#define SRST_P_MIPI_DSI 311 +#define SRST_P_DSIPHY 312 + +/* VCPCRU-->SOFTRST_CON00 */ +#define SRST_REF_PVTPLL_VCP 313 +#define SRST_A_VCP_BIU 314 +#define SRST_H_VCP_BIU 315 +#define SRST_P_VCP_BIU 316 +#define SRST_P_CRU_VCP 317 +#define SRST_P_VCP_GRF 318 
+#define SRST_P_VCP_PVTPLL 319 +#define SRST_A_AISP_BIU 320 +#define SRST_H_AISP_BIU 321 +#define SRST_CORE_AISP 322 + +/* VCPCRU-->SOFTRST_CON01 */ +#define SRST_H_FEC 323 +#define SRST_A_FEC 324 +#define SRST_CORE_FEC 325 +#define SRST_H_AVSP 326 +#define SRST_A_AVSP 327 + +#endif -- cgit v1.2.3 From 84692a1519b32d61ff882cf24a9eda900961acad Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Thu, 20 Nov 2025 11:15:56 -0800 Subject: io_uring/kbuf: remove obsolete buf_nr_pages and update comments The buf_nr_pages field in io_buffer_list was previously used to determine whether the buffer list uses ring-provided buffers or classic provided buffers. This is now determined by checking the IOBL_BUF_RING flag. Remove the buf_nr_pages field and update related comments. Signed-off-by: Joanne Koong Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 92780764d5fa..e1adb0d20a0a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -327,8 +327,8 @@ struct io_ring_ctx { /* * Modifications are protected by ->uring_lock and ->mmap_lock. - * The flags, buf_pages and buf_nr_pages fields should be stable - * once published. + * The buffer list's io mapped region should be stable once + * published. */ struct xarray io_bl_xa; -- cgit v1.2.3 From c04507ac500e2cc8048000c2a849588227554e06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 16 Nov 2025 21:51:07 +0100 Subject: sched: Provide and use set_need_resched_current() set_tsk_need_resched(current) requires set_preempt_need_resched(current) to work correctly outside of the scheduler. Provide set_need_resched_current() which wraps this correctly and replace all the open coded instances. 
Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251116174750.665769842@linutronix.de --- include/linux/sched.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index bb436ee1942d..021d05aa941a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2058,6 +2058,13 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +static inline void set_need_resched_current(void) +{ + lockdep_assert_irqs_disabled(); + set_tsk_need_resched(current); + set_preempt_need_resched(); +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return -- cgit v1.2.3 From 898f94465205e33295c29333a82a249b8f90aa74 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 Oct 2025 09:12:39 -0400 Subject: lockd: don't allow locking on reexported NFSv2/3 Since commit 9254c8ae9b81 ("nfsd: disallow file locking and delegations for NFSv4 reexport"), file locking when reexporting an NFS mount via NFSv4 is expressly prohibited by nfsd. Do the same in lockd: Add a new nlmsvc_file_cannot_lock() helper that will test whether file locking is allowed for a given file, and return nlm_lck_denied_nolocks if it isn't. Signed-off-by: Jeff Layton Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever --- include/linux/lockd/lockd.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index c8f0f9458f2c..330e38776bb2 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -12,6 +12,7 @@ /* XXX: a lot of this should really be under fs/lockd. 
*/ +#include #include #include #include @@ -307,7 +308,7 @@ void nlmsvc_invalidate_all(void); int nlmsvc_unlock_all_by_sb(struct super_block *sb); int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr); -static inline struct file *nlmsvc_file_file(struct nlm_file *file) +static inline struct file *nlmsvc_file_file(const struct nlm_file *file) { return file->f_file[O_RDONLY] ? file->f_file[O_RDONLY] : file->f_file[O_WRONLY]; @@ -318,6 +319,12 @@ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file) return file_inode(nlmsvc_file_file(file)); } +static inline bool +nlmsvc_file_cannot_lock(const struct nlm_file *file) +{ + return exportfs_cannot_lock(nlmsvc_file_file(file)->f_path.dentry->d_sb->s_export_op); +} + static inline int __nlm_privileged_request4(const struct sockaddr *sap) { const struct sockaddr_in *sin = (struct sockaddr_in *)sap; -- cgit v1.2.3 From 340b59816bc417c306cd76b867914cfb4f386d2d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 4 Nov 2025 16:57:09 +0800 Subject: mm: kill mm_wr_locked from unmap_vmas() and unmap_single_vma() Kill mm_wr_locked since commit f8e97613fed2 ("mm: convert VM_PFNMAP tracking to pfnmap_track() + pfnmap_untrack()") remove the user. Link: https://lkml.kernel.org/r/20251104085709.2688433-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Acked-by: David Hildenbrand (Red Hat) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index b636d12bb651..df9f258a017c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2480,7 +2480,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, unsigned long tree_end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end); struct mmu_notifier_range; -- cgit v1.2.3 From 5dba5cc2e0ffa76f2f6c8922a04469dc9602c396 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:43 +0000 Subject: mm: introduce VM_MAYBE_GUARD and make visible in /proc/$pid/smaps Patch series "introduce VM_MAYBE_GUARD and make it sticky", v4. Currently, guard regions are not visible to users except through /proc/$pid/pagemap, with no explicit visibility at the VMA level. This makes the feature less useful, as it isn't entirely apparent which VMAs may have these entries present, especially when performing actions which walk through memory regions such as those performed by CRIU. This series addresses this issue by introducing the VM_MAYBE_GUARD flag which fulfils this role, updating the smaps logic to display an entry for these. The semantics of this flag are that a guard region MAY be present if set (we cannot be sure, as we can't efficiently track whether an MADV_GUARD_REMOVE finally removes all the guard regions in a VMA) - but if not set the VMA definitely does NOT have any guard regions present. It's problematic to establish this flag without further action, because that means that VMAs with guard regions in them become non-mergeable with adjacent VMAs for no especially good reason. 
To work around this, this series also introduces the concept of 'sticky' VMA flags - that is flags which: a. if set in one VMA and not in another still permit those VMAs to be merged (if otherwise compatible). b. When they are merged, the resultant VMA must have the flag set. The VMA logic is updated to propagate these flags correctly. Additionally, VM_MAYBE_GUARD being an explicit VMA flag allows us to solve an issue with file-backed guard regions - previously these established an anon_vma object for file-backed mappings solely to have vma_needs_copy() correctly propagate guard region mappings to child processes. We introduce a new flag alias VM_COPY_ON_FORK (which currently only specifies VM_MAYBE_GUARD) and update vma_needs_copy() to check explicitly for this flag and to copy page tables if it is present, which resolves this issue. Additionally, we add the ability for allow-listed VMA flags to be atomically writable with only mmap/VMA read locks held. The only flag we allow so far is VM_MAYBE_GUARD, which we carefully ensure does not cause any races by being allowed to do so. This allows us to maintain guard region installation as a read-locked operation and not endure the overhead of obtaining a write lock here. Finally we introduce extensive VMA userland tests to assert that the sticky VMA logic behaves correctly as well as guard region self tests to assert that smaps visibility is correctly implemented. This patch (of 9): Currently, if a user needs to determine if guard regions are present in a range, they have to scan all VMAs (or have knowledge of which ones might have guard regions). Since commit 8e2f2aeb8b48 ("fs/proc/task_mmu: add guard region bit to pagemap") and the related commit a516403787e0 ("fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions"), users can use either /proc/$pid/pagemap or the PAGEMAP_SCAN functionality to perform this operation at a virtual address level. 
This is not ideal, and it gives no visibility at a /proc/$pid/smaps level that guard regions exist in ranges. This patch remedies the situation by establishing a new VMA flag, VM_MAYBE_GUARD, to indicate that a VMA may contain guard regions (it is uncertain because we cannot reasonably determine whether a MADV_GUARD_REMOVE call has removed all of the guard regions in a VMA, and additionally VMAs may change across merge/split). We utilise 0x800 for this flag which makes it available to 32-bit architectures also, a flag that was previously used by VM_DENYWRITE, which was removed in commit 8d0920bde5eb ("mm: remove VM_DENYWRITE") and hasn't been reused yet. We also update the smaps logic and documentation to identify these VMAs. Another major use of this functionality is that we can use it to identify that we ought to copy page tables on fork. We do not actually implement usage of this flag in mm/madvise.c yet as we need to allow some VMA flags to be applied atomically under mmap/VMA read lock in order to avoid the need to acquire a write lock for this purpose. 
Link: https://lkml.kernel.org/r/cover.1763460113.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/cf8ef821eba29b6c5b5e138fffe95d6dcabdedb9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ include/trace/events/mmflags.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index df9f258a017c..36b9418c00fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -271,6 +271,8 @@ extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif +#define VM_MAYBE_GUARD_BIT 11 + /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h @@ -296,6 +298,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ +#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. 
*/ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index aa441f593e9a..a6e5a44c9b42 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -213,6 +213,7 @@ IF_HAVE_PG_ARCH_3(arch_3) {VM_UFFD_MISSING, "uffd_missing" }, \ IF_HAVE_UFFD_MINOR(VM_UFFD_MINOR, "uffd_minor" ) \ {VM_PFNMAP, "pfnmap" }, \ + {VM_MAYBE_GUARD, "maybe_guard" }, \ {VM_UFFD_WP, "uffd_wp" }, \ {VM_LOCKED, "locked" }, \ {VM_IO, "io" }, \ -- cgit v1.2.3 From 568822502383acd57d7cc1c72ee43932c45a9524 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:44 +0000 Subject: mm: add atomic VMA flags and set VM_MAYBE_GUARD as such This patch adds the ability to atomically set VMA flags with only the mmap read/VMA read lock held. As this could be hugely problematic for VMA flags in general given that all other accesses are non-atomic and serialised by the mmap/VMA locks, we implement this with a strict allow-list - that is, only designated flags are allowed to do this. We make VM_MAYBE_GUARD one of these flags. 
Link: https://lkml.kernel.org/r/97e57abed09f2663077ed7a36fb8206e243171a9.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lance Yang Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 36b9418c00fc..03776aab3837 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -518,6 +518,9 @@ extern unsigned int kobjsize(const void *objp); /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) +/* These flags can be updated atomically via VMA/mmap read lock. */ +#define VM_ATOMIC_SET_ALLOWED VM_MAYBE_GUARD + /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE @@ -860,6 +863,47 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, __vm_flags_mod(vma, set, clear); } +static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, + int bit) +{ + const vm_flags_t mask = BIT(bit); + + /* Only specific flags are permitted */ + if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) + return false; + + return true; +} + +/* + * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific + * valid flags are allowed to do this. + */ +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) +{ + /* mmap read lock/VMA read lock must be held. 
*/ + if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) + vma_assert_locked(vma); + + if (__vma_flag_atomic_valid(vma, bit)) + set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags)); +} + +/* + * Test for VMA flag atomically. Requires no locks. Only specific valid flags + * are allowed to do this. + * + * This is necessarily racey, so callers must ensure that serialisation is + * achieved through some other means, or that races are permissible. + */ +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit) +{ + if (__vma_flag_atomic_valid(vma, bit)) + return test_bit(bit, &vma->vm_flags); + + return false; +} + static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; -- cgit v1.2.3 From 64212ba02e66e705cabce188453ba4e61e9d7325 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:46 +0000 Subject: mm: implement sticky VMA flags It is useful to be able to designate that certain flags are 'sticky', that is, if two VMAs are merged one with a flag of this nature and one without, the merged VMA sets this flag. As a result we ignore these flags for the purposes of determining VMA flag differences between VMAs being considered for merge. This patch therefore updates the VMA merge logic to perform this action, with flags possessing this property being described in the VM_STICKY bitmap. Those flags which ought to be ignored for the purposes of VMA merge are described in the VM_IGNORE_MERGE bitmap, which the VMA merge logic is also updated to use. As part of this change we place VM_SOFTDIRTY in VM_IGNORE_MERGE as it already had this behaviour, alongside VM_STICKY as sticky flags by implication must not disallow merge. Ultimately it seems that we should make VM_SOFTDIRTY a sticky flag in its own right, but this change is out of scope for this series. 
The only sticky flag designated as such is VM_MAYBE_GUARD, so as a result of this change, once the VMA flag is set upon guard region installation, VMAs with guard ranges will now not have their merge behaviour impacted as a result and can be freely merged with other VMAs without VM_MAYBE_GUARD set. Also update the comments for vma_modify_flags() to directly reference sticky flags now we have established the concept. We also update the VMA userland tests to account for the changes. Link: https://lkml.kernel.org/r/22ad5269f7669d62afb42ce0c79bad70b994c58d.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand (Red Hat) Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 03776aab3837..fea113d1d723 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -527,6 +527,34 @@ extern unsigned int kobjsize(const void *objp); #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that + * mapped page tables may contain metadata not described by the + * VMA and thus any merged VMA may also contain this metadata, + * and thus we must make this flag sticky. + */ +#define VM_STICKY VM_MAYBE_GUARD + +/* + * VMA flags we ignore for the purposes of merge, i.e. 
one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but + * dirty bit -- the caller should mark merged VMA as dirty. If + * dirty bit won't be excluded from comparison, we increase + * pressure on the memory system forcing the kernel to generate + * new VMAs when old one could be extended instead. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. -- cgit v1.2.3 From ab04b530e7e8bd5cf9fb0c1ad20e0deee8f569ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 18 Nov 2025 10:17:47 +0000 Subject: mm: introduce copy-on-fork VMAs and make VM_MAYBE_GUARD one Gather all the VMA flags whose presence implies that page tables must be copied on fork into a single bitmap - VM_COPY_ON_FORK - and use this rather than specifying individual flags in vma_needs_copy(). We also add VM_MAYBE_GUARD to this list, as it being set on a VMA implies that there may be metadata contained in the page tables (that is - guard markers) which otherwise will not and cannot be propagated upon fork. This was already being done manually previously in vma_needs_copy(), but this makes it very explicit, alongside VM_PFNMAP, VM_MIXEDMAP and VM_UFFD_WP all of which imply the same. Note that VM_STICKY flags ought generally to be marked VM_COPY_ON_FORK too - because equally a flag being VM_STICKY indicates that the VMA contains metadata that is not propagated by being faulted in - i.e. that the VMA metadata does not fully describe the VMA alone, and thus we must propagate whatever metadata there is on a fork. However, for maximum flexibility, we do not make this necessarily the case here. 
Link: https://lkml.kernel.org/r/5d41b24e7bc622cda0af92b6d558d7f4c0d1bc8c.1763460113.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Reviewed-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Cc: Andrei Vagin Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Jonathan Corbet Cc: Lance Yang Cc: Liam Howlett Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Michal Hocko Cc: Mike Rapoport Cc: Nico Pache Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fea113d1d723..af2904aeb163 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -555,6 +555,32 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. 
+ */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. -- cgit v1.2.3 From 05be0287955970b043a0742e85b6c285dea4f286 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 7 Nov 2025 17:55:36 +0800 Subject: mm: remove unnecessary __GFP_HIGHMEM in __p*d_alloc_one_*() __{pgd,p4d,pud,pmd,pte}_alloc_one_*() always allocate pages with GFP flag GFP_PGTABLE_KERNEL/GFP_PGTABLE_USER. These two macros are defined as follows: #define GFP_PGTABLE_KERNEL (GFP_KERNEL | __GFP_ZERO) #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) There is no __GFP_HIGHMEM in them, so we needn't clear __GFP_HIGHMEM explicitly. Link: https://lkml.kernel.org/r/20251109021817.346181-1-chenhuacai@loongson.cn Link: https://lkml.kernel.org/r/20251107095536.3101371-1-chenhuacai@loongson.cn Signed-off-by: Huacai Chen Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Anshuman Khandual Reviewed-by: Kevin Brodsky Cc: Arnd Bergmann Cc: Jan Kara Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index b9d2a7c79b93..57137d3ac159 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -18,8 +18,7 @@ */ static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm) { - struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & - ~__GFP_HIGHMEM, 0); + struct ptdesc *ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL, 0); if (!ptdesc) return NULL; @@ -178,7 +177,6 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) @@ -236,7 +234,6 @@ static inline p4d_t 
*__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, 0); if (!ptdesc) @@ -284,7 +281,6 @@ static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - gfp &= ~__GFP_HIGHMEM; ptdesc = pagetable_alloc_noprof(gfp, order); if (!ptdesc) -- cgit v1.2.3 From bc8e51c05ad50a5a0b02114d3cc94d151a332595 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 7 Nov 2025 15:40:41 -0800 Subject: mm: memcg: dump memcg protection info on oom or alloc failures Currently kernel dumps memory state on oom and allocation failures. One of the questions usually raised on those dumps is why the kernel has not reclaimed the reclaimable memory instead of triggering oom. One potential reason is the usage of memory protection provided by memcg. So, let's also dump the memory protected by the memcg in such reports to ease the debugging. Link: https://lkml.kernel.org/r/20251107234041.3632644-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: David Rientjes Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8c0f15e5978f..966f7c1a0128 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1764,6 +1764,7 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1830,6 +1831,10 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) { return true; } + +static inline void 
mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) +{ +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From 2197bb60f89077603cc580ff752c5cf6388c1099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Nov 2025 20:32:01 +0000 Subject: mm: add vma_start_write_killable() Patch series "vma_start_write_killable", v2. When we added the VMA lock, we made a major oversight in not adding a killable variant. That can run us into trouble where a thread takes the VMA lock for read (eg handling a page fault) and then goes out to lunch for an hour (eg doing reclaim). Another thread tries to modify the VMA, taking the mmap_lock for write, then attempts to lock the VMA for write. That blocks on the first thread, and ensures that every other page fault now tries to take the mmap_lock for read. Because everything's in an uninterruptible sleep, we can't kill the task, which makes me angry. This patchset just adds vma_start_write_killable() and converts one caller to use it. Most users are somewhat tricky to convert, so expect follow-up individual patches per call-site which need careful analysis to make sure we've done proper cleanup. This patch (of 2): The vma can be held read-locked for a substantial period of time, eg if memory allocation needs to go into reclaim. It's useful to be able to send fatal signals to threads which are waiting for the write lock. Link: https://lkml.kernel.org/r/20251110203204.1454057-1-willy@infradead.org Link: https://lkml.kernel.org/r/20251110203204.1454057-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Chris Li Cc: Jann Horn Cc: Matthew Wilcox (Oracle) Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e05da70dc0cb..d53f72dba7fe 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -195,7 +195,8 @@ static inline bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned in return (vma->vm_lock_seq == *mm_lock_seq); } -void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); +int __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq, + int state); /* * Begin writing to a VMA. @@ -209,7 +210,30 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - __vma_start_write(vma, mm_lock_seq); + __vma_start_write(vma, mm_lock_seq, TASK_UNINTERRUPTIBLE); +} + +/** + * vma_start_write_killable - Begin writing to a VMA. + * @vma: The VMA we are going to modify. + * + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + * + * Context: May sleep while waiting for readers to drop the vma read lock. + * Caller must already hold the mmap_lock for write. + * + * Return: 0 for a successful acquisition. -EINTR if a fatal signal was + * received. 
+ */ +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + unsigned int mm_lock_seq; + + if (__is_vma_write_locked(vma, &mm_lock_seq)) + return 0; + return __vma_start_write(vma, mm_lock_seq, TASK_KILLABLE); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -283,6 +307,8 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) { return 0; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_assert_attached(struct vm_area_struct *vma) {} -- cgit v1.2.3 From 8b02baf37311754518dfe78073583db03fbb0c07 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:04 -0800 Subject: mm/damon: rename damos core filter helpers to have word core Patch series "mm/damon: misc cleanups". Yet another batch of misc cleanups and refactoring for DAMON code, tests, and documents. First two patches (1 and 2) rename DAMOS core filters related code for readability. Three following patches (3-5) refactor page table walk callback functions in DAMON, as suggested by Hugh and David, and I promised. Next two patches (6 and 7) refactor DAMON core layer kunit test and sysfs interface selftest to be simple and deduplicated. Final two patches (8 and 9) fix up sphinx and grammatical errors on documents. This patch (of 9): DAMOS filters handled by the core layer are called core filters, while those handled by the ops layer are called ops filters. They share the same type but are managed in different places since core filters are evaluated before the ops filters. 
They also have different helper functions that depend on their managed places. The helper functions for ops filters have '_ops_' keyword on their name, so it is easy to know they are for ops filters. Meanwhile, the helper functions for core filters are not having the 'core' keyword on their name. This makes it easy to be mistakenly used for ops filters. Actually there was such a bug. To avoid future mistakes from similar confusions, rename DAMOS core filters helper functions to have a keyword 'core' on their names. Link: https://lkml.kernel.org/r/20251112154114.66053-1-sj@kernel.org Link: https://lkml.kernel.org/r/20251112154114.66053-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index f3566b978cdf..6e3db165fe60 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -871,10 +871,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damos_for_each_quota_goal_safe(goal, next, quota) \ list_for_each_entry_safe(goal, next, &(quota)->goals, list) -#define damos_for_each_filter(f, scheme) \ +#define damos_for_each_core_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->filters, list) -#define damos_for_each_filter_safe(f, next, scheme) \ +#define damos_for_each_core_filter_safe(f, next, scheme) \ list_for_each_entry_safe(f, next, &(scheme)->filters, list) #define damos_for_each_ops_filter(f, scheme) \ -- cgit v1.2.3 From 53298afe456e62ad2c2dc8bc7aa54bb86a67ba2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 12 Nov 2025 07:41:05 -0800 
Subject: mm/damon: rename damos->filters to damos->core_filters DAMOS filters that are handled by the ops layer are linked to damos->ops_filters. Owing to the ops_ prefix on the name, it is easy to understand it is for ops layer handled filters. The other types of filters, which are handled by the core layer, are linked to damos->filters. Because of the name, it is easy to confuse the list is there for not only core layer handled ones but all filters. Avoid such confusions by renaming the field to core_filters. Link: https://lkml.kernel.org/r/20251112154114.66053-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Bill Wendling Cc: Brendan Higgins Cc: David Gow Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Justin Stitt Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/damon.h b/include/linux/damon.h index 6e3db165fe60..3813373a9200 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -492,7 +492,7 @@ struct damos_migrate_dests { * @wmarks: Watermarks for automated (in)activation of this scheme. * @migrate_dests: Destination nodes if @action is "migrate_{hot,cold}". * @target_nid: Destination node if @action is "migrate_{hot,cold}". - * @filters: Additional set of &struct damos_filter for &action. + * @core_filters: Additional set of &struct damos_filter for &action. * @ops_filters: ops layer handling &struct damos_filter objects list. * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. 
@@ -518,7 +518,7 @@ struct damos_migrate_dests { * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect - * &filters + * &core_filters * * The minimum entity that @action can be applied depends on the underlying * &struct damon_operations. Since it may not be aligned with the core layer @@ -562,7 +562,7 @@ struct damos { struct damos_migrate_dests migrate_dests; }; }; - struct list_head filters; + struct list_head core_filters; struct list_head ops_filters; void *last_applied; struct damos_stat stat; @@ -872,10 +872,10 @@ static inline unsigned long damon_sz_region(struct damon_region *r) list_for_each_entry_safe(goal, next, &(quota)->goals, list) #define damos_for_each_core_filter(f, scheme) \ - list_for_each_entry(f, &(scheme)->filters, list) + list_for_each_entry(f, &(scheme)->core_filters, list) #define damos_for_each_core_filter_safe(f, next, scheme) \ - list_for_each_entry_safe(f, next, &(scheme)->filters, list) + list_for_each_entry_safe(f, next, &(scheme)->core_filters, list) #define damos_for_each_ops_filter(f, scheme) \ list_for_each_entry(f, &(scheme)->ops_filters, list) -- cgit v1.2.3 From 6707915e030a3258868355f989b80140c1a45bbe Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 17 Nov 2025 17:33:38 +0000 Subject: mm: propagate VM_SOFTDIRTY on merge Patch series "make VM_SOFTDIRTY a sticky VMA flag", v2. Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. Now we have the concept of making VMA flags 'sticky', that is that they both don't prevent merge and, importantly, are propagated to merged VMAs, this seems a sensible alternative to the existing special-casing of VM_SOFTDIRTY. 
We additionally add a self-test that demonstrates that this logic behaves as expected. This patch (of 2): Currently we set VM_SOFTDIRTY when a new mapping is set up (whether by establishing a new VMA, or via merge) as implemented in __mmap_complete() and do_brk_flags(). However, when performing a merge of existing mappings such as when performing mprotect(), we may lose the VM_SOFTDIRTY flag. This is because currently we simply ignore VM_SOFTDIRTY for the purposes of merge, so one VMA may possess the flag and another not, and whichever happens to be the target VMA will be the one upon which the merge is performed which may or may not have VM_SOFTDIRTY set. Now we have the concept of 'sticky' VMA flags, let's make VM_SOFTDIRTY one which solves this issue. Additionally update VMA userland tests to propagate changes. [akpm@linux-foundation.org: update comments, per Lorenzo] Link: https://lkml.kernel.org/r/0019e0b8-ee1e-4359-b5ee-94225cbe5588@lucifer.local Link: https://lkml.kernel.org/r/cover.1763399675.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/955478b5170715c895d1ef3b7f68e0cd77f76868.1763399675.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Vlastimil Babka Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Pedro Falcato Acked-by: Andrey Vagin Reviewed-by: Vlastimil Babka Acked-by: Cyrill Gorcunov Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index af2904aeb163..bf660d5b6e97 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,28 +532,27 @@ extern unsigned int kobjsize(const void *objp); * possesses it but the other does not, the merged VMA should nonetheless have * applied to it: * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references 
cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + * * VM_MAYBE_GUARD - If a VMA may have guard regions in place it implies that * mapped page tables may contain metadata not described by the * VMA and thus any merged VMA may also contain this metadata, * and thus we must make this flag sticky. */ -#define VM_STICKY VM_MAYBE_GUARD +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) /* * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one * of these flags and the other not does not preclude a merge. * - * VM_SOFTDIRTY - Should not prevent from VMA merging, if we match the flags but - * dirty bit -- the caller should mark merged VMA as dirty. If - * dirty bit won't be excluded from comparison, we increase - * pressure on the memory system forcing the kernel to generate - * new VMAs when old one could be extended instead. - * * VM_STICKY - When merging VMAs, VMA flags must match, unless they are * 'sticky'. If any sticky flags exist in either VMA, we simply * set all of them on the merged VMA. */ -#define VM_IGNORE_MERGE (VM_SOFTDIRTY | VM_STICKY) +#define VM_IGNORE_MERGE VM_STICKY /* * Flags which should result in page tables being copied on fork. These are -- cgit v1.2.3 From 4015b979767125cf8a2233a145a3b3af78bfd8fb Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 12 Nov 2025 15:53:34 +0800 Subject: Bluetooth: btusb: mediatek: Fix kernel crash when releasing mtk iso interface When performing reset tests and encountering abnormal card drop issues that lead to a kernel crash, it is necessary to perform a null check before releasing resources to avoid attempting to release a null pointer. 
<4>[ 29.158070] Hardware name: Google Quigon sku196612/196613 board (DT) <4>[ 29.158076] Workqueue: hci0 hci_cmd_sync_work [bluetooth] <4>[ 29.158154] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) <4>[ 29.158162] pc : klist_remove+0x90/0x158 <4>[ 29.158174] lr : klist_remove+0x88/0x158 <4>[ 29.158180] sp : ffffffc0846b3c00 <4>[ 29.158185] pmr_save: 000000e0 <4>[ 29.158188] x29: ffffffc0846b3c30 x28: ffffff80cd31f880 x27: ffffff80c1bdc058 <4>[ 29.158199] x26: dead000000000100 x25: ffffffdbdc624ea3 x24: ffffff80c1bdc4c0 <4>[ 29.158209] x23: ffffffdbdc62a3e6 x22: ffffff80c6c07000 x21: ffffffdbdc829290 <4>[ 29.158219] x20: 0000000000000000 x19: ffffff80cd3e0648 x18: 000000031ec97781 <4>[ 29.158229] x17: ffffff80c1bdc4a8 x16: ffffffdc10576548 x15: ffffff80c1180428 <4>[ 29.158238] x14: 0000000000000000 x13: 000000000000e380 x12: 0000000000000018 <4>[ 29.158248] x11: ffffff80c2a7fd10 x10: 0000000000000000 x9 : 0000000100000000 <4>[ 29.158257] x8 : 0000000000000000 x7 : 7f7f7f7f7f7f7f7f x6 : 2d7223ff6364626d <4>[ 29.158266] x5 : 0000008000000000 x4 : 0000000000000020 x3 : 2e7325006465636e <4>[ 29.158275] x2 : ffffffdc11afeff8 x1 : 0000000000000000 x0 : ffffffdc11be4d0c <4>[ 29.158285] Call trace: <4>[ 29.158290] klist_remove+0x90/0x158 <4>[ 29.158298] device_release_driver_internal+0x20c/0x268 <4>[ 29.158308] device_release_driver+0x1c/0x30 <4>[ 29.158316] usb_driver_release_interface+0x70/0x88 <4>[ 29.158325] btusb_mtk_release_iso_intf+0x68/0xd8 [btusb (HASH:e8b6 5)] <4>[ 29.158347] btusb_mtk_reset+0x5c/0x480 [btusb (HASH:e8b6 5)] <4>[ 29.158361] hci_cmd_sync_work+0x10c/0x188 [bluetooth (HASH:a4fa 6)] <4>[ 29.158430] process_scheduled_works+0x258/0x4e8 <4>[ 29.158441] worker_thread+0x300/0x428 <4>[ 29.158448] kthread+0x108/0x1d0 <4>[ 29.158455] ret_from_fork+0x10/0x20 <0>[ 29.158467] Code: 91343000 940139d1 f9400268 927ff914 (f9401297) <4>[ 29.158474] ---[ end trace 0000000000000000 ]--- <0>[ 29.167129] Kernel panic - not syncing: Oops: Fatal exception 
<2>[ 29.167144] SMP: stopping secondary CPUs <4>[ 29.167158] ------------[ cut here ]------------ Fixes: ceac1cb0259d ("Bluetooth: btusb: mediatek: add ISO data transmission functions") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index b8100dbfe5d7..32b1c08c8bba 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -749,7 +749,6 @@ struct hci_conn { __u8 remote_cap; __u8 remote_auth; - __u8 remote_id; unsigned int sent; -- cgit v1.2.3 From 79a2d4678ba90bdba577dc3af88cc900d6dcd5ee Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Sat, 15 Nov 2025 18:43:55 +0200 Subject: Bluetooth: hci_core: lookup hci_conn on RX path on protocol side The hdev lock/lookup/unlock/use pattern in the packet RX path doesn't ensure hci_conn* is not concurrently modified/deleted. This locking appears to be leftover from before conn_hash started using RCU commit bf4c63252490b ("Bluetooth: convert conn hash to RCU") and not clear if it had purpose since then. Currently, there are code paths that delete hci_conn* from elsewhere than the ordered hdev->workqueue where the RX work runs in. E.g. commit 5af1f84ed13a ("Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync") introduced some of these, and there probably were a few others before it. It's better to do the locking so that even if these run concurrently no UAF is possible. Move the lookup of hci_conn and associated socket-specific conn to protocol recv handlers, and do them within a single critical section to cover hci_conn* usage and lookup. 
syzkaller has reported a crash that appears to be this issue: [Task hdev->workqueue] [Task 2] hci_disconnect_all_sync l2cap_recv_acldata(hcon) hci_conn_get(hcon) hci_abort_conn_sync(hcon) hci_dev_lock hci_dev_lock hci_conn_del(hcon) v-------------------------------- hci_dev_unlock hci_conn_put(hcon) conn = hcon->l2cap_data (UAF) Fixes: 5af1f84ed13a ("Bluetooth: hci_sync: Fix UAF on hci_abort_conn_sync") Reported-by: syzbot+d32d77220b92eddd89ad@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d32d77220b92eddd89ad Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 32b1c08c8bba..0cb87687837f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -856,11 +856,12 @@ extern struct mutex hci_cb_list_lock; /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); int l2cap_disconn_ind(struct hci_conn *hcon); -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb); +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb); #else static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) @@ -868,23 +869,30 @@ static inline int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, return 0; } -static inline void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +static inline int sco_recv_scodata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb) { + kfree_skb(skb); + return 
-ENOENT; } #endif #if IS_ENABLED(CONFIG_BT_LE) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags); -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags); +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, + u16 flags); #else static inline int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { return 0; } -static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, - u16 flags) + +static inline int iso_recv(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + kfree_skb(skb); + return -ENOENT; } #endif -- cgit v1.2.3 From 760fc597c33d5a727507c8bb19d6ab87a8c5885b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Oct 2025 12:44:18 +0100 Subject: panic: sys_info: align constant definition names with parameters Align constant definition names with parameters to make it easier to map. It's also better to maintain and extend the names while keeping their uniqueness. Link: https://lkml.kernel.org/r/20251030132007.3742368-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Feng Tang Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton --- include/linux/sys_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sys_info.h b/include/linux/sys_info.h index 89d77dc4f2ed..a5bc3ea3d44b 100644 --- a/include/linux/sys_info.h +++ b/include/linux/sys_info.h @@ -14,7 +14,7 @@ #define SYS_INFO_LOCKS 0x00000008 #define SYS_INFO_FTRACE 0x00000010 #define SYS_INFO_PANIC_CONSOLE_REPLAY 0x00000020 -#define SYS_INFO_ALL_CPU_BT 0x00000040 +#define SYS_INFO_ALL_BT 0x00000040 #define SYS_INFO_BLOCKED_TASKS 0x00000080 void sys_info(unsigned long si_mask); -- cgit v1.2.3 From bd97c976419126ee3e9acd4957f6f16a90316643 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 4 Nov 2025 19:38:34 +0100 Subject: util_macros.h: fix kernel-doc for u64_to_user_ptr() The added documentation to 
u64_to_user_ptr() misspelled the function name. Fix it. Link: https://lkml.kernel.org/r/20251104183834.1046584-1-andriy.shevchenko@linux.intel.com Fixes: 029c896c4105 ("kernel.h: move PTR_IF() and u64_to_user_ptr() to util_macros.h") Signed-off-by: Andy Shevchenko Cc: Alexandru Ardelean Signed-off-by: Andrew Morton --- include/linux/util_macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/util_macros.h b/include/linux/util_macros.h index 9373962aade9..2eb528058d0d 100644 --- a/include/linux/util_macros.h +++ b/include/linux/util_macros.h @@ -136,10 +136,10 @@ #define PTR_IF(cond, ptr) ((cond) ? (ptr) : NULL) /** - * to_user_ptr - cast a pointer passed as u64 from user space to void __user * + * u64_to_user_ptr - cast a pointer passed as u64 from user space to void __user * * @x: The u64 value from user space, usually via IOCTL * - * to_user_ptr() simply casts a pointer passed as u64 from user space to void + * u64_to_user_ptr() simply casts a pointer passed as u64 from user space to void * __user * correctly. Using this lets us get rid of all the tiresome casts. */ #define u64_to_user_ptr(x) \ -- cgit v1.2.3 From 6480241f31f543333ed0c7a209962412461f6e41 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:30 +0000 Subject: lib: add mul_u64_add_u64_div_u64() and mul_u64_u64_div_u64_roundup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing mul_u64_u64_div_u64() rounds down, a 'rounding up' variant needs 'divisor - 1' adding in between the multiply and divide so cannot easily be done by a caller. Add mul_u64_add_u64_div_u64(a, b, c, d) that calculates (a * b + c)/d and implement the 'round down' and 'round up' using it. Update the x86-64 asm to optimise for 'c' being a constant zero. Add kerndoc definitions for all three functions. 
Link: https://lkml.kernel.org/r/20251105201035.64043-5-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- include/linux/math64.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/math64.h b/include/linux/math64.h index 6aaccc1626ab..e889d850b7f1 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -282,7 +282,53 @@ static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) } #endif /* mul_u64_u32_div */ -u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); +/** + * mul_u64_add_u64_div_u64 - unsigned 64bit multiply, add, and divide + * @a: first unsigned 64bit multiplicand + * @b: second unsigned 64bit multiplicand + * @c: unsigned 64bit addend + * @d: unsigned 64bit divisor + * + * Multiply two 64bit values together to generate a 128bit product + * add a third value and then divide by a fourth. + * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. + * Architecture specific code may trap on zero or overflow. + * + * Return: (@a * @b + @c) / @d + */ +u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d); + +/** + * mul_u64_u64_div_u64 - unsigned 64bit multiply and divide + * @a: first unsigned 64bit multiplicand + * @b: second unsigned 64bit multiplicand + * @d: unsigned 64bit divisor + * + * Multiply two 64bit values together to generate a 128bit product + * and then divide by a third value. + * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. + * Architecture specific code may trap on zero or overflow. 
+ * + * Return: @a * @b / @d + */ +#define mul_u64_u64_div_u64(a, b, d) mul_u64_add_u64_div_u64(a, b, 0, d) + +/** + * mul_u64_u64_div_u64_roundup - unsigned 64bit multiply and divide rounded up + * @a: first unsigned 64bit multiplicand + * @b: second unsigned 64bit multiplicand + * @d: unsigned 64bit divisor + * + * Multiply two 64bit values together to generate a 128bit product + * and then divide and round up. + * The Generic code divides by 0 if @d is zero and returns ~0 on overflow. + * Architecture specific code may trap on zero or overflow. + * + * Return: (@a * @b + @d - 1) / @d + */ +#define mul_u64_u64_div_u64_roundup(a, b, d) \ + ({ u64 _tmp = (d); mul_u64_add_u64_div_u64(a, b, _tmp - 1, _tmp); }) + /** * DIV64_U64_ROUND_UP - unsigned 64bit divide with 64bit divisor rounded up -- cgit v1.2.3 From 630f96a687def5616d6fa7f069adcea158320909 Mon Sep 17 00:00:00 2001 From: David Laight Date: Wed, 5 Nov 2025 20:10:33 +0000 Subject: lib: mul_u64_u64_div_u64(): optimise multiply on 32bit x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a + u32_b). As well as the extra instructions it can generate a lot of spills to stack (including spills of constant zeros and even multiplies by constant zero). mul_u32_u32() already exists to optimise the multiply. Add a similar add_u64_32() for the addition. Disable both for clang - it generates better code without them. Move the 64x64 => 128 multiply into a static inline helper function for code clarity. No need for the a/b_hi/lo variables, the implicit casts on the function calls do the work for us. Should have minimal effect on the generated code. Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply in mul_u64_add_u64_div_u64(). Link: https://lkml.kernel.org/r/20251105201035.64043-8-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Nicolas Pitre Cc: Biju Das Cc: Borislav Betkov Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Jens Axboe Cc: Li RongQing Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleinxer Cc: Uwe Kleine-König Signed-off-by: Andrew Morton --- include/linux/math64.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/math64.h b/include/linux/math64.h index e889d850b7f1..cc305206d89f 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -158,6 +158,17 @@ static inline u64 mul_u32_u32(u32 a, u32 b) } #endif +#ifndef add_u64_u32 +/* + * Many a GCC version also messes this up. + * Zero extending b and then spilling everything to stack. + */ +static inline u64 add_u64_u32(u64 a, u32 b) +{ + return a + b; +} +#endif + #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr -- cgit v1.2.3 From f3fb126fdc9e148da38a6e25d7fc609774a99fc3 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 6 Nov 2025 16:20:51 +0100 Subject: math.h: amend abs() kernel-doc and add a note about signed type limits - amend the kernel-doc so the description is decoupled from the parameter descriptions. - add a note to explain behaviour for the signed types when supplied value is the minimum (e.g., INT_MIN for int type). Link: https://lkml.kernel.org/r/20251106152051.2361551-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Jonathan Cameron Signed-off-by: Andrew Morton --- include/linux/math.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/math.h b/include/linux/math.h index 0198c92cbe3e..6dc1d1d32fbc 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -148,11 +148,16 @@ __STRUCT_FRACT(u32) /** * abs - return absolute value of an argument - * @x: the value. If it is unsigned type, it is converted to signed type first. - * char is treated as if it was signed (regardless of whether it really is) - * but the macro's return type is preserved as char. + * @x: the value. 
* - * Return: an absolute value of x. + * If it is unsigned type, @x is converted to signed type first. + * char is treated as if it was signed (regardless of whether it really is) + * but the macro's return type is preserved as char. + * + * NOTE, for signed type if @x is the minimum, the returned result is undefined + * as there is not enough bits to represent it as a positive number. + * + * Return: an absolute value of @x. */ #define abs(x) __abs_choose_expr(x, long long, \ __abs_choose_expr(x, long, \ -- cgit v1.2.3 From 242b872239f6a7deacbc20ab9406ea40cb738ec6 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Sun, 9 Nov 2025 16:37:15 +0800 Subject: include/linux/once_lite.h: fix judgment in WARN_ONCE with clang For c code: ```c extern int xx; void test(void) { if (WARN_ONCE(xx, "x")) __asm__ volatile ("nop":::); } ``` Clang will generate the following assembly code: ```assemble test: movl xx(%rip), %eax // Assume xx == 0 (likely case) testl %eax, %eax // judge once je .LBB0_3 // jump to .LBB0_3 testb $1, test.__already_done(%rip) je .LBB0_2 .LBB0_3: testl %eax, %eax // judge again je .LBB0_5 // jump to .LBB0_5 .LBB0_4: nop .LBB0_5: retq // omit ``` In the above code, `xx == 0` should be a likely case, but in this case, xx has been judged twice. Test info: 1. kernel source: linux-next commit 9c0826a5d9aa4d52206d ("Add linux-next specific files for 20251107") 2. compiler: clang: Debian clang version 21.1.4 (8) with Debian LLD 21.1.4 (compatible with GNU linkers) 3. config: base on default x86_64_defconfig, and setting: CONFIG_MITIGATION_RETHUNK=n CONFIG_STACKPROTECTOR=n Add unlikely to __ret_cond to help the compiler optimize correctly. 
[akpm@linux-foundation.org: undo whitespace changes] Link: https://lkml.kernel.org/r/20251109083715.24495-1-qq570070308@gmail.com Signed-off-by: Xie Yuanbin Cc: Bill Wendling Cc: Jan Kara Cc: Justin Stitt Cc: Maninder Singh Cc: Masahiro Yamada Cc: Nathan Chancellor Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/once_lite.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/once_lite.h b/include/linux/once_lite.h index 27de7bc32a06..236592c4eeb1 100644 --- a/include/linux/once_lite.h +++ b/include/linux/once_lite.h @@ -16,7 +16,7 @@ bool __ret_cond = !!(condition); \ bool __ret_once = false; \ \ - if (unlikely(__ret_cond && !__already_done)) { \ + if (unlikely(__ret_cond) && unlikely(!__already_done)) {\ __already_done = true; \ __ret_once = true; \ } \ -- cgit v1.2.3 From f1e2ca801c54dfc09d6a5540207cec25e8d43f6f Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Fri, 14 Nov 2025 14:00:45 +0800 Subject: lib/base64: add support for multiple variants Patch series " lib/base64: add generic encoder/decoder, migrate users", v5. This series introduces a generic Base64 encoder/decoder to the kernel library, eliminating duplicated implementations and delivering significant performance improvements. The Base64 API has been extended to support multiple variants (Standard, URL-safe, and IMAP) as defined in RFC 4648 and RFC 3501. The API now takes a variant parameter and an option to control padding. As part of this series, users are migrated to the new interface while preserving their specific formats: fscrypt now uses BASE64_URLSAFE, Ceph uses BASE64_IMAP, and NVMe is updated to BASE64_STD. On the encoder side, the implementation processes input in 3-byte blocks, mapping 24 bits directly to 4 output symbols. This avoids bit-by-bit streaming and reduces loop overhead, achieving about a 2.7x speedup compared to previous implementations. 
On the decoder side, replace strchr() lookups with per-variant reverse tables and process input in 4-character groups. Each group is mapped to numeric values and combined into 3 bytes. Padded and unpadded forms are validated explicitly, rejecting invalid '=' usage and enforcing tail rules. This improves throughput by ~43-52x. This patch (of 6): Extend the base64 API to support multiple variants (standard, URL-safe, and IMAP) as defined in RFC 4648 and RFC 3501. The API now takes a variant parameter and an option to control padding. Update NVMe auth code to use the new interface with BASE64_STD. Link: https://lkml.kernel.org/r/20251114055829.87814-1-409411716@gms.tku.edu.tw Link: https://lkml.kernel.org/r/20251114060045.88792-1-409411716@gms.tku.edu.tw Signed-off-by: Kuan-Wei Chiu Co-developed-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw> Reviewed-by: David Laight Cc: Christoph Hellwig Cc: Eric Biggers Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jens Axboe Cc: Keith Busch Cc: Sagi Grimberg Cc: "Theodore Y. 
Ts'o" Cc: Viacheslav Dubeyko Cc: Xiubo Li Cc: Yu-Sheng Huang Signed-off-by: Andrew Morton --- include/linux/base64.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/base64.h b/include/linux/base64.h index 660d4cb1ef31..a2c6c9222da3 100644 --- a/include/linux/base64.h +++ b/include/linux/base64.h @@ -8,9 +8,15 @@ #include +enum base64_variant { + BASE64_STD, /* RFC 4648 (standard) */ + BASE64_URLSAFE, /* RFC 4648 (base64url) */ + BASE64_IMAP, /* RFC 3501 */ +}; + #define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) -int base64_encode(const u8 *src, int len, char *dst); -int base64_decode(const char *src, int len, u8 *dst); +int base64_encode(const u8 *src, int len, char *dst, bool padding, enum base64_variant variant); +int base64_decode(const char *src, int len, u8 *dst, bool padding, enum base64_variant variant); #endif /* _LINUX_BASE64_H */ -- cgit v1.2.3 From 9031b852c97f1db52180878aed66ca08946eca93 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 18 Nov 2025 17:32:50 +0000 Subject: uaccess: gate _copy_[to|from]_user on !INLINE_COPY_FROM_USER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These methods only exist when INLINE_COPY_FROM_USER is disabled, so update the header file to reflect that. This fixes the following error on builds that enable both RUST and INLINE_COPY_FROM_USER. ERROR: modpost: "_copy_from_user" [samples/rust/rust_misc_device.ko] undefined! ERROR: modpost: "_copy_to_user" [samples/rust/rust_misc_device.ko] undefined! This error is triggered because when a method is available both as a rust_helper_* and normal method, Rust will call the normal method. 
[akpm@linux-foundation.org: s/INLINE_COPY_FROM_USER/INLINE_COPY_TO_USER/, per Alice] Link: https://lkml.kernel.org/r/20251118173250.2821388-1-aliceryhl@google.com Fixes: d99dc586ca7c ("uaccess: decouple INLINE_COPY_FROM_USER and CONFIG_RUST") Signed-off-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Arnd Bergmann Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: John Hubbard Cc: Miguel Ojeda Cc: Trevor Gross Cc: Yury Norov (NVIDIA) Signed-off-by: Andrew Morton --- include/linux/uaccess.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 01cbd7dd0ba3..5594012160da 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -181,8 +181,10 @@ fail: memset(to + (n - res), 0, res); return res; } +#ifndef INLINE_COPY_FROM_USER extern __must_check unsigned long _copy_from_user(void *, const void __user *, unsigned long); +#endif static inline __must_check unsigned long _inline_copy_to_user(void __user *to, const void *from, unsigned long n) @@ -196,8 +198,10 @@ _inline_copy_to_user(void __user *to, const void *from, unsigned long n) } return n; } +#ifndef INLINE_COPY_TO_USER extern __must_check unsigned long _copy_to_user(void __user *, const void *, unsigned long); +#endif static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) -- cgit v1.2.3 From e0940c672ab4228caa33bcd7cc0ad8017482c2f1 Mon Sep 17 00:00:00 2001 From: Nirbhay Sharma Date: Fri, 21 Nov 2025 02:16:21 +0530 Subject: bpf: Document cfi_stubs and owner fields in struct bpf_struct_ops Add missing kernel-doc documentation for the cfi_stubs and owner fields in struct bpf_struct_ops to fix the following warnings: Warning: include/linux/bpf.h:1931 struct member 'cfi_stubs' not described in 'bpf_struct_ops' Warning: include/linux/bpf.h:1931 struct member 'owner' not described in 'bpf_struct_ops' The cfi_stubs field was added in commit 
2cd3e3772e41 ("x86/cfi,bpf: Fix bpf_struct_ops CFI") to provide CFI stub functions for trampolines, and the owner field is used for module reference counting. Signed-off-by: Nirbhay Sharma Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20251120204620.59571-2-nirbhay.lkd@gmail.com --- include/linux/bpf.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 09d5dc541d1c..30fb40421405 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1922,12 +1922,14 @@ struct btf_member; * reason, if this callback is not defined, the check is skipped as * the struct_ops map will have final verification performed in * @reg. - * @type: BTF type. - * @value_type: Value type. + * @cfi_stubs: Pointer to a structure of stub functions for CFI. These stubs + * provide the correct Control Flow Integrity hashes for the + * trampolines generated by BPF struct_ops. + * @owner: The module that owns this struct_ops. Used for module reference + * counting to ensure the module providing the struct_ops cannot be + * unloaded while in use. * @name: The name of the struct bpf_struct_ops object. * @func_models: Func models - * @type_id: BTF type id. - * @value_id: BTF value id. */ struct bpf_struct_ops { const struct bpf_verifier_ops *verifier_ops; -- cgit v1.2.3 From 6d5dea68246ecb190a50a7fecbaf7f8c1ddb15e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Nov 2025 08:48:12 +0000 Subject: tcp: tcp_moderate_rcvbuf is only used in rx path sysctl_tcp_moderate_rcvbuf is only used from tcp_rcvbuf_grow(). Move it to netns_ipv4_read_rx group. Remove various CACHELINE_ASSERT_GROUP_SIZE() from netns_ipv4_struct_check(), as they have no real benefit but cause pain for all changes. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251119084813.3684576-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index de9d36acc8e2..11837d3ccc0a 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -74,11 +74,11 @@ struct netns_ipv4 { /* TXRX readonly hotpath cache lines */ __cacheline_group_begin(netns_ipv4_read_txrx); - u8 sysctl_tcp_moderate_rcvbuf; __cacheline_group_end(netns_ipv4_read_txrx); /* RX readonly hotpath cache line */ __cacheline_group_begin(netns_ipv4_read_rx); + u8 sysctl_tcp_moderate_rcvbuf; u8 sysctl_ip_early_demux; u8 sysctl_tcp_early_demux; u8 sysctl_tcp_l3mdev_accept; -- cgit v1.2.3 From ecfea98b7d0d56c5bf2df3fc02c5501afa5cef6f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Nov 2025 08:48:13 +0000 Subject: tcp: add net.ipv4.tcp_rcvbuf_low_rtt This is a follow up of commit aa251c84636c ("tcp: fix too slow tcp_rcvbuf_grow() action") which brought again the issue that I tried to fix in commit 65c5287892e9 ("tcp: fix sk_rcvbuf overshoot") We also recently increased tcp_rmem[2] to 32 MB in commit 572be9bf9d0d ("tcp: increase tcp_rmem[2] to 32 MB") Idea of this patch is to not let tcp_rcvbuf_grow() grow sk->sk_rcvbuf too fast for small RTT flows. If sk->sk_rcvbuf is too big, this can force NIC driver to not recycle pages from their page pool, and also can cause cache evictions for DDIO enabled cpus/NIC, as receivers are usually slower than senders. Add net.ipv4.tcp_rcvbuf_low_rtt sysctl, set by default to 1000 usec (1 ms). If RTT is smaller than the sysctl value, use the RTT/tcp_rcvbuf_low_rtt ratio to control sk_rcvbuf inflation. Tested: Pair of hosts with a 200Gbit IDPF NIC. Using netperf/netserver Client initiates 8 TCP bulk flows, asking netserver to use CPU #10 only. 
super_netperf 8 -H server -T,10 -l 30 On server, use perf -e tcp:tcp_rcvbuf_grow while test is running. Before: sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=1 perf record -a -e tcp:tcp_rcvbuf_grow sleep 30 ; perf script|tail -20|cut -c30-230 1153.051201: tcp:tcp_rcvbuf_grow: time=398 rtt_us=382 copied=6905856 inq=180224 space=6115328 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil 1153.138752: tcp:tcp_rcvbuf_grow: time=446 rtt_us=413 copied=5529600 inq=180224 space=4505600 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=21286912 famil 1153.361484: tcp:tcp_rcvbuf_grow: time=415 rtt_us=380 copied=7061504 inq=204800 space=6725632 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil 1153.457642: tcp:tcp_rcvbuf_grow: time=483 rtt_us=421 copied=5885952 inq=720896 space=4407296 ooo=0 scaling_ratio=240 rcvbuf=23763511 rcv_ssthresh=22223271 window_clamp=22278291 rcv_wnd=21430272 famil 1153.466002: tcp:tcp_rcvbuf_grow: time=308 rtt_us=281 copied=3244032 inq=180224 space=2883584 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41992059 window_clamp=42050919 rcv_wnd=41713664 famil 1153.747792: tcp:tcp_rcvbuf_grow: time=394 rtt_us=332 copied=4460544 inq=585728 space=3063808 ooo=0 scaling_ratio=240 rcvbuf=44854314 rcv_ssthresh=41992059 window_clamp=42050919 rcv_wnd=41373696 famil 1154.260747: tcp:tcp_rcvbuf_grow: time=652 rtt_us=226 copied=10977280 inq=737280 space=9486336 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29197743 window_clamp=29217691 rcv_wnd=28368896 fami 1154.375019: tcp:tcp_rcvbuf_grow: time=461 rtt_us=443 copied=7573504 inq=507904 space=6856704 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25288704 famil 1154.463072: tcp:tcp_rcvbuf_grow: time=494 rtt_us=408 copied=7983104 inq=200704 space=7065600 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 
window_clamp=25937095 rcv_wnd=25579520 famil 1154.474658: tcp:tcp_rcvbuf_grow: time=507 rtt_us=459 copied=5586944 inq=540672 space=4718592 ooo=0 scaling_ratio=240 rcvbuf=17852266 rcv_ssthresh=16692999 window_clamp=16736499 rcv_wnd=16056320 famil 1154.584657: tcp:tcp_rcvbuf_grow: time=494 rtt_us=427 copied=8126464 inq=204800 space=7782400 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25878235 window_clamp=25937095 rcv_wnd=25600000 famil 1154.702117: tcp:tcp_rcvbuf_grow: time=480 rtt_us=406 copied=5734400 inq=180224 space=5349376 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=21286912 famil 1155.941595: tcp:tcp_rcvbuf_grow: time=717 rtt_us=670 copied=11042816 inq=3784704 space=7159808 ooo=0 scaling_ratio=240 rcvbuf=19581357 rcv_ssthresh=18333222 window_clamp=18357522 rcv_wnd=14614528 fam 1156.384735: tcp:tcp_rcvbuf_grow: time=529 rtt_us=473 copied=9011200 inq=180224 space=7258112 ooo=0 scaling_ratio=240 rcvbuf=19581357 rcv_ssthresh=18333222 window_clamp=18357522 rcv_wnd=18018304 famil 1157.821676: tcp:tcp_rcvbuf_grow: time=529 rtt_us=272 copied=8224768 inq=602112 space=6545408 ooo=0 scaling_ratio=240 rcvbuf=67000000 rcv_ssthresh=62793576 window_clamp=62812500 rcv_wnd=62115840 famil 1158.906379: tcp:tcp_rcvbuf_grow: time=710 rtt_us=445 copied=11845632 inq=540672 space=10240000 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29205935 window_clamp=29217691 rcv_wnd=28536832 fam 1164.600160: tcp:tcp_rcvbuf_grow: time=841 rtt_us=430 copied=12976128 inq=1290240 space=11304960 ooo=0 scaling_ratio=240 rcvbuf=31165538 rcv_ssthresh=29212591 window_clamp=29217691 rcv_wnd=27856896 fa 1165.163572: tcp:tcp_rcvbuf_grow: time=845 rtt_us=800 copied=12632064 inq=540672 space=7921664 ooo=0 scaling_ratio=240 rcvbuf=27666235 rcv_ssthresh=25912795 window_clamp=25937095 rcv_wnd=25260032 fami 1165.653464: tcp:tcp_rcvbuf_grow: time=388 rtt_us=309 copied=4493312 inq=180224 space=3874816 ooo=0 scaling_ratio=240 rcvbuf=44854314 
rcv_ssthresh=41995899 window_clamp=42050919 rcv_wnd=41713664 famil 1166.651211: tcp:tcp_rcvbuf_grow: time=556 rtt_us=553 copied=6328320 inq=540672 space=5554176 ooo=0 scaling_ratio=240 rcvbuf=23068672 rcv_ssthresh=21571860 window_clamp=21626880 rcv_wnd=20946944 famil After: sysctl -w net.ipv4.tcp_rcvbuf_low_rtt=1000 perf record -a -e tcp:tcp_rcvbuf_grow sleep 30 ; perf script|tail -20|cut -c30-230 1457.053149: tcp:tcp_rcvbuf_grow: time=128 rtt_us=24 copied=1441792 inq=40960 space=1269760 ooo=0 scaling_ratio=240 rcvbuf=2960741 rcv_ssthresh=2605474 window_clamp=2775694 rcv_wnd=2568192 family=AF_I 1458.000778: tcp:tcp_rcvbuf_grow: time=128 rtt_us=31 copied=1441792 inq=24576 space=1400832 ooo=0 scaling_ratio=240 rcvbuf=3060163 rcv_ssthresh=2810042 window_clamp=2868902 rcv_wnd=2674688 family=AF_I 1458.088059: tcp:tcp_rcvbuf_grow: time=190 rtt_us=110 copied=3227648 inq=385024 space=2781184 ooo=0 scaling_ratio=240 rcvbuf=6728240 rcv_ssthresh=6252705 window_clamp=6307725 rcv_wnd=5799936 family=AF 1458.148549: tcp:tcp_rcvbuf_grow: time=232 rtt_us=129 copied=3956736 inq=237568 space=2842624 ooo=0 scaling_ratio=240 rcvbuf=6731333 rcv_ssthresh=6252705 window_clamp=6310624 rcv_wnd=5918720 family=AF 1458.466861: tcp:tcp_rcvbuf_grow: time=193 rtt_us=83 copied=2949120 inq=180224 space=2457600 ooo=0 scaling_ratio=240 rcvbuf=5751438 rcv_ssthresh=5357689 window_clamp=5391973 rcv_wnd=5054464 family=AF_ 1458.775476: tcp:tcp_rcvbuf_grow: time=257 rtt_us=127 copied=4304896 inq=352256 space=3346432 ooo=0 scaling_ratio=240 rcvbuf=8067131 rcv_ssthresh=7523275 window_clamp=7562935 rcv_wnd=7061504 family=AF 1458.776631: tcp:tcp_rcvbuf_grow: time=200 rtt_us=96 copied=3260416 inq=143360 space=2768896 ooo=0 scaling_ratio=240 rcvbuf=6397256 rcv_ssthresh=5938567 window_clamp=5997427 rcv_wnd=5828608 family=AF_ 1459.707973: tcp:tcp_rcvbuf_grow: time=215 rtt_us=96 copied=2506752 inq=163840 space=1388544 ooo=0 scaling_ratio=240 rcvbuf=3068867 rcv_ssthresh=2768282 window_clamp=2877062 rcv_wnd=2555904 
family=AF_ 1460.246494: tcp:tcp_rcvbuf_grow: time=231 rtt_us=80 copied=3756032 inq=204800 space=3117056 ooo=0 scaling_ratio=240 rcvbuf=7288091 rcv_ssthresh=6773725 window_clamp=6832585 rcv_wnd=6471680 family=AF_ 1460.714596: tcp:tcp_rcvbuf_grow: time=270 rtt_us=110 copied=4714496 inq=311296 space=3719168 ooo=0 scaling_ratio=240 rcvbuf=8957739 rcv_ssthresh=8339020 window_clamp=8397880 rcv_wnd=7933952 family=AF 1462.029977: tcp:tcp_rcvbuf_grow: time=101 rtt_us=19 copied=1105920 inq=40960 space=1036288 ooo=0 scaling_ratio=240 rcvbuf=2338970 rcv_ssthresh=2091684 window_clamp=2192784 rcv_wnd=1986560 family=AF_I 1462.802385: tcp:tcp_rcvbuf_grow: time=89 rtt_us=45 copied=1069056 inq=0 space=1064960 ooo=0 scaling_ratio=240 rcvbuf=2338970 rcv_ssthresh=2091684 window_clamp=2192784 rcv_wnd=2035712 family=AF_INET6 1462.918648: tcp:tcp_rcvbuf_grow: time=105 rtt_us=33 copied=1441792 inq=180224 space=1069056 ooo=0 scaling_ratio=240 rcvbuf=2383282 rcv_ssthresh=2091684 window_clamp=2234326 rcv_wnd=1896448 family=AF_ 1463.222533: tcp:tcp_rcvbuf_grow: time=273 rtt_us=144 copied=4603904 inq=385024 space=3469312 ooo=0 scaling_ratio=240 rcvbuf=8422564 rcv_ssthresh=7891053 window_clamp=7896153 rcv_wnd=7409664 family=AF 1466.519312: tcp:tcp_rcvbuf_grow: time=130 rtt_us=23 copied=1343488 inq=0 space=1261568 ooo=0 scaling_ratio=240 rcvbuf=2780158 rcv_ssthresh=2493778 window_clamp=2606398 rcv_wnd=2494464 family=AF_INET6 1466.681003: tcp:tcp_rcvbuf_grow: time=128 rtt_us=21 copied=1441792 inq=12288 space=1343488 ooo=0 scaling_ratio=240 rcvbuf=2932027 rcv_ssthresh=2578555 window_clamp=2748775 rcv_wnd=2568192 family=AF_I 1470.689959: tcp:tcp_rcvbuf_grow: time=255 rtt_us=122 copied=3932160 inq=204800 space=3551232 ooo=0 scaling_ratio=240 rcvbuf=8182038 rcv_ssthresh=7647384 window_clamp=7670660 rcv_wnd=7442432 family=AF 1471.754154: tcp:tcp_rcvbuf_grow: time=188 rtt_us=95 copied=2138112 inq=577536 space=1429504 ooo=0 scaling_ratio=240 rcvbuf=3113650 rcv_ssthresh=2806426 window_clamp=2919046 
rcv_wnd=2248704 family=AF_ 1476.813542: tcp:tcp_rcvbuf_grow: time=269 rtt_us=99 copied=3088384 inq=180224 space=2564096 ooo=0 scaling_ratio=240 rcvbuf=6219470 rcv_ssthresh=5771893 window_clamp=5830753 rcv_wnd=5509120 family=AF_ 1477.738309: tcp:tcp_rcvbuf_grow: time=166 rtt_us=54 copied=1777664 inq=180224 space=1417216 ooo=0 scaling_ratio=240 rcvbuf=3117118 rcv_ssthresh=2874958 window_clamp=2922298 rcv_wnd=2613248 family=AF_ We can see sk_rcvbuf values are much smaller, and that rtt_us (estimation of rtt from a receiver point of view) is kept small, instead of being bloated. No difference in throughput. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Tested-by: Paolo Abeni Link: https://patch.msgid.link/20251119084813.3684576-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netns/ipv4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 11837d3ccc0a..2dbd46fc4734 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -85,6 +85,7 @@ struct netns_ipv4 { /* 3 bytes hole, try to pack */ int sysctl_tcp_reordering; int sysctl_tcp_rmem[3]; + int sysctl_tcp_rcvbuf_low_rtt; __cacheline_group_end(netns_ipv4_read_rx); struct inet_timewait_death_row tcp_death_row; -- cgit v1.2.3 From 85081acc6b1188f2a6e5e605dc644225fcdf327f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 19 Nov 2025 10:03:50 +0000 Subject: net: stmmac: pass struct device to init()/exit() methods As struct plat_stmmacenet_data is not platform_device specific, pass a struct device into the init() and exit() methods to allow them to become independent of the underlying device. 
Signed-off-by: Russell King (Oracle) Acked-by: Chen-Yu Tsai Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1vLf2U-0000000FMN2-0SLg@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 4f70a6551e68..673b068fdadf 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -264,8 +264,8 @@ struct plat_stmmacenet_data { unsigned int mode, phy_interface_t interface); void (*ptp_clk_freq_config)(struct stmmac_priv *priv); - int (*init)(struct platform_device *pdev, void *priv); - void (*exit)(struct platform_device *pdev, void *priv); + int (*init)(struct device *dev, void *priv); + void (*exit)(struct device *dev, void *priv); int (*suspend)(struct device *dev, void *priv); int (*resume)(struct device *dev, void *priv); int (*mac_setup)(void *priv, struct mac_device_info *mac); -- cgit v1.2.3 From 6ff3310ca28298e363c78143b6a2f20312421f4e Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 19 Nov 2025 10:23:30 +0000 Subject: net: stmmac: move stmmac_axi_blen_to_mask() to stmmac_main.c Move the call to stmmac_axi_blen_to_mask() out of the individual MAC version drivers into the main code in stmmac_init_dma_engine(), passing the resulting value through a new member, axi_blen_regval, in the struct stmmac_axi structure. There is now no need for stmmac_axi_blen_to_dma_mask() to use u32p_replace_bits(), so use FIELD_PREP() instead. 
Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vLfLW-0000000FMb1-0zKV@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index 673b068fdadf..d1a41fe0825f 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -113,6 +113,7 @@ struct stmmac_axi { u32 axi_wr_osr_lmt; u32 axi_rd_osr_lmt; bool axi_kbbe; + u32 axi_blen_regval; u32 axi_blen[AXI_BLEN]; bool axi_fb; bool axi_mb; -- cgit v1.2.3 From efd3c8cc52bb9583183ebb83c8c55b23bf97cb2f Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 19 Nov 2025 10:23:40 +0000 Subject: net: stmmac: remove axi_blen array Remove the axi_blen array from struct stmmac_axi as we set this array, and then immediately convert it ot the register value, never looking at the array again. Thus, the array can be function local rather than part of a run-time allocated long-lived struct. Signed-off-by: Russell King (Oracle) Link: https://patch.msgid.link/E1vLfLg-0000000FMbD-1vmh@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/stmmac.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index d1a41fe0825f..f1054b9c2d8a 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -114,7 +114,6 @@ struct stmmac_axi { u32 axi_rd_osr_lmt; bool axi_kbbe; u32 axi_blen_regval; - u32 axi_blen[AXI_BLEN]; bool axi_fb; bool axi_mb; bool axi_rb; -- cgit v1.2.3 From 491c5dc98b848c4781addd514caed95039e5366c Mon Sep 17 00:00:00 2001 From: Yael Chemla Date: Wed, 19 Nov 2025 22:48:15 +0200 Subject: net: ethtool: Add support for 1600Gbps speed Add support for 1600Gbps link modes based on 200Gbps per lane [1]. This includes the adopted IEEE 802.3dj copper and optical PMDs that use 200G/lane signaling [2]. 
Add the following PMD types: - KR8 (backplane) - CR8 (copper cable) - DR8 (SMF 500m) - DR8-2 (SMF 2km) These modes are defined in the 802.3dj specifications. References: [1] https://www.ieee802.org/3/dj/public/23_03/opsasnick_3dj_01a_2303.pdf [2] https://www.ieee802.org/3/dj/projdoc/objectives_P802d3dj_240314.pdf Signed-off-by: Yael Chemla Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/1763585297-1243980-2-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 8bd5ea5469d9..eb7ff2602fbb 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2077,6 +2077,10 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_800000baseDR4_2_Full_BIT = 118, ETHTOOL_LINK_MODE_800000baseSR4_Full_BIT = 119, ETHTOOL_LINK_MODE_800000baseVR4_Full_BIT = 120, + ETHTOOL_LINK_MODE_1600000baseCR8_Full_BIT = 121, + ETHTOOL_LINK_MODE_1600000baseKR8_Full_BIT = 122, + ETHTOOL_LINK_MODE_1600000baseDR8_Full_BIT = 123, + ETHTOOL_LINK_MODE_1600000baseDR8_2_Full_BIT = 124, /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS @@ -2190,6 +2194,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_200000 200000 #define SPEED_400000 400000 #define SPEED_800000 800000 +#define SPEED_1600000 1600000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From d10f26a7abbd3dd5d59bac1acdca117385b54ea9 Mon Sep 17 00:00:00 2001 From: Yuji Ishikawa Date: Fri, 14 Nov 2025 15:53:58 +0900 Subject: dt-bindings: clock: tmpv770x: Remove definition of number of clocks Remove the definitions of number of clocks from bindings because they prevent adding new clocks. Since the previous patch removed all refereces within the driver, they can now be deleted. The same for resets and plls. 
Signed-off-by: Yuji Ishikawa Acked-by: Conor Dooley Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/toshiba,tmpv770x.h | 3 --- include/dt-bindings/reset/toshiba,tmpv770x.h | 1 - 2 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/dt-bindings/clock/toshiba,tmpv770x.h b/include/dt-bindings/clock/toshiba,tmpv770x.h index 5fce713001fd..89189c4f6a52 100644 --- a/include/dt-bindings/clock/toshiba,tmpv770x.h +++ b/include/dt-bindings/clock/toshiba,tmpv770x.h @@ -11,7 +11,6 @@ #define TMPV770X_PLL_PIDDRCPLL 4 #define TMPV770X_PLL_PIVOIFPLL 5 #define TMPV770X_PLL_PIIMGERPLL 6 -#define TMPV770X_NR_PLL 7 /* Clocks */ #define TMPV770X_CLK_PIPLL1_DIV1 0 @@ -141,7 +140,6 @@ #define TMPV770X_CLK_PIREFCLK 124 #define TMPV770X_CLK_SBUS 125 #define TMPV770X_CLK_BUSLCK 126 -#define TMPV770X_NR_CLK 127 /* Reset */ #define TMPV770X_RESET_PIETHER_2P5M 0 @@ -176,6 +174,5 @@ #define TMPV770X_RESET_PIPCMIF 29 #define TMPV770X_RESET_PICKMON 30 #define TMPV770X_RESET_SBUSCLK 31 -#define TMPV770X_NR_RESET 32 #endif /*_DT_BINDINGS_CLOCK_TOSHIBA_TMPV770X_H_ */ diff --git a/include/dt-bindings/reset/toshiba,tmpv770x.h b/include/dt-bindings/reset/toshiba,tmpv770x.h index c1007acb1941..bedfe253fa36 100644 --- a/include/dt-bindings/reset/toshiba,tmpv770x.h +++ b/include/dt-bindings/reset/toshiba,tmpv770x.h @@ -36,6 +36,5 @@ #define TMPV770X_RESET_PIPCMIF 29 #define TMPV770X_RESET_PICKMON 30 #define TMPV770X_RESET_SBUSCLK 31 -#define TMPV770X_NR_RESET 32 #endif /*_DT_BINDINGS_RESET_TOSHIBA_TMPV770X_H_ */ -- cgit v1.2.3 From beeff790c5679b3eacc8ee7021f775f447f47603 Mon Sep 17 00:00:00 2001 From: Yuji Ishikawa Date: Fri, 14 Nov 2025 16:05:11 +0900 Subject: dt-bindings: clock: tmpv770x: Add VIIF clocks Add clock and reset identifiers for the Video Input Interface. These identifiers support two instances: VIIF0 and VIIF1. 
Signed-off-by: Yuji Ishikawa Acked-by: Conor Dooley Signed-off-by: Stephen Boyd --- include/dt-bindings/clock/toshiba,tmpv770x.h | 11 +++++++++++ include/dt-bindings/reset/toshiba,tmpv770x.h | 8 ++++++++ 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/dt-bindings/clock/toshiba,tmpv770x.h b/include/dt-bindings/clock/toshiba,tmpv770x.h index 89189c4f6a52..a36c89266686 100644 --- a/include/dt-bindings/clock/toshiba,tmpv770x.h +++ b/include/dt-bindings/clock/toshiba,tmpv770x.h @@ -140,6 +140,9 @@ #define TMPV770X_CLK_PIREFCLK 124 #define TMPV770X_CLK_SBUS 125 #define TMPV770X_CLK_BUSLCK 126 +#define TMPV770X_CLK_VIIFBS1_L2ISP 127 +#define TMPV770X_CLK_VIIFBS1_L1ISP 128 +#define TMPV770X_CLK_VIIFBS1_PROC 129 /* Reset */ #define TMPV770X_RESET_PIETHER_2P5M 0 @@ -174,5 +177,13 @@ #define TMPV770X_RESET_PIPCMIF 29 #define TMPV770X_RESET_PICKMON 30 #define TMPV770X_RESET_SBUSCLK 31 +#define TMPV770X_RESET_VIIFBS0 32 +#define TMPV770X_RESET_VIIFBS0_APB 33 +#define TMPV770X_RESET_VIIFBS0_L2ISP 34 +#define TMPV770X_RESET_VIIFBS0_L1ISP 35 +#define TMPV770X_RESET_VIIFBS1 36 +#define TMPV770X_RESET_VIIFBS1_APB 37 +#define TMPV770X_RESET_VIIFBS1_L2ISP 38 +#define TMPV770X_RESET_VIIFBS1_L1ISP 39 #endif /*_DT_BINDINGS_CLOCK_TOSHIBA_TMPV770X_H_ */ diff --git a/include/dt-bindings/reset/toshiba,tmpv770x.h b/include/dt-bindings/reset/toshiba,tmpv770x.h index bedfe253fa36..9452bef31425 100644 --- a/include/dt-bindings/reset/toshiba,tmpv770x.h +++ b/include/dt-bindings/reset/toshiba,tmpv770x.h @@ -36,5 +36,13 @@ #define TMPV770X_RESET_PIPCMIF 29 #define TMPV770X_RESET_PICKMON 30 #define TMPV770X_RESET_SBUSCLK 31 +#define TMPV770X_RESET_VIIFBS0 32 +#define TMPV770X_RESET_VIIFBS0_APB 33 +#define TMPV770X_RESET_VIIFBS0_L2ISP 34 +#define TMPV770X_RESET_VIIFBS0_L1ISP 35 +#define TMPV770X_RESET_VIIFBS1 36 +#define TMPV770X_RESET_VIIFBS1_APB 37 +#define TMPV770X_RESET_VIIFBS1_L2ISP 38 +#define TMPV770X_RESET_VIIFBS1_L1ISP 39 #endif 
/*_DT_BINDINGS_RESET_TOSHIBA_TMPV770X_H_ */ -- cgit v1.2.3 From 011d133bb988f80d597a9cbdab659414ba7ff72b Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 18 Nov 2025 18:50:31 -0800 Subject: devlink: pass extack through to devlink_param::get() Allow devlink_param::get() handlers to report error messages via extack. This function is called in a few different contexts, but not all of them will have an valid extack to use. When devlink_param::get() is called from param_get_doit or param_get_dumpit contexts, pass the extack through so that drivers can report errors when retrieving param values. devlink_param::get() is called from the context of devlink_param_notify(), pass NULL in for the extack. Reviewed-by: Saeed Mahameed Reviewed-by: Aleksandr Loktionov Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20251119025038.651131-2-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 ++- include/net/dsa.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index d01046ef0577..5f479227144d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -490,7 +490,8 @@ struct devlink_param { enum devlink_param_type type; unsigned long supported_cmodes; int (*get)(struct devlink *devlink, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int (*set)(struct devlink *devlink, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); diff --git a/include/net/dsa.h b/include/net/dsa.h index 97d5f401cfcf..e40cdc12f7f3 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1251,7 +1251,8 @@ struct dsa_switch_ops { dsa_devlink_param_get, dsa_devlink_param_set, NULL) int dsa_devlink_param_get(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx); + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); int 
dsa_devlink_param_set(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 2a367002ed321e884276c3d7232a362ddd1bf7d6 Mon Sep 17 00:00:00 2001 From: Daniel Zahka Date: Tue, 18 Nov 2025 18:50:33 -0800 Subject: devlink: support default values for param-get and param-set Support querying and resetting to default param values. Introduce two new devlink netlink attrs: DEVLINK_ATTR_PARAM_VALUE_DEFAULT and DEVLINK_ATTR_PARAM_RESET_DEFAULT. The former is used to contain an optional parameter value inside of the param_value nested attribute. The latter is used in param-set requests from userspace to indicate that the driver should reset the param to its default value. To implement this, two new functions are added to the devlink driver api: devlink_param::get_default() and devlink_param::reset_default(). These callbacks allow drivers to implement default param actions for runtime and permanent cmodes. For driverinit params, the core latches the last value set by a driver via devl_param_driverinit_value_set(), and uses that as the default value for a param. Because default parameter values are optional, it would be impossible to discern whether or not a param of type bool has default value of false or not provided if the default value is encoded using a netlink flag type. For this reason, when a DEVLINK_PARAM_TYPE_BOOL has an associated default value, the default value is encoded using a u8 type. 
Signed-off-by: Daniel Zahka Link: https://patch.msgid.link/20251119025038.651131-4-daniel.zahka@gmail.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/devlink.h | 3 +++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f479227144d..cb839e0435a1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -479,6 +479,10 @@ struct devlink_flash_notify { * @set: set parameter value, used for runtime and permanent * configuration modes * @validate: validate input value is applicable (within value range, etc.) + * @get_default: get parameter default value, used for runtime and permanent + * configuration modes + * @reset_default: reset parameter to default value, used for runtime and permanent + * configuration modes * * This struct should be used by the driver to fill the data for * a parameter it registers. @@ -498,6 +502,12 @@ struct devlink_param { int (*validate)(struct devlink *devlink, u32 id, union devlink_param_value val, struct netlink_ext_ack *extack); + int (*get_default)(struct devlink *devlink, u32 id, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack); + int (*reset_default)(struct devlink *devlink, u32 id, + enum devlink_param_cmode cmode, + struct netlink_ext_ack *extack); }; struct devlink_param_item { @@ -509,6 +519,7 @@ struct devlink_param_item { * until reload. 
*/ bool driverinit_value_new_valid; + union devlink_param_value driverinit_default; }; enum devlink_param_generic_id { @@ -630,6 +641,37 @@ enum devlink_param_generic_id { .validate = _validate, \ } +#define DEVLINK_PARAM_GENERIC_WITH_DEFAULTS(_id, _cmodes, _get, _set, \ + _validate, _get_default, \ + _reset_default) \ +{ \ + .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ + .name = DEVLINK_PARAM_GENERIC_##_id##_NAME, \ + .type = DEVLINK_PARAM_GENERIC_##_id##_TYPE, \ + .generic = true, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ + .get_default = _get_default, \ + .reset_default = _reset_default, \ +} + +#define DEVLINK_PARAM_DRIVER_WITH_DEFAULTS(_id, _name, _type, _cmodes, \ + _get, _set, _validate, \ + _get_default, _reset_default) \ +{ \ + .id = _id, \ + .name = _name, \ + .type = _type, \ + .supported_cmodes = _cmodes, \ + .get = _get, \ + .set = _set, \ + .validate = _validate, \ + .get_default = _get_default, \ + .reset_default = _reset_default, \ +} + /* Identifier of board design */ #define DEVLINK_INFO_VERSION_GENERIC_BOARD_ID "board.id" /* Revision of board design */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 157f11d3fb72..e7d6b6d13470 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -639,6 +639,9 @@ enum devlink_attr { DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, /* u64 */ + DEVLINK_ATTR_PARAM_VALUE_DEFAULT, /* dynamic */ + DEVLINK_ATTR_PARAM_RESET_DEFAULT, /* flag */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. 
-- cgit v1.2.3 From 5d74781ebc86c5fa9e9d6934024c505412de9b52 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 20 Nov 2025 11:28:29 +0200 Subject: vfio/pci: Add dma-buf export support for MMIO regions Add support for exporting PCI device MMIO regions through dma-buf, enabling safe sharing of non-struct page memory with controlled lifetime management. This allows RDMA and other subsystems to import dma-buf FDs and build them into memory regions for PCI P2P operations. The implementation provides a revocable attachment mechanism using dma-buf move operations. MMIO regions are normally pinned as BARs don't change physical addresses, but access is revoked when the VFIO device is closed or a PCI reset is issued. This ensures kernel self-defense against potentially hostile userspace. Currently VFIO can take MMIO regions from the device's BAR and map them into a PFNMAP VMA with special PTEs. This mapping type ensures the memory cannot be used with things like pin_user_pages(), hmm, and so on. In practice only the user process CPU and KVM can safely make use of these VMA. When VFIO shuts down these VMAs are cleaned by unmap_mapping_range() to prevent any UAF of the MMIO beyond driver unbind. However, VFIO type 1 has an insecure behavior where it uses follow_pfnmap_*() to fish a MMIO PFN out of a VMA and program it back into the IOMMU. This has a long history of enabling P2P DMA inside VMs, but has serious lifetime problems by allowing a UAF of the MMIO after the VFIO driver has been unbound. Introduce DMABUF as a new safe way to export a FD based handle for the MMIO regions. This can be consumed by existing DMABUF importers like RDMA or DRM without opening an UAF. A following series will add an importer to iommufd to obsolete the type 1 code and allow safe UAF-free MMIO P2P in VM cases. DMABUF has a built in synchronous invalidation mechanism called move_notify. 
VFIO keeps track of all drivers importing its MMIO and can invoke a synchronous invalidation callback to tell the importing drivers to DMA unmap and forget about the MMIO pfns. This process is being called revoke. This synchronous invalidation fully prevents any lifecycle problems. VFIO will do this before unbinding its driver ensuring there is no UAF of the MMIO beyond the driver lifecycle. Further, VFIO has additional behavior to block access to the MMIO during things like Function Level Reset. This is because some poor platforms may experience a MCE type crash when touching MMIO of a PCI device that is undergoing a reset. Today this is done by using unmap_mapping_range() on the VMAs. Extend that into the DMABUF world and temporarily revoke the MMIO from the DMABUF importers during FLR as well. This will more robustly prevent an errant P2P from possibly upsetting the platform. A DMABUF FD is a preferred handle for MMIO compared to using something like a pgmap because: - VFIO is supported, including its P2P feature, on archs that don't support pgmap - PCI devices have all sorts of BAR sizes, including ones smaller than a section so a pgmap cannot always be created - It is undesirable to waste a lot of memory for struct pages, especially for a case like a GPU with ~100GB of BAR size - We want a synchronous revoke semantic to support FLR with light hardware requirements Use the P2P subsystem to help generate the DMA mapping. This is a significant upgrade over the abuse of dma_map_resource() that has historically been used by DMABUF exporters. Experience with an OOT version of this patch shows that real systems do need this. This approach deals with all the P2P scenarios: - Non-zero PCI bus_offset - ACS flags routing traffic to the IOMMU - ACS flags that bypass the IOMMU - though vfio noiommu is required to hit this. There will be further work to formalize the revoke semantic in DMABUF. 
For now this acts like a move_notify dynamic exporter where importer fault handling will get a failure when they attempt to map. This means that only fully restartable fault capable importers can import the VFIO DMABUFs. A future revoke semantic should open this up to more HW as the HW only needs to invalidate, not handle restartable faults. Signed-off-by: Jason Gunthorpe Signed-off-by: Vivek Kasireddy Reviewed-by: Kevin Tian Signed-off-by: Leon Romanovsky Acked-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251120-dmabuf-vfio-v9-10-d7f71607f371@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 42 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index f541044e42a2..c9466ba323fa 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -26,6 +26,8 @@ struct vfio_pci_core_device; struct vfio_pci_region; +struct p2pdma_provider; +struct dma_buf_phys_vec; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -49,9 +51,48 @@ struct vfio_pci_region { u32 flags; }; +struct vfio_pci_device_ops { + int (*get_dmabuf_phys)(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +}; + +#if IS_ENABLED(CONFIG_VFIO_PCI_DMABUF) +int vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len); +int vfio_pci_core_get_dmabuf_phys(struct vfio_pci_core_device *vdev, + struct p2pdma_provider **provider, + unsigned int region_index, + struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges); +#else +static inline int 
+vfio_pci_core_fill_phys_vec(struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, + size_t nr_ranges, phys_addr_t start, + phys_addr_t len) +{ + return -EINVAL; +} +static inline int vfio_pci_core_get_dmabuf_phys( + struct vfio_pci_core_device *vdev, struct p2pdma_provider **provider, + unsigned int region_index, struct dma_buf_phys_vec *phys_vec, + struct vfio_region_dma_range *dma_ranges, size_t nr_ranges) +{ + return -EOPNOTSUPP; +} +#endif + struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; + const struct vfio_pci_device_ops *pci_ops; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; u8 *pci_config_map; @@ -94,6 +135,7 @@ struct vfio_pci_core_device { struct vfio_pci_core_device *sriov_pf_core_dev; struct notifier_block nb; struct rw_semaphore memory_lock; + struct list_head dmabufs; }; /* Will be exported for vfio pci drivers usage */ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 75100bf009ba..ac2329f24141 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -14,6 +14,7 @@ #include #include +#include #define VFIO_API_VERSION 0 @@ -1478,6 +1479,33 @@ struct vfio_device_feature_bus_master { }; #define VFIO_DEVICE_FEATURE_BUS_MASTER 10 +/** + * Upon VFIO_DEVICE_FEATURE_GET create a dma_buf fd for the + * regions selected. + * + * open_flags are the typical flags passed to open(2), eg O_RDWR, O_CLOEXEC, + * etc. offset/length specify a slice of the region to create the dmabuf from. + * nr_ranges is the total number of (P2P DMA) ranges that comprise the dmabuf. + * + * flags should be 0. + * + * Return: The fd number on success, -1 and errno is set on failure. 
+ */ +#define VFIO_DEVICE_FEATURE_DMA_BUF 11 + +struct vfio_region_dma_range { + __u64 offset; + __u64 length; +}; + +struct vfio_device_feature_dma_buf { + __u32 region_index; + __u32 open_flags; + __u32 flags; + __u32 nr_ranges; + struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges); +}; + /* -------- API for Type1 VFIO IOMMU -------- */ /** -- cgit v1.2.3 From 05954511b73e748d0370549ad9dd9cd95297d97a Mon Sep 17 00:00:00 2001 From: Jason Tian Date: Thu, 14 Aug 2025 09:52:52 -0700 Subject: RAS: Report all ARM processor CPER information to userspace The ARM processor CPER record was added in UEFI v2.6 and remained unchanged up to v2.10. Yet, the original arm_event trace code added by e9279e83ad1f ("trace, ras: add ARM processor error trace event") is incomplete, as it only traces some fields of UEFI 2.6 table N.16, not exporting any information from tables N.17 to N.29 of the record. This is not enough for the user to be able to figure out what exactly has happened or to take appropriate action. According to the UEFI v2.9 specification chapter N.2.4.4, the ARM processor error section includes: - several (ERR_INFO_NUM) ARM processor error information structures (Tables N.17 to N.20); - several (CONTEXT_INFO_NUM) ARM processor context information structures (Tables N.21 to N.29); - several vendor specific error information structures. The size is given by Section Length minus the size of the other fields. In addition, it also exports two fields that are parsed by the GHES driver when firmware reports it, e.g.: - error severity - CPU logical index Report all of this information to userspace via the ARM tracepoint so that userspace can properly record the error and take decisions related to CPU core isolation according to error severity and other info.
The updated ARM trace event now contains the following fields: ====================================== ============================= UEFI field on table N.16 ARM Processor trace fields ====================================== ============================= Validation handled when filling data for affinity MPIDR and running state. ERR_INFO_NUM pei_len CONTEXT_INFO_NUM ctx_len Section Length indirectly reported by pei_len, ctx_len and oem_len Error affinity level affinity MPIDR_EL1 mpidr MIDR_EL1 midr Running State running_state PSCI State psci_state Processor Error Information Structure pei_err - count at pei_len Processor Context ctx_err - count at ctx_len Vendor Specific Error Info oem - count at oem_len ====================================== ============================= It should be noted that decoding of tables N.17 to N.29, if needed, will be handled in userspace. That gives more flexibility, as there won't be any need to flood the kernel with micro-architecture specific error decoding. Also, decoding the other fields requires complex logic, and should be done for each of the several values inside the record field. So, let userspace daemons like rasdaemon decode them, parsing such tables and having vendor-specific micro-architecture-specific decoders.
[mchehab: modified description, solved merge conflicts and fixed coding style] Signed-off-by: Jason Tian Co-developed-by: Shengwei Luo Signed-off-by: Shengwei Luo Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Daniel Ferguson # rebased Reviewed-by: Jonathan Cameron Tested-by: Shiju Jose Acked-by: Borislav Petkov (AMD) Fixes: e9279e83ad1f ("trace, ras: add ARM processor error trace event") Link: https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-section Signed-off-by: Ard Biesheuvel --- include/linux/ras.h | 16 +++++++++++++--- include/ras/ras_event.h | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/ras.h b/include/linux/ras.h index a64182bc72ad..468941bfe855 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -24,8 +24,7 @@ int __init parse_cec_param(char *str); void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); -void log_arm_hw_error(struct cper_sec_proc_arm *err); - +void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev); #else static inline void log_non_standard_event(const guid_t *sec_type, @@ -33,7 +32,7 @@ log_non_standard_event(const guid_t *sec_type, const u8 sev, const u8 *err, const u32 len) { return; } static inline void -log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } +log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; } #endif struct atl_err { @@ -53,4 +52,15 @@ static inline unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } #endif /* CONFIG_AMD_ATL */ +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) +#include +/* + * Include ARM-specific SMP header which provides a function mapping mpidr to + * CPU logical index. 
+ */ +#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK) +#else +#define GET_LOGICAL_INDEX(mpidr) -EINVAL +#endif /* CONFIG_ARM || CONFIG_ARM64 */ + #endif /* __RAS_H__ */ diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index c8cd0f00c845..c9f0b1018bcc 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -168,11 +168,25 @@ TRACE_EVENT(mc_event, * This event is generated when hardware detects an ARM processor error * has occurred. UEFI 2.6 spec section N.2.4.4. */ +#define APEIL "ARM Processor Err Info data len" +#define APEID "ARM Processor Err Info raw data" +#define APECIL "ARM Processor Err Context Info data len" +#define APECID "ARM Processor Err Context Info raw data" +#define VSEIL "Vendor Specific Err Info data len" +#define VSEID "Vendor Specific Err Info raw data" TRACE_EVENT(arm_event, - TP_PROTO(const struct cper_sec_proc_arm *proc), + TP_PROTO(const struct cper_sec_proc_arm *proc, + const u8 *pei_err, + const u32 pei_len, + const u8 *ctx_err, + const u32 ctx_len, + const u8 *oem, + const u32 oem_len, + u8 sev, + int cpu), - TP_ARGS(proc), + TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu), TP_STRUCT__entry( __field(u64, mpidr) @@ -180,6 +194,14 @@ TRACE_EVENT(arm_event, __field(u32, running_state) __field(u32, psci_state) __field(u8, affinity) + __field(u32, pei_len) + __dynamic_array(u8, pei_buf, pei_len) + __field(u32, ctx_len) + __dynamic_array(u8, ctx_buf, ctx_len) + __field(u32, oem_len) + __dynamic_array(u8, oem_buf, oem_len) + __field(u8, sev) + __field(int, cpu) ), TP_fast_assign( @@ -199,12 +221,29 @@ TRACE_EVENT(arm_event, __entry->running_state = ~0; __entry->psci_state = ~0; } + __entry->pei_len = pei_len; + memcpy(__get_dynamic_array(pei_buf), pei_err, pei_len); + __entry->ctx_len = ctx_len; + memcpy(__get_dynamic_array(ctx_buf), ctx_err, ctx_len); + __entry->oem_len = oem_len; + memcpy(__get_dynamic_array(oem_buf), oem, oem_len); + __entry->sev = sev; + 
__entry->cpu = cpu; ), - TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " - "running state: %d; PSCI state: %d", + TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; " + "running state: %d; PSCI state: %d; " + "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s", + __entry->cpu, + __entry->sev, __entry->affinity, __entry->mpidr, __entry->midr, - __entry->running_state, __entry->psci_state) + __entry->running_state, __entry->psci_state, + APEIL, __entry->pei_len, APEID, + __print_hex(__get_dynamic_array(pei_buf), __entry->pei_len), + APECIL, __entry->ctx_len, APECID, + __print_hex(__get_dynamic_array(ctx_buf), __entry->ctx_len), + VSEIL, __entry->oem_len, VSEID, + __print_hex(__get_dynamic_array(oem_buf), __entry->oem_len)) ); /* -- cgit v1.2.3 From a976d790f49499ccaa0f991788ad8ebf92e7fd5c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 14 Aug 2025 09:52:54 -0700 Subject: efi/cper: Add a new helper function to print bitmasks Add a helper function to print a string with names associated to each bit field. A typical example is: const char * const bits[] = { "bit 3 name", "bit 4 name", "bit 5 name", }; char str[120]; unsigned int bitmask = BIT(3) | BIT(5); #define MASK GENMASK(5,3) cper_bits_to_str(str, sizeof(str), FIELD_GET(MASK, bitmask), bits, ARRAY_SIZE(bits)); The above code fills string "str" with "bit 3 name|bit 5 name". 
Reviewed-by: Jonathan Cameron Signed-off-by: Mauro Carvalho Chehab Acked-by: Borislav Petkov (AMD) Signed-off-by: Ard Biesheuvel --- include/linux/cper.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index 0ed60a91eca9..58f40477c824 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -588,6 +588,8 @@ const char *cper_mem_err_type_str(unsigned int); const char *cper_mem_err_status_str(u64 status); void cper_print_bits(const char *prefix, unsigned int bits, const char * const strs[], unsigned int strs_size); +int cper_bits_to_str(char *buf, int buf_size, unsigned long bits, + const char * const strs[], unsigned int strs_size); void cper_mem_err_pack(const struct cper_sec_mem_err *, struct cper_mem_err_compact *); const char *cper_mem_err_unpack(struct trace_seq *, -- cgit v1.2.3 From 96b010536ee020e716d28d9b359a4bcd18800aeb Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 14 Aug 2025 09:52:55 -0700 Subject: efi/cper: align ARM CPER type with UEFI 2.9A/2.10 specs Up to UEFI spec 2.9, the type byte of CPER struct for ARM processor was defined simply as: Type at byte offset 4: - Cache error - TLB Error - Bus Error - Micro-architectural Error All other values are reserved Yet, there was no information about how this would be encoded. Spec 2.9A errata corrected it by defining: - Bit 1 - Cache Error - Bit 2 - TLB Error - Bit 3 - Bus Error - Bit 4 - Micro-architectural Error All other values are reserved That actually aligns with the values already defined on older versions at N.2.4.1. Generic Processor Error Section. Spec 2.10 also preserves the same encoding as 2.9A. Adjust CPER and GHES handling code for both generic and ARM processors to properly handle UEFI 2.9A and 2.10 encoding.
Link: https://uefi.org/specs/UEFI/2.10/Apx_N_Common_Platform_Error_Record.html#arm-processor-error-information Signed-off-by: Mauro Carvalho Chehab Reviewed-by: Jonathan Cameron Acked-by: Borislav Petkov (AMD) Signed-off-by: Ard Biesheuvel --- include/linux/cper.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/cper.h b/include/linux/cper.h index 58f40477c824..5b1236d8c65b 100644 --- a/include/linux/cper.h +++ b/include/linux/cper.h @@ -297,11 +297,11 @@ enum { #define CPER_ARM_INFO_FLAGS_PROPAGATED BIT(2) #define CPER_ARM_INFO_FLAGS_OVERFLOW BIT(3) -#define CPER_ARM_CACHE_ERROR 0 -#define CPER_ARM_TLB_ERROR 1 -#define CPER_ARM_BUS_ERROR 2 -#define CPER_ARM_VENDOR_ERROR 3 -#define CPER_ARM_MAX_TYPE CPER_ARM_VENDOR_ERROR +#define CPER_ARM_ERR_TYPE_MASK GENMASK(4,1) +#define CPER_ARM_CACHE_ERROR BIT(1) +#define CPER_ARM_TLB_ERROR BIT(2) +#define CPER_ARM_BUS_ERROR BIT(3) +#define CPER_ARM_VENDOR_ERROR BIT(4) #define CPER_ARM_ERR_VALID_TRANSACTION_TYPE BIT(0) #define CPER_ARM_ERR_VALID_OPERATION_TYPE BIT(1) -- cgit v1.2.3 From 93863f3f859a626347ce2ec18947b11357b4ca14 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 20 Nov 2025 12:14:20 -0800 Subject: kbuild: Check for functions with ambiguous -ffunction-sections section names Commit 9c7dc1dd897a ("objtool: Warn on functions with ambiguous -ffunction-sections section names") only works for drivers which are compiled on architectures supported by objtool. Make a script to perform the same check for all architectures. 
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Greg Kroah-Hartman Link: https://patch.msgid.link/a6a49644a34964f7e02f3a8ce43af03e72817180.1763669451.git.jpoimboe@kernel.org --- include/asm-generic/vmlinux.lds.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 5efe1de2209b..0cdae6f809b5 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -110,7 +110,7 @@ * .text.startup could be __attribute__((constructor)) code in a *non* * ffunction-sections object, which should be placed in .init.text; or it could * be an actual function named startup() in an ffunction-sections object, which - * should be placed in .text. Objtool will detect and complain about any such + * should be placed in .text. The build will detect and complain about any such * ambiguously named functions. */ #define TEXT_MAIN \ -- cgit v1.2.3 From 8e8678e740ecde2ae4a0404fd9b4ed2b726e236d Mon Sep 17 00:00:00 2001 From: Janosch Frank Date: Tue, 8 Jul 2025 12:57:57 +0000 Subject: KVM: s390: Add capability that forwards operation exceptions Setting KVM_CAP_S390_USER_OPEREXEC will forward all operation exceptions to user space. This also includes the 0x0000 instructions managed by KVM_CAP_S390_USER_INSTR0. It's helpful if user space wants to emulate instructions which do not (yet) have an opcode. While we're at it refine the documentation for KVM_CAP_S390_USER_INSTR0. 
Signed-off-by: Janosch Frank Reviewed-by: Claudio Imbrenda Acked-by: Christian Borntraeger Signed-off-by: Janosch Frank --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 52f6000ab020..8ab07396ce3b 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -963,6 +963,7 @@ struct kvm_enable_cap { #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_ARM_CACHEABLE_PFNMAP_SUPPORTED 243 #define KVM_CAP_GUEST_MEMFD_FLAGS 244 +#define KVM_CAP_S390_USER_OPEREXEC 245 struct kvm_irq_routing_irqchip { __u32 irqchip; -- cgit v1.2.3 From d292dbb5640c5b73b5ad889ae31fe889a2bf3137 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Jun 2025 14:59:32 +0200 Subject: bug: Add BUG_FORMAT infrastructure Add BUG_FORMAT; an architecture opt-in feature that allows adding the WARN_printf() format string to the bug_entry table. Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115757.223371452@infradead.org --- include/asm-generic/bug.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2d9f61346dab..c7a1407b8669 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -42,6 +42,13 @@ struct bug_entry { #else signed int bug_addr_disp; #endif +#ifdef HAVE_ARCH_BUG_FORMAT +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + const char *format; +#else + signed int format_disp; +#endif +#endif #ifdef CONFIG_DEBUG_BUGVERBOSE #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS const char *file; -- cgit v1.2.3 From 30b82568b04e279d0d99482db036f1bdfecac522 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Jun 2025 15:01:38 +0200 Subject: bug: Clean up CONFIG_GENERIC_BUG_RELATIVE_POINTERS Three repeated CONFIG_GENERIC_BUG_RELATIVE_POINTERS #ifdefs right after one another yields unreadable code. Add a helper. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115757.341703850@infradead.org --- include/asm-generic/bug.h | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index c7a1407b8669..9ee622ae0c9a 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -35,26 +35,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #ifdef CONFIG_BUG -#ifdef CONFIG_GENERIC_BUG -struct bug_entry { #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - unsigned long bug_addr; +#define BUG_REL(type, name) type name #else - signed int bug_addr_disp; +#define BUG_REL(type, name) signed int name##_disp #endif + +#ifdef CONFIG_GENERIC_BUG +struct bug_entry { + BUG_REL(unsigned long, bug_addr); #ifdef HAVE_ARCH_BUG_FORMAT -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - const char *format; -#else - signed int format_disp; -#endif + BUG_REL(const char *, format); #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - const char *file; -#else - signed int file_disp; -#endif + BUG_REL(const char *, file); unsigned short line; #endif unsigned short flags; -- cgit v1.2.3 From 5c47b7f3d1a9d7589026a201abb8ad445f029246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 7 Jun 2025 10:51:24 +0200 Subject: bug: Add BUG_FORMAT_ARGS infrastructure Add BUG_FORMAT_ARGS; when an architecture is able to provide a va_list given pt_regs, use this to print format arguments. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115757.457339417@infradead.org --- include/asm-generic/bug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 9ee622ae0c9a..228873e13b95 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -13,6 +13,7 @@ #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) #define BUGFLAG_NO_CUT_HERE (1 << 3) /* CUT_HERE already sent */ +#define BUGFLAG_ARGS (1 << 4) #define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif -- cgit v1.2.3 From 7d2c27a0ec5ecec980b623ded45758918c00b164 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 7 Jun 2025 10:52:28 +0200 Subject: bug: Add report_bug_entry() Add a report_bug() variant where the bug_entry is already known. This is useful when the exception instruction is not instantiated per-site, but instead has a single instance. In such a case the bug_entry address might be passed along in a known register or something.
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115757.575795595@infradead.org --- include/linux/bug.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/bug.h b/include/linux/bug.h index a9948a9f1093..17a4933c611b 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -42,6 +42,7 @@ void bug_get_file_line(struct bug_entry *bug, const char **file, struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); +enum bug_trap_type report_bug_entry(struct bug_entry *bug, struct pt_regs *regs); /* These are defined by the architecture */ int is_valid_bugaddr(unsigned long addr); @@ -62,6 +63,13 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr, } struct bug_entry; + +static inline enum bug_trap_type +report_bug_entry(struct bug_entry *bug, struct pt_regs *regs) +{ + return BUG_TRAP_TYPE_BUG; +} + static inline void bug_get_file_line(struct bug_entry *bug, const char **file, unsigned int *line) { -- cgit v1.2.3 From 3fd45b871fde00f4fac96318a136bd256ec0b90b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2025 11:58:37 +0100 Subject: bug: Implement WARN_ON() using __WARN_FLAGS() This completes 3bc3c9c3ab6d ("bugs/core: Pass down the condition string of WARN_ON_ONCE(cond) warnings to __WARN_FLAGS()") and makes WARN_ON() and WARN_ON_ONCE() behaviour consistent. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115757.690999560@infradead.org --- include/asm-generic/bug.h | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 228873e13b95..4bfbeae30c42 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -109,21 +109,35 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); } while (0) #else #define __WARN() __WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN)) + #define __WARN_printf(taint, arg...) do { \ instrumentation_begin(); \ __warn_printk(arg); \ __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ instrumentation_end(); \ } while (0) -#define WARN_ON_ONCE(condition) ({ \ - int __ret_warn_on = !!(condition); \ - if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS("["#condition"] ", \ - BUGFLAG_ONCE | \ - BUGFLAG_TAINT(TAINT_WARN)); \ - unlikely(__ret_warn_on); \ + +#ifndef WARN_ON +#define WARN_ON(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS("["#condition"] ", \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ +}) +#endif + +#ifndef WARN_ON_ONCE +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_FLAGS("["#condition"] ", \ + BUGFLAG_ONCE | \ + BUGFLAG_TAINT(TAINT_WARN)); \ + unlikely(__ret_warn_on); \ }) #endif +#endif /* __WARN_FLAGS */ /* used internally by panic.c */ -- cgit v1.2.3 From b9b2c455f462b67954bee5f17c3d68355d37586f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Jun 2025 15:08:36 +0200 Subject: bug: Allow architectures to provide __WARN_printf() In addition to providing __WARN_FLAGS(), allow an architecture to also provide __WARN_printf(). 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115757.807154591@infradead.org --- include/asm-generic/bug.h | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 4bfbeae30c42..21d2c8f88d49 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -100,23 +100,9 @@ void warn_slowpath_fmt(const char *file, const int line, unsigned taint, const char *fmt, ...); extern __printf(1, 2) void __warn_printk(const char *fmt, ...); -#ifndef __WARN_FLAGS -#define __WARN() __WARN_printf(TAINT_WARN, NULL) -#define __WARN_printf(taint, arg...) do { \ - instrumentation_begin(); \ - warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \ - instrumentation_end(); \ - } while (0) -#else +#ifdef __WARN_FLAGS #define __WARN() __WARN_FLAGS("", BUGFLAG_TAINT(TAINT_WARN)) -#define __WARN_printf(taint, arg...) do { \ - instrumentation_begin(); \ - __warn_printk(arg); \ - __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ - instrumentation_end(); \ - } while (0) - #ifndef WARN_ON #define WARN_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ @@ -139,6 +125,27 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #endif #endif /* __WARN_FLAGS */ +#if defined(__WARN_FLAGS) && !defined(__WARN_printf) +#define __WARN_printf(taint, arg...) do { \ + instrumentation_begin(); \ + __warn_printk(arg); \ + __WARN_FLAGS("", BUGFLAG_NO_CUT_HERE | BUGFLAG_TAINT(taint));\ + instrumentation_end(); \ + } while (0) +#endif + +#ifndef __WARN_printf +#define __WARN_printf(taint, arg...) 
do { \ + instrumentation_begin(); \ + warn_slowpath_fmt(__FILE__, __LINE__, taint, arg); \ + instrumentation_end(); \ + } while (0) +#endif + +#ifndef __WARN +#define __WARN() __WARN_printf(TAINT_WARN, NULL) +#endif + /* used internally by panic.c */ #ifndef WARN_ON -- cgit v1.2.3 From a67df6d1b939ca98e1ad403f53e3ee57299b8c44 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 11 Nov 2025 14:46:10 +0100 Subject: uapi: cdc.h: cleanly provide for more interfaces and countries The spec requires at least one interface respectively country. It allows multiple ones. This needs to be clearly said in the UAPI. This is subject to sanity checking in cdc_parse_cdc_header(), thus we can trust the length. Signed-off-by: Oliver Neukum Link: https://patch.msgid.link/20251111134641.4118827-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/usb/cdc.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/usb/cdc.h b/include/uapi/linux/usb/cdc.h index 1924cf665448..7bd5d12d8b26 100644 --- a/include/uapi/linux/usb/cdc.h +++ b/include/uapi/linux/usb/cdc.h @@ -104,8 +104,10 @@ struct usb_cdc_union_desc { __u8 bDescriptorSubType; __u8 bMasterInterface0; - __u8 bSlaveInterface0; - /* ... and there could be other slave interfaces */ + union { + __u8 bSlaveInterface0; + __DECLARE_FLEX_ARRAY(__u8, bSlaveInterfaces); + }; } __attribute__ ((packed)); /* "Country Selection Functional Descriptor" from CDC spec 5.2.3.9 */ @@ -115,8 +117,10 @@ struct usb_cdc_country_functional_desc { __u8 bDescriptorSubType; __u8 iCountryCodeRelDate; - __le16 wCountyCode0; - /* ... 
and there can be a lot of country codes */ + union { + __le16 wCountryCode0; + __DECLARE_FLEX_ARRAY(__le16, wCountryCodes); + }; } __attribute__ ((packed)); /* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */ -- cgit v1.2.3 From a75a5b148b4e1d7c0525359be455d5a54024b714 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Fri, 14 Nov 2025 19:37:55 +0100 Subject: usb: ohci-da8xx: remove unused platform data We no longer support any board files for DaVinci in mainline and so struct da8xx_ohci_root_hub is no longer used. Remove it together with all the code it's used for. Signed-off-by: Bartosz Golaszewski Acked-by: Alan Stern Link: https://patch.msgid.link/20251114-davinci-usb-v1-1-737380353a74@linaro.org Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/usb-davinci.h | 22 ---------------------- 1 file changed, 22 deletions(-) delete mode 100644 include/linux/platform_data/usb-davinci.h (limited to 'include') diff --git a/include/linux/platform_data/usb-davinci.h b/include/linux/platform_data/usb-davinci.h deleted file mode 100644 index 879f5c78b91a..000000000000 --- a/include/linux/platform_data/usb-davinci.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * USB related definitions - * - * Copyright (C) 2009 MontaVista Software, Inc. - * - * This file is licensed under the terms of the GNU General Public License - * version 2. This program is licensed "as is" without any warranty of any - * kind, whether express or implied. 
- */ - -#ifndef __ASM_ARCH_USB_H -#define __ASM_ARCH_USB_H - -/* Passed as the platform data to the OHCI driver */ -struct da8xx_ohci_root_hub { - /* Time from power on to power good (in 2 ms units) */ - u8 potpgt; -}; - -void davinci_setup_usb(unsigned mA, unsigned potpgt_ms); - -#endif /* ifndef __ASM_ARCH_USB_H */ -- cgit v1.2.3 From c460697d3472d4252917fba9bbc1d1a23eafc124 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Nov 2025 10:47:56 +0000 Subject: lib: Support ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION ARCH_HAS_CPU_CACHE_INVALIDATE_MEMREGION provides the mechanism for invalidating certain memory regions in a cache-incoherent manner. Currently this is used by NVDIMM and CXL memory drivers in cases where it is necessary to flush all data from caches by physical address range. The operations in question are effectively memory hotplug, where stale data might otherwise remain in the caches. This is separate from the invalidates done to enable use of non-coherent DMA masters, primarily in terms of when it is needed (not related to DMA mappings) and how deep the flush must push data. The flushes done for non-coherent DMA only need to reach the Point of Coherence of a single host (which is often nearer CPUs and DMA masters than the physical storage). This operation must push the data out of non architectural caches (memory-side caches, write buffers etc) and typically all the way to the memory device. In some architectures these operations are supported by system components that may become available only later in boot as they are either present on a discoverable bus, or via a firmware description of an MMIO interface (e.g. ACPI DSDT). Provide a framework to handle this case. Architectures can opt in for this support via CONFIG_GENERIC_CPU_CACHE_MAINTENANCE Add a registration framework. Each driver provides an ops structure and the first op is Write Back and Invalidate by PA Range. The driver may over invalidate. 
For systems that can perform this operation asynchronously an optional completion check operation is also provided. If present that must be called to ensure that the action has finished. This provides a considerable performance advantage if multiple agents are involved in the maintenance operation. When multiple agents are present in the system each should register with this framework and the core code will issue the invalidate to all of them before checking for completion on each. This is done to avoid need for filtering in the core code which can become complex when interleave, potentially across different cache coherency hardware is going on, so it is easier to tell everyone and let those who don't care do nothing. Signed-off-by: Yicong Yang Co-developed-by: Jonathan Cameron Signed-off-by: Jonathan Cameron Acked-by: Conor Dooley Signed-off-by: Conor Dooley --- include/linux/cache_coherency.h | 61 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 include/linux/cache_coherency.h (limited to 'include') diff --git a/include/linux/cache_coherency.h b/include/linux/cache_coherency.h new file mode 100644 index 000000000000..cc81c5733e31 --- /dev/null +++ b/include/linux/cache_coherency.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Cache coherency maintenance operation device drivers + * + * Copyright Huawei 2025 + */ +#ifndef _LINUX_CACHE_COHERENCY_H_ +#define _LINUX_CACHE_COHERENCY_H_ + +#include +#include +#include + +struct cc_inval_params { + phys_addr_t addr; + size_t size; +}; + +struct cache_coherency_ops_inst; + +struct cache_coherency_ops { + int (*wbinv)(struct cache_coherency_ops_inst *cci, + struct cc_inval_params *invp); + int (*done)(struct cache_coherency_ops_inst *cci); +}; + +struct cache_coherency_ops_inst { + struct kref kref; + struct list_head node; + const struct cache_coherency_ops *ops; +}; + +int cache_coherency_ops_instance_register(struct cache_coherency_ops_inst *cci); +void 
cache_coherency_ops_instance_unregister(struct cache_coherency_ops_inst *cci); + +struct cache_coherency_ops_inst * +_cache_coherency_ops_instance_alloc(const struct cache_coherency_ops *ops, + size_t size); +/** + * cache_coherency_ops_instance_alloc - Allocate cache coherency ops instance + * @ops: Cache maintenance operations + * @drv_struct: structure that contains the struct cache_coherency_ops_inst + * @member: Name of the struct cache_coherency_ops_inst member in @drv_struct. + * + * This allocates a driver specific structure and initializes the + * cache_coherency_ops_inst embedded in the drv_struct. Upon success the + * pointer must be freed via cache_coherency_ops_instance_put(). + * + * Returns a &drv_struct * on success, %NULL on error. + */ +#define cache_coherency_ops_instance_alloc(ops, drv_struct, member) \ + ({ \ + static_assert(__same_type(struct cache_coherency_ops_inst, \ + ((drv_struct *)NULL)->member)); \ + static_assert(offsetof(drv_struct, member) == 0); \ + (drv_struct *)_cache_coherency_ops_instance_alloc(ops, \ + sizeof(drv_struct)); \ + }) +void cache_coherency_ops_instance_put(struct cache_coherency_ops_inst *cci); + +#endif -- cgit v1.2.3 From 1d6c915819f5b805c35487b6ce5923e31a28266b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Thu, 20 Nov 2025 16:05:38 -0800 Subject: powercap: intel_rapl: Prepare read_raw() interface for atomic-context callers The current read_raw() implementation of the TPMI, MMIO and MSR interfaces does not distinguish between atomic and non-atomic callers. rapl_msr_read_raw() uses rdmsrq_safe_on_cpu(), which can sleep and issue cross CPU calls. When MSR-based RAPL PMU support is enabled, PMU event handlers can invoke this function from atomic context where sleeping or rescheduling is not allowed. In atomic context, the caller is already executing on the target CPU, so a direct rdmsrq() is sufficient. 
To support such usage, introduce an atomic flag to the read_raw() interface to allow callers to pass the context information. Modify the common RAPL code to propagate this flag, and set the flag to reflect the calling contexts. Utilize the atomic flag in rapl_msr_read_raw() to perform direct MSR read with rdmsrq() when running in atomic context, and add a sanity check to ensure the target CPU matches the current CPU for such use cases. The TPMI and MMIO implementations do not require special atomic handling, so the flag is ignored in those paths. This is a preparatory patch for adding MSR-based RAPL PMU support. Signed-off-by: Kuppuswamy Sathyanarayanan Reviewed-by: Srinivas Pandruvada [ rjw: Subject tweak ] Link: https://patch.msgid.link/20251121000539.386069-2-sathyanarayanan.kuppuswamy@linux.intel.com Signed-off-by: Rafael J. Wysocki --- include/linux/intel_rapl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/intel_rapl.h b/include/linux/intel_rapl.h index c0397423d3a8..e9ade2ff4af6 100644 --- a/include/linux/intel_rapl.h +++ b/include/linux/intel_rapl.h @@ -152,7 +152,7 @@ struct rapl_if_priv { union rapl_reg reg_unit; union rapl_reg regs[RAPL_DOMAIN_MAX][RAPL_DOMAIN_REG_MAX]; int limits[RAPL_DOMAIN_MAX]; - int (*read_raw)(int id, struct reg_action *ra); + int (*read_raw)(int id, struct reg_action *ra, bool atomic); int (*write_raw)(int id, struct reg_action *ra); void *defaults; void *rpi; -- cgit v1.2.3 From 7923ae7698cf9728501974d76d8ea712686281bc Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 12 Nov 2025 18:57:29 -0600 Subject: x86,fs/resctrl: Detect io_alloc feature AMD's SDCIAE (SDCI Allocation Enforcement) PQE feature enables system software to control the portions of L3 cache used for direct insertion of data from I/O devices into the L3 cache. Introduce a generic resctrl cache resource property "io_alloc_capable" as the first part of the new "io_alloc" resctrl feature that will support AMD's SDCIAE.
Any architecture can set a cache resource as "io_alloc_capable" if a portion of the cache can be allocated for I/O traffic. Set the "io_alloc_capable" property for the L3 cache resource on x86 (AMD) systems that support SDCIAE. Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://patch.msgid.link/df85a9a6081674fd3ef6b4170920485512ce2ded.1762995456.git.babu.moger@amd.com --- include/linux/resctrl.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index a7d92718b653..533f240dbe21 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -206,6 +206,8 @@ struct rdt_mon_domain { * @arch_has_sparse_bitmasks: True if a bitmask like f00f is valid. * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache * level has CPU scope. + * @io_alloc_capable: True if portion of the cache can be configured + * for I/O traffic. */ struct resctrl_cache { unsigned int cbm_len; @@ -213,6 +215,7 @@ struct resctrl_cache { unsigned int shareable_bits; bool arch_has_sparse_bitmasks; bool arch_has_per_cpu_cfg; + bool io_alloc_capable; }; /** -- cgit v1.2.3 From 556d2892aa715286d840a74216c8fff885559261 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 12 Nov 2025 18:57:30 -0600 Subject: x86,fs/resctrl: Implement "io_alloc" enable/disable handlers "io_alloc" is the generic name of the new resctrl feature that enables system software to configure the portion of cache allocated for I/O traffic. On AMD systems, "io_alloc" resctrl feature is backed by AMD's L3 Smart Data Cache Injection Allocation Enforcement (SDCIAE). Introduce the architecture-specific functions that resctrl fs should call to enable, disable, or check status of the "io_alloc" feature. Change SDCIAE state by setting (to enable) or clearing (to disable) bit 1 of MSR_IA32_L3_QOS_EXT_CFG on all logical processors within the cache domain. 
Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Reinette Chatre Link: https://patch.msgid.link/9e9070100c320eab5368e088a3642443dee95ed7.1762995456.git.babu.moger@amd.com --- include/linux/resctrl.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h index 533f240dbe21..54701668b3df 100644 --- a/include/linux/resctrl.h +++ b/include/linux/resctrl.h @@ -657,6 +657,27 @@ void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_mon_domain *d, u32 closid, u32 rmid, int cntr_id, enum resctrl_event_id eventid); +/** + * resctrl_arch_io_alloc_enable() - Enable/disable io_alloc feature. + * @r: The resctrl resource. + * @enable: Enable (true) or disable (false) io_alloc on resource @r. + * + * This can be called from any CPU. + * + * Return: + * 0 on success, <0 on error. + */ +int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable); + +/** + * resctrl_arch_get_io_alloc_enabled() - Get io_alloc feature state. + * @r: The resctrl resource. + * + * Return: + * true if io_alloc is enabled or false if disabled. + */ +bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r); + extern unsigned int resctrl_rmid_realloc_threshold; extern unsigned int resctrl_rmid_realloc_limit; -- cgit v1.2.3 From e40f5a6bf88a781d5f81bc6b8aab9ac31d8c98dd Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 19 Nov 2025 17:03:54 +0100 Subject: bpf: correct stack liveness for tail calls This updates bpf_insn_successors() to reflect that control flow might jump over the instructions between a tail call and function exit. Otherwise the verifier might assume that some writes to the parent stack always happen, which is not the case. 
Signed-off-by: Eduard Zingerman Signed-off-by: Martin Teichmann Link: https://lore.kernel.org/r/20251119160355.1160932-4-martin.teichmann@xfel.eu Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 5441341f1ab9..8d0b60fa5f2b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -527,7 +527,6 @@ struct bpf_insn_aux_data { struct { u32 map_index; /* index into used_maps[] */ u32 map_off; /* offset from value base address */ - struct bpf_iarray *jt; /* jump table for gotox instruction */ }; struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ @@ -550,6 +549,7 @@ struct bpf_insn_aux_data { /* remember the offset of node field within type to rewrite */ u64 insert_off; }; + struct bpf_iarray *jt; /* jump table for gotox or bpf_tailcall call instruction */ struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ @@ -652,6 +652,7 @@ struct bpf_subprog_info { u32 start; /* insn idx of function entry point */ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ u32 postorder_start; /* The idx to the env->cfg.insn_postorder */ + u32 exit_idx; /* Index of one of the BPF_EXIT instructions in this subprogram */ u16 stack_depth; /* max. stack depth used by this function */ u16 stack_extra; /* offsets in range [stack_depth .. 
fastcall_stack_off) @@ -669,9 +670,9 @@ struct bpf_subprog_info { bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; bool might_sleep: 1; + u8 arg_cnt:3; enum priv_stack_mode priv_stack_mode; - u8 arg_cnt; struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS]; }; -- cgit v1.2.3 From c7dcb041ce7d32c0becd43e8f99f993365e6bd20 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 13 Nov 2025 18:57:08 -0800 Subject: crypto: ansi_cprng - Remove unused ansi_cprng algorithm Remove ansi_cprng, since it's obsolete and unused, as confirmed at https://lore.kernel.org/r/aQxpnckYMgAAOLpZ@gondor.apana.org.au/ This was originally added in 2008, apparently as a FIPS approved random number generator. Whether this has ever belonged upstream is questionable. Either way, ansi_cprng is no longer usable for this purpose, since it's been superseded by the more modern algorithms in crypto/drbg.c, and FIPS itself no longer allows it. (NIST SP 800-131A Rev 1 (2015) says that RNGs based on ANSI X9.31 will be disallowed after 2015. NIST SP 800-131A Rev 2 (2019) confirms they are now disallowed.) Therefore, there is no reason to keep it around. Suggested-by: Herbert Xu Cc: Haotian Zhang Cc: Neil Horman Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/rng.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/crypto/rng.h b/include/crypto/rng.h index f8224cc390f8..d451b54b322a 100644 --- a/include/crypto/rng.h +++ b/include/crypto/rng.h @@ -169,12 +169,11 @@ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm, * * The reset function completely re-initializes the random number generator * referenced by the cipher handle by clearing the current state. The new state - * is initialized with the caller provided seed or automatically, depending - * on the random number generator type (the ANSI X9.31 RNG requires - * caller-provided seed, the SP800-90A DRBGs perform an automatic seeding). 
- * The seed is provided as a parameter to this function call. The provided seed - * should have the length of the seed size defined for the random number - * generator as defined by crypto_rng_seedsize. + * is initialized with the caller provided seed or automatically, depending on + * the random number generator type. (The SP800-90A DRBGs perform an automatic + * seeding.) The seed is provided as a parameter to this function call. The + * provided seed should have the length of the seed size defined for the random + * number generator as defined by crypto_rng_seedsize. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ -- cgit v1.2.3 From 4dffc9bbffb9ccfcda730d899c97c553599e7ca8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Nov 2025 15:08:16 -0800 Subject: crypto: scatterwalk - Fix memcpy_sglist() to always succeed The original implementation of memcpy_sglist() was broken because it didn't handle scatterlists that describe exactly the same memory, which is a case that many callers rely on. The current implementation is broken too because it calls the skcipher_walk functions which can fail. It ignores any errors from those functions. Fix it by replacing it with a new implementation written from scratch. It always succeeds. It's also a bit faster, since it avoids the overhead of skcipher_walk. skcipher_walk includes a lot of functionality (such as alignmask handling) that's irrelevant here. 
Reported-by: Colin Ian King Closes: https://lore.kernel.org/r/20251114122620.111623-1-coking@nvidia.com Fixes: 131bdceca1f0 ("crypto: scatterwalk - Add memcpy_sglist") Fixes: 0f8d42bf128d ("crypto: scatterwalk - Move skcipher walk and use it for memcpy_sglist") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/scatterwalk.h | 52 ++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index 83d14376ff2b..f485454e3955 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -227,6 +227,34 @@ static inline void scatterwalk_done_src(struct scatter_walk *walk, scatterwalk_advance(walk, nbytes); } +/* + * Flush the dcache of any pages that overlap the region + * [offset, offset + nbytes) relative to base_page. + * + * This should be called only when ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE, to ensure + * that all relevant code (including the call to sg_page() in the caller, if + * applicable) gets fully optimized out when !ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE. + */ +static inline void __scatterwalk_flush_dcache_pages(struct page *base_page, + unsigned int offset, + unsigned int nbytes) +{ + unsigned int num_pages; + + base_page += offset / PAGE_SIZE; + offset %= PAGE_SIZE; + + /* + * This is an overflow-safe version of + * num_pages = DIV_ROUND_UP(offset + nbytes, PAGE_SIZE). 
+ */ + num_pages = nbytes / PAGE_SIZE; + num_pages += DIV_ROUND_UP(offset + (nbytes % PAGE_SIZE), PAGE_SIZE); + + for (unsigned int i = 0; i < num_pages; i++) + flush_dcache_page(base_page + i); +} + /** * scatterwalk_done_dst() - Finish one step of a walk of destination scatterlist * @walk: the scatter_walk @@ -240,27 +268,9 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk, unsigned int nbytes) { scatterwalk_unmap(walk); - /* - * Explicitly check ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE instead of just - * relying on flush_dcache_page() being a no-op when not implemented, - * since otherwise the BUG_ON in sg_page() does not get optimized out. - * This also avoids having to consider whether the loop would get - * reliably optimized out or not. - */ - if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) { - struct page *base_page; - unsigned int offset; - int start, end, i; - - base_page = sg_page(walk->sg); - offset = walk->offset; - start = offset >> PAGE_SHIFT; - end = start + (nbytes >> PAGE_SHIFT); - end += (offset_in_page(offset) + offset_in_page(nbytes) + - PAGE_SIZE - 1) >> PAGE_SHIFT; - for (i = start; i < end; i++) - flush_dcache_page(base_page + i); - } + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) + __scatterwalk_flush_dcache_pages(sg_page(walk->sg), + walk->offset, nbytes); scatterwalk_advance(walk, nbytes); } -- cgit v1.2.3 From 20d868a77f11ba050fe96e7b8efb8ec3b6f2737f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 15 Nov 2025 15:08:17 -0800 Subject: Revert "crypto: scatterwalk - Move skcipher walk and use it for memcpy_sglist" This reverts commit 0f8d42bf128d349ad490e87d5574d211245e40f1, with the memcpy_sglist() part dropped. Now that memcpy_sglist() no longer uses the skcipher_walk code, the skcipher_walk code can be moved back to where it belongs. 
Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu --- include/crypto/algapi.h | 12 +++++++ include/crypto/internal/skcipher.h | 48 +++++++++++++++++++++++++++- include/crypto/scatterwalk.h | 65 ++------------------------------------ 3 files changed, 61 insertions(+), 64 deletions(-) (limited to 'include') diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h index fc4574940636..05deea9dac5e 100644 --- a/include/crypto/algapi.h +++ b/include/crypto/algapi.h @@ -107,6 +107,18 @@ struct crypto_queue { unsigned int max_qlen; }; +struct scatter_walk { + /* Must be the first member, see struct skcipher_walk. */ + union { + void *const addr; + + /* Private API field, do not touch. */ + union crypto_no_such_thing *__addr; + }; + struct scatterlist *sg; + unsigned int offset; +}; + struct crypto_attr_alg { char name[CRYPTO_MAX_ALG_NAME]; }; diff --git a/include/crypto/internal/skcipher.h b/include/crypto/internal/skcipher.h index d5aa535263f6..0cad8e7364c8 100644 --- a/include/crypto/internal/skcipher.h +++ b/include/crypto/internal/skcipher.h @@ -10,7 +10,6 @@ #include #include -#include #include #include @@ -55,6 +54,47 @@ struct crypto_lskcipher_spawn { struct crypto_spawn base; }; +struct skcipher_walk { + union { + /* Virtual address of the source. */ + struct { + struct { + const void *const addr; + } virt; + } src; + + /* Private field for the API, do not use. */ + struct scatter_walk in; + }; + + union { + /* Virtual address of the destination. */ + struct { + struct { + void *const addr; + } virt; + } dst; + + /* Private field for the API, do not use. 
*/ + struct scatter_walk out; + }; + + unsigned int nbytes; + unsigned int total; + + u8 *page; + u8 *buffer; + u8 *oiv; + void *iv; + + unsigned int ivsize; + + int flags; + unsigned int blocksize; + unsigned int stride; + unsigned int alignmask; +}; + static inline struct crypto_instance *skcipher_crypto_instance( struct skcipher_instance *inst) { @@ -171,6 +211,7 @@ void crypto_unregister_lskciphers(struct lskcipher_alg *algs, int count); int lskcipher_register_instance(struct crypto_template *tmpl, struct lskcipher_instance *inst); +int skcipher_walk_done(struct skcipher_walk *walk, int res); int skcipher_walk_virt(struct skcipher_walk *__restrict walk, struct skcipher_request *__restrict req, bool atomic); @@ -181,6 +222,11 @@ int skcipher_walk_aead_decrypt(struct skcipher_walk *__restrict walk, struct aead_request *__restrict req, bool atomic); +static inline void skcipher_walk_abort(struct skcipher_walk *walk) +{ + skcipher_walk_done(walk, -ECANCELED); +} + static inline void *crypto_skcipher_ctx(struct crypto_skcipher *tfm) { return crypto_tfm_ctx(&tfm->base); diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index f485454e3955..624fab589c2c 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -11,64 +11,11 @@ #ifndef _CRYPTO_SCATTERWALK_H #define _CRYPTO_SCATTERWALK_H -#include +#include + #include #include #include -#include - -struct scatter_walk { - /* Must be the first member, see struct skcipher_walk. */ - union { - void *const addr; - - /* Private API field, do not touch. */ - union crypto_no_such_thing *__addr; - }; - struct scatterlist *sg; - unsigned int offset; -}; - -struct skcipher_walk { - union { - /* Virtual address of the source. */ - struct { - struct { - const void *const addr; - } virt; - } src; - - /* Private field for the API, do not use. */ - struct scatter_walk in; - }; - - union { - /* Virtual address of the destination. 
*/ - struct { - struct { - void *const addr; - } virt; - } dst; - - /* Private field for the API, do not use. */ - struct scatter_walk out; - }; - - unsigned int nbytes; - unsigned int total; - - u8 *page; - u8 *buffer; - u8 *oiv; - void *iv; - - unsigned int ivsize; - - int flags; - unsigned int blocksize; - unsigned int stride; - unsigned int alignmask; -}; static inline void scatterwalk_crypto_chain(struct scatterlist *head, struct scatterlist *sg, int num) @@ -306,12 +253,4 @@ struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2], struct scatterlist *src, unsigned int len); -int skcipher_walk_first(struct skcipher_walk *walk, bool atomic); -int skcipher_walk_done(struct skcipher_walk *walk, int res); - -static inline void skcipher_walk_abort(struct skcipher_walk *walk) -{ - skcipher_walk_done(walk, -ECANCELED); -} - #endif /* _CRYPTO_SCATTERWALK_H */ -- cgit v1.2.3 From 4167096cb964325ed88cd558f5b0c61fcaab44c1 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Mon, 17 Nov 2025 20:04:09 +0000 Subject: bpf: support nested rcu critical sections Currently, nested rcu critical sections are rejected by the verifier and rcu_lock state is managed by a boolean variable. Add support for nested rcu critical sections by making active_rcu_locks a counter similar to active_preempt_locks. bpf_rcu_read_lock() increments this counter and bpf_rcu_read_unlock() decrements it; the MEM_RCU -> PTR_UNTRUSTED transition happens when active_rcu_locks drops to 0. 
Signed-off-by: Puranjay Mohan Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20251117200411.25563-2-puranjay@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 8d0b60fa5f2b..130bcbd66f60 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -416,7 +416,7 @@ struct bpf_verifier_state { u32 active_irq_id; u32 active_lock_id; void *active_lock_ptr; - bool active_rcu_lock; + u32 active_rcu_locks; bool speculative; bool in_sleepable; -- cgit v1.2.3 From ac529d86ad26d632d3c70b7c5b839282a3294d2f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:48 +0100 Subject: mempool: add mempool_{alloc,free}_bulk Add a version of the mempool allocator that works for batch allocations of multiple objects. Calling mempool_alloc in a loop is not safe because it could deadlock if multiple threads are performing such an allocation at the same time. As an extra benefit the interface is built so that the same array can be used for alloc_pages_bulk / release_pages so that at least for page backed mempools the fast path can use a nice batch optimization. Note that mempool_alloc_bulk does not take a gfp_mask argument as it must always be able to sleep and doesn't support any non-trivial modifiers. NOFS or NOIO constraints must be set through the scoped API. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-8-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 34941a4b9026..e914fec0e119 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -66,9 +66,15 @@ extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) +int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, + unsigned int count, unsigned int allocated); +#define mempool_alloc_bulk(...) \ + alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; extern void mempool_free(void *element, mempool_t *pool); +unsigned int mempool_free_bulk(struct mempool *pool, void **elem, + unsigned int count); /* * A mempool_alloc_t and mempool_free_t that get the memory from -- cgit v1.2.3 From 8b41fb80a2cc023591f47d63b094e96af9c2c615 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:50 +0100 Subject: mempool: remove mempool_{init,create}_kvmalloc_pool This was added for bcachefs and is unused now. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-10-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'include') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index e914fec0e119..d9332485e8ca 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -103,19 +103,6 @@ void mempool_kfree(void *element, void *pool_data); mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ (void *)(unsigned long)(_size)) -void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); -void mempool_kvfree(void *element, void *pool_data); - -static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - -static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) -{ - return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); -} - /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data -- cgit v1.2.3 From 0cab6873b7305abdd0acd95ee8cfa56b983500da Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 09:39:51 +0100 Subject: mempool: de-typedef Switch all uses of the deprecated mempool_t typedef in the core mempool code to use struct mempool instead. 
Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113084022.1255121-11-hch@lst.de Signed-off-by: Vlastimil Babka --- include/linux/mempool.h | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/mempool.h b/include/linux/mempool.h index d9332485e8ca..e8e440e04a06 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -27,32 +27,31 @@ typedef struct mempool { wait_queue_head_t wait; } mempool_t; -static inline bool mempool_initialized(mempool_t *pool) +static inline bool mempool_initialized(struct mempool *pool) { return pool->elements != NULL; } -static inline bool mempool_is_saturated(mempool_t *pool) +static inline bool mempool_is_saturated(struct mempool *pool) { return READ_ONCE(pool->curr_nr) >= pool->min_nr; } -void mempool_exit(mempool_t *pool); -int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id); - -int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); +void mempool_exit(struct mempool *pool); +int mempool_init_node(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int node_id); +int mempool_init_noprof(struct mempool *pool, int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data); #define mempool_init(...) 
\ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) -extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data); - -extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int nid); +struct mempool *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +struct mempool *mempool_create_node_noprof(int min_nr, + mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, + void *pool_data, gfp_t gfp_mask, int nid); #define mempool_create_node(...) \ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) @@ -60,10 +59,10 @@ extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ GFP_KERNEL, NUMA_NO_NODE) -extern int mempool_resize(mempool_t *pool, int new_min_nr); -extern void mempool_destroy(mempool_t *pool); +int mempool_resize(struct mempool *pool, int new_min_nr); +void mempool_destroy(struct mempool *pool); -extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; +void *mempool_alloc_noprof(struct mempool *pool, gfp_t gfp_mask) __malloc; #define mempool_alloc(...) \ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, @@ -71,8 +70,8 @@ int mempool_alloc_bulk_noprof(struct mempool *pool, void **elem, #define mempool_alloc_bulk(...) 
\ alloc_hooks(mempool_alloc_bulk_noprof(__VA_ARGS__)) -extern void *mempool_alloc_preallocated(mempool_t *pool) __malloc; -extern void mempool_free(void *element, mempool_t *pool); +void *mempool_alloc_preallocated(struct mempool *pool) __malloc; +void mempool_free(void *element, struct mempool *pool); unsigned int mempool_free_bulk(struct mempool *pool, void **elem, unsigned int count); -- cgit v1.2.3 From 447c4e8338dbfad517769d26b53d633b88d51184 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Thu, 30 Oct 2025 20:26:28 +0200 Subject: PM / devfreq: Move governor.h to a public header location Some device drivers (and out-of-tree modules) might want to define device-specific device governors. Rather than restricting all of them to be a part of drivers/devfreq/ (which is not possible for out-of-tree drivers anyway) move governor.h to include/linux/devfreq-governor.h and update all drivers to use it. The devfreq_cpu_data is only used internally, by the passive governor, so it is moved to the driver source rather than being a part of the public interface. Reported-by: Robie Basak Acked-by: Jon Hunter Signed-off-by: Dmitry Baryshkov Reviewed-by: Bjorn Andersson Acked-by: MyungJoo Ham Signed-off-by: Chanwoo Choi Link: https://patchwork.kernel.org/project/linux-pm/patch/20251030-governor-public-v2-1-432a11a9975a@oss.qualcomm.com/ --- include/linux/devfreq-governor.h | 102 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 include/linux/devfreq-governor.h (limited to 'include') diff --git a/include/linux/devfreq-governor.h b/include/linux/devfreq-governor.h new file mode 100644 index 000000000000..dfdd0160a29f --- /dev/null +++ b/include/linux/devfreq-governor.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * governor.h - internal header for devfreq governors. 
+ * + * Copyright (C) 2011 Samsung Electronics + * MyungJoo Ham + * + * This header is for devfreq governors + */ + +#ifndef __LINUX_DEVFREQ_DEVFREQ_H__ +#define __LINUX_DEVFREQ_DEVFREQ_H__ + +#include + +#define DEVFREQ_NAME_LEN 16 + +#define to_devfreq(DEV) container_of((DEV), struct devfreq, dev) + +/* Devfreq events */ +#define DEVFREQ_GOV_START 0x1 +#define DEVFREQ_GOV_STOP 0x2 +#define DEVFREQ_GOV_UPDATE_INTERVAL 0x3 +#define DEVFREQ_GOV_SUSPEND 0x4 +#define DEVFREQ_GOV_RESUME 0x5 + +#define DEVFREQ_MIN_FREQ 0 +#define DEVFREQ_MAX_FREQ ULONG_MAX + +/* + * Definition of the governor feature flags + * - DEVFREQ_GOV_FLAG_IMMUTABLE + * : This governor is never changeable to other governors. + * - DEVFREQ_GOV_FLAG_IRQ_DRIVEN + * : The devfreq won't schedule the work for this governor. + */ +#define DEVFREQ_GOV_FLAG_IMMUTABLE BIT(0) +#define DEVFREQ_GOV_FLAG_IRQ_DRIVEN BIT(1) + +/* + * Definition of governor attribute flags except for common sysfs attributes + * - DEVFREQ_GOV_ATTR_POLLING_INTERVAL + * : Indicate polling_interval sysfs attribute + * - DEVFREQ_GOV_ATTR_TIMER + * : Indicate timer sysfs attribute + */ +#define DEVFREQ_GOV_ATTR_POLLING_INTERVAL BIT(0) +#define DEVFREQ_GOV_ATTR_TIMER BIT(1) + +/** + * struct devfreq_governor - Devfreq policy governor + * @node: list node - contains registered devfreq governors + * @name: Governor's name + * @attrs: Governor's sysfs attribute flags + * @flags: Governor's feature flags + * @get_target_freq: Returns desired operating frequency for the device. + * Basically, get_target_freq will run + * devfreq_dev_profile.get_dev_status() to get the + * status of the device (load = busy_time / total_time). + * @event_handler: Callback for devfreq core framework to notify events + * to governors. Events include per device governor + * init and exit, opp changes out of devfreq, suspend + * and resume of per device devfreq during device idle. + * + * Note that the callbacks are called with devfreq->lock locked by devfreq. 
+ */ +struct devfreq_governor { + struct list_head node; + + const char name[DEVFREQ_NAME_LEN]; + const u64 attrs; + const u64 flags; + int (*get_target_freq)(struct devfreq *this, unsigned long *freq); + int (*event_handler)(struct devfreq *devfreq, + unsigned int event, void *data); +}; + +void devfreq_monitor_start(struct devfreq *devfreq); +void devfreq_monitor_stop(struct devfreq *devfreq); +void devfreq_monitor_suspend(struct devfreq *devfreq); +void devfreq_monitor_resume(struct devfreq *devfreq); +void devfreq_update_interval(struct devfreq *devfreq, unsigned int *delay); + +int devfreq_add_governor(struct devfreq_governor *governor); +int devfreq_remove_governor(struct devfreq_governor *governor); + +int devm_devfreq_add_governor(struct device *dev, + struct devfreq_governor *governor); + +int devfreq_update_status(struct devfreq *devfreq, unsigned long freq); +int devfreq_update_target(struct devfreq *devfreq, unsigned long freq); +void devfreq_get_freq_range(struct devfreq *devfreq, unsigned long *min_freq, + unsigned long *max_freq); + +static inline int devfreq_update_stats(struct devfreq *df) +{ + if (!df->profile->get_dev_status) + return -EINVAL; + + return df->profile->get_dev_status(df->dev.parent, &df->last_status); +} +#endif /* __LINUX_DEVFREQ_DEVFREQ_H__ */ -- cgit v1.2.3 From 074e16d58e6b78612c22ff611aa469ee929cc37f Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 23 Nov 2025 06:48:19 +0100 Subject: compiler_types: introduce at_least parameter decoration pseudo keyword MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang and recent gcc support warning if they are able to prove that the user is passing to a function an array that is too short in size. 
For example: void blah(unsigned char herp[at_least 7]); static void schma(void) { unsigned char good[] = { 1, 2, 3, 4, 5, 6, 7 }; unsigned char bad[] = { 1, 2, 3, 4, 5, 6 }; blah(good); blah(bad); } The notation here, `static 7`, which this commit makes explicit by allowing us to write it as `at_least 7`, means that it's incorrect to pass anything less than 7 elements. This is section 6.7.5.3 of C99: If the keyword static also appears within the [ and ] of the array type derivation, then for each call to the function, the value of the corresponding actual argument shall provide access to the first element of an array with at least as many elements as specified by the size expression. Here is the output from gcc 15: zx2c4@thinkpad /tmp $ gcc -c a.c a.c: In function ‘schma’: a.c:9:9: warning: ‘blah’ accessing 7 bytes in a region of size 6 [-Wstringop-overflow=] 9 | blah(bad); | ^~~~~~~~~ a.c:9:9: note: referencing argument 1 of type ‘unsigned char[7]’ a.c:2:6: note: in a call to function ‘blah’ 2 | void blah(unsigned char herp[at_least 7]); | ^~~~ And from clang 21: zx2c4@thinkpad /tmp $ clang -c a.c a.c:9:2: warning: array argument is too small; contains 6 elements, callee requires at least 7 [-Warray-bounds] 9 | blah(bad); | ^ ~~~ a.c:2:25: note: callee declares array parameter as static here 2 | void blah(unsigned char herp[at_least 7]); | ^ ~~~~~~~~~~ 1 warning generated. So these are covered by, variously, -Wstringop-overflow and -Warray-bounds. Acked-by: Ard Biesheuvel Signed-off-by: "Jason A. 
Donenfeld" Link: https://lore.kernel.org/r/20251123054819.2371989-3-Jason@zx2c4.com Signed-off-by: Eric Biggers --- include/linux/compiler_types.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 59288a2c1ad2..51f0dccdb54d 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -394,6 +394,21 @@ struct ftrace_likely_data { #define __counted_by_be(member) __counted_by(member) #endif +/* + * This designates the minimum number of elements a passed array parameter must + * have. For example: + * + * void some_function(u8 param[at_least 7]); + * + * If a caller passes an array with fewer than 7 elements, the compiler will + * emit a warning. + */ +#ifndef __CHECKER__ +#define at_least static +#else +#define at_least +#endif + /* Do not trap wrapping arithmetic within an annotated function. */ #ifdef CONFIG_UBSAN_INTEGER_WRAP # define __signed_wrap __attribute__((no_sanitize("signed-integer-overflow"))) -- cgit v1.2.3 From ac653d57ad8bb873c1c68fe77a1dee81cc1e365d Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 23 Nov 2025 06:48:20 +0100 Subject: lib/crypto: chacha20poly1305: Statically check fixed array lengths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several parameters of the chacha20poly1305 functions require arrays of an exact length. Use the new at_least keyword to instruct gcc and clang to statically check that the caller is passing an object of at least that length. 
Here it is in action, with this faulty patch to wireguard's cookie.h: struct cookie_checker { u8 secret[NOISE_HASH_LEN]; - u8 cookie_encryption_key[NOISE_SYMMETRIC_KEY_LEN]; + u8 cookie_encryption_key[NOISE_SYMMETRIC_KEY_LEN - 1]; u8 message_mac1_key[NOISE_SYMMETRIC_KEY_LEN]; If I try compiling this code, I get this helpful warning: CC drivers/net/wireguard/cookie.o drivers/net/wireguard/cookie.c: In function ‘wg_cookie_message_create’: drivers/net/wireguard/cookie.c:193:9: warning: ‘xchacha20poly1305_encrypt’ reading 32 bytes from a region of size 31 [-Wstringop-overread] 193 | xchacha20poly1305_encrypt(dst->encrypted_cookie, cookie, COOKIE_LEN, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 194 | macs->mac1, COOKIE_LEN, dst->nonce, | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 195 | checker->cookie_encryption_key); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/net/wireguard/cookie.c:193:9: note: referencing argument 7 of type ‘const u8 *’ {aka ‘const unsigned char *’} In file included from drivers/net/wireguard/messages.h:10, from drivers/net/wireguard/cookie.h:9, from drivers/net/wireguard/cookie.c:6: include/crypto/chacha20poly1305.h:28:6: note: in a call to function ‘xchacha20poly1305_encrypt’ 28 | void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, Acked-by: Ard Biesheuvel Signed-off-by: "Jason A. 
Donenfeld" Link: https://lore.kernel.org/r/20251123054819.2371989-4-Jason@zx2c4.com Signed-off-by: Eric Biggers --- include/crypto/chacha20poly1305.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/crypto/chacha20poly1305.h b/include/crypto/chacha20poly1305.h index d2ac3ff7dc1e..0f71b037702d 100644 --- a/include/crypto/chacha20poly1305.h +++ b/include/crypto/chacha20poly1305.h @@ -18,32 +18,33 @@ enum chacha20poly1305_lengths { void chacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool __must_check chacha20poly1305_decrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); void xchacha20poly1305_encrypt(u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, const size_t ad_len, - const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool __must_check xchacha20poly1305_decrypt( - u8 *dst, const u8 *src, const size_t src_len, const u8 *ad, - const size_t ad_len, const u8 nonce[XCHACHA20POLY1305_NONCE_SIZE], - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + u8 *dst, const u8 *src, const size_t src_len, + const u8 *ad, const size_t ad_len, + const u8 nonce[at_least XCHACHA20POLY1305_NONCE_SIZE], + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool chacha20poly1305_encrypt_sg_inplace(struct scatterlist *src, size_t src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool chacha20poly1305_decrypt_sg_inplace(struct scatterlist *src, size_t 
src_len, const u8 *ad, const size_t ad_len, const u64 nonce, - const u8 key[CHACHA20POLY1305_KEY_SIZE]); + const u8 key[at_least CHACHA20POLY1305_KEY_SIZE]); bool chacha20poly1305_selftest(void); -- cgit v1.2.3 From 1b31b43bf5c2b7ae8b0f9acac036354ea28b0397 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 22 Nov 2025 11:42:01 -0800 Subject: lib/crypto: chacha: Add at_least decoration to fixed-size array params Add the at_least (i.e. 'static') decoration to the fixed-size array parameters of the chacha library functions. This causes clang to warn when a too-small array of known size is passed. Acked-by: Ard Biesheuvel Acked-by: "Jason A. Donenfeld" Link: https://lore.kernel.org/r/20251122194206.31822-2-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/chacha.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/crypto/chacha.h b/include/crypto/chacha.h index 38e26dff27b0..1cc301a48469 100644 --- a/include/crypto/chacha.h +++ b/include/crypto/chacha.h @@ -38,18 +38,18 @@ struct chacha_state { }; void chacha_block_generic(struct chacha_state *state, - u8 out[CHACHA_BLOCK_SIZE], int nrounds); + u8 out[at_least CHACHA_BLOCK_SIZE], int nrounds); static inline void chacha20_block(struct chacha_state *state, - u8 out[CHACHA_BLOCK_SIZE]) + u8 out[at_least CHACHA_BLOCK_SIZE]) { chacha_block_generic(state, out, 20); } void hchacha_block_generic(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); + u32 out[at_least HCHACHA_OUT_WORDS], int nrounds); void hchacha_block(const struct chacha_state *state, - u32 out[HCHACHA_OUT_WORDS], int nrounds); + u32 out[at_least HCHACHA_OUT_WORDS], int nrounds); enum chacha_constants { /* expand 32-byte k */ CHACHA_CONSTANT_EXPA = 0x61707865U, @@ -67,8 +67,8 @@ static inline void chacha_init_consts(struct chacha_state *state) } static inline void chacha_init(struct chacha_state *state, - const u32 key[CHACHA_KEY_WORDS], - const u8 
iv[CHACHA_IV_SIZE]) + const u32 key[at_least CHACHA_KEY_WORDS], + const u8 iv[at_least CHACHA_IV_SIZE]) { chacha_init_consts(state); state->x[4] = key[0]; -- cgit v1.2.3 From 2143d622cdf3bf93e61f2e0a728487bc871785e5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 22 Nov 2025 11:42:02 -0800 Subject: lib/crypto: curve25519: Add at_least decoration to fixed-size array params Add the at_least (i.e. 'static') decoration to the fixed-size array parameters of the curve25519 library functions. This causes clang to warn when a too-small array of known size is passed. Acked-by: Ard Biesheuvel Acked-by: "Jason A. Donenfeld" Link: https://lore.kernel.org/r/20251122194206.31822-3-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/curve25519.h | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/crypto/curve25519.h b/include/crypto/curve25519.h index db63a5577c00..2362b48f8741 100644 --- a/include/crypto/curve25519.h +++ b/include/crypto/curve25519.h @@ -13,24 +13,28 @@ enum curve25519_lengths { CURVE25519_KEY_SIZE = 32 }; -void curve25519_generic(u8 out[CURVE25519_KEY_SIZE], - const u8 scalar[CURVE25519_KEY_SIZE], - const u8 point[CURVE25519_KEY_SIZE]); +void curve25519_generic(u8 out[at_least CURVE25519_KEY_SIZE], + const u8 scalar[at_least CURVE25519_KEY_SIZE], + const u8 point[at_least CURVE25519_KEY_SIZE]); -bool __must_check curve25519(u8 mypublic[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE], - const u8 basepoint[CURVE25519_KEY_SIZE]); +bool __must_check +curve25519(u8 mypublic[at_least CURVE25519_KEY_SIZE], + const u8 secret[at_least CURVE25519_KEY_SIZE], + const u8 basepoint[at_least CURVE25519_KEY_SIZE]); -bool __must_check curve25519_generate_public(u8 pub[CURVE25519_KEY_SIZE], - const u8 secret[CURVE25519_KEY_SIZE]); +bool __must_check +curve25519_generate_public(u8 pub[at_least CURVE25519_KEY_SIZE], + const u8 secret[at_least CURVE25519_KEY_SIZE]); -static 
inline void curve25519_clamp_secret(u8 secret[CURVE25519_KEY_SIZE]) +static inline void +curve25519_clamp_secret(u8 secret[at_least CURVE25519_KEY_SIZE]) { secret[0] &= 248; secret[31] = (secret[31] & 127) | 64; } -static inline void curve25519_generate_secret(u8 secret[CURVE25519_KEY_SIZE]) +static inline void +curve25519_generate_secret(u8 secret[at_least CURVE25519_KEY_SIZE]) { get_random_bytes_wait(secret, CURVE25519_KEY_SIZE); curve25519_clamp_secret(secret); -- cgit v1.2.3 From 580f1d31dff62b0f0034304bd75f169b8fec6f36 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 22 Nov 2025 11:42:03 -0800 Subject: lib/crypto: md5: Add at_least decoration to fixed-size array params Add the at_least (i.e. 'static') decoration to the fixed-size array parameters of the md5 library functions. This causes clang to warn when a too-small array of known size is passed. Acked-by: Ard Biesheuvel Acked-by: "Jason A. Donenfeld" Link: https://lore.kernel.org/r/20251122194206.31822-4-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/md5.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/crypto/md5.h b/include/crypto/md5.h index c9aa5c3abc53..c47aedfe67ec 100644 --- a/include/crypto/md5.h +++ b/include/crypto/md5.h @@ -76,7 +76,7 @@ void md5_update(struct md5_ctx *ctx, const u8 *data, size_t len); * * Context: Any context. */ -void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); +void md5_final(struct md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]); /** * md5() - Compute MD5 message digest in one shot @@ -86,7 +86,7 @@ void md5_final(struct md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); * * Context: Any context. 
*/ -void md5(const u8 *data, size_t len, u8 out[MD5_DIGEST_SIZE]); +void md5(const u8 *data, size_t len, u8 out[at_least MD5_DIGEST_SIZE]); /** * struct hmac_md5_key - Prepared key for HMAC-MD5 @@ -173,7 +173,7 @@ static inline void hmac_md5_update(struct hmac_md5_ctx *ctx, * * Context: Any context. */ -void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); +void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[at_least MD5_DIGEST_SIZE]); /** * hmac_md5() - Compute HMAC-MD5 in one shot, using a prepared key @@ -187,7 +187,8 @@ void hmac_md5_final(struct hmac_md5_ctx *ctx, u8 out[MD5_DIGEST_SIZE]); * Context: Any context. */ void hmac_md5(const struct hmac_md5_key *key, - const u8 *data, size_t data_len, u8 out[MD5_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least MD5_DIGEST_SIZE]); /** * hmac_md5_usingrawkey() - Compute HMAC-MD5 in one shot, using a raw key @@ -204,6 +205,6 @@ void hmac_md5(const struct hmac_md5_key *key, */ void hmac_md5_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[MD5_DIGEST_SIZE]); + u8 out[at_least MD5_DIGEST_SIZE]); #endif /* _CRYPTO_MD5_H */ -- cgit v1.2.3 From c2099fa61664e8fe8844cccdb7d1d18a5f0f94d1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 22 Nov 2025 11:42:04 -0800 Subject: lib/crypto: poly1305: Add at_least decoration to fixed-size array params Add the at_least (i.e. 'static') decoration to the fixed-size array parameters of the poly1305 library functions. This causes clang to warn when a too-small array of known size is passed. Acked-by: Ard Biesheuvel Acked-by: "Jason A. 
Donenfeld" Link: https://lore.kernel.org/r/20251122194206.31822-5-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/poly1305.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/crypto/poly1305.h b/include/crypto/poly1305.h index d4daeec8da19..190beb427c6d 100644 --- a/include/crypto/poly1305.h +++ b/include/crypto/poly1305.h @@ -59,7 +59,7 @@ struct poly1305_desc_ctx { }; void poly1305_init(struct poly1305_desc_ctx *desc, - const u8 key[POLY1305_KEY_SIZE]); + const u8 key[at_least POLY1305_KEY_SIZE]); void poly1305_update(struct poly1305_desc_ctx *desc, const u8 *src, unsigned int nbytes); void poly1305_final(struct poly1305_desc_ctx *desc, u8 *digest); -- cgit v1.2.3 From d5cc4e731de7edb1a2b7940d0f0badf9956dddb7 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 22 Nov 2025 11:42:05 -0800 Subject: lib/crypto: sha1: Add at_least decoration to fixed-size array params Add the at_least (i.e. 'static') decoration to the fixed-size array parameters of the sha1 library functions. This causes clang to warn when a too-small array of known size is passed. Acked-by: Ard Biesheuvel Acked-by: "Jason A. Donenfeld" Link: https://lore.kernel.org/r/20251122194206.31822-6-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha1.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h index 162a529ec841..27f08b972931 100644 --- a/include/crypto/sha1.h +++ b/include/crypto/sha1.h @@ -84,7 +84,7 @@ void sha1_update(struct sha1_ctx *ctx, const u8 *data, size_t len); * * Context: Any context. */ -void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); +void sha1_final(struct sha1_ctx *ctx, u8 out[at_least SHA1_DIGEST_SIZE]); /** * sha1() - Compute SHA-1 message digest in one shot @@ -94,7 +94,7 @@ void sha1_final(struct sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); * * Context: Any context. 
*/ -void sha1(const u8 *data, size_t len, u8 out[SHA1_DIGEST_SIZE]); +void sha1(const u8 *data, size_t len, u8 out[at_least SHA1_DIGEST_SIZE]); /** * struct hmac_sha1_key - Prepared key for HMAC-SHA1 @@ -181,7 +181,8 @@ static inline void hmac_sha1_update(struct hmac_sha1_ctx *ctx, * * Context: Any context. */ -void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); +void hmac_sha1_final(struct hmac_sha1_ctx *ctx, + u8 out[at_least SHA1_DIGEST_SIZE]); /** * hmac_sha1() - Compute HMAC-SHA1 in one shot, using a prepared key @@ -195,7 +196,8 @@ void hmac_sha1_final(struct hmac_sha1_ctx *ctx, u8 out[SHA1_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha1(const struct hmac_sha1_key *key, - const u8 *data, size_t data_len, u8 out[SHA1_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA1_DIGEST_SIZE]); /** * hmac_sha1_usingrawkey() - Compute HMAC-SHA1 in one shot, using a raw key @@ -212,6 +214,6 @@ void hmac_sha1(const struct hmac_sha1_key *key, */ void hmac_sha1_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA1_DIGEST_SIZE]); + u8 out[at_least SHA1_DIGEST_SIZE]); #endif /* _CRYPTO_SHA1_H */ -- cgit v1.2.3 From 4f0382b0901b43552b600f8e5f806295778b0fb0 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 22 Nov 2025 11:42:06 -0800 Subject: lib/crypto: sha2: Add at_least decoration to fixed-size array params Add the at_least (i.e. 'static') decoration to the fixed-size array parameters of the sha2 library functions. This causes clang to warn when a too-small array of known size is passed. Acked-by: Ard Biesheuvel Acked-by: "Jason A. 
Donenfeld" Link: https://lore.kernel.org/r/20251122194206.31822-7-ebiggers@kernel.org Signed-off-by: Eric Biggers --- include/crypto/sha2.h | 53 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h index e5dafb935cc8..7bb8fe169daf 100644 --- a/include/crypto/sha2.h +++ b/include/crypto/sha2.h @@ -190,7 +190,7 @@ static inline void sha224_update(struct sha224_ctx *ctx, * * Context: Any context. */ -void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); +void sha224_final(struct sha224_ctx *ctx, u8 out[at_least SHA224_DIGEST_SIZE]); /** * sha224() - Compute SHA-224 message digest in one shot @@ -200,7 +200,7 @@ void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); * * Context: Any context. */ -void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]); +void sha224(const u8 *data, size_t len, u8 out[at_least SHA224_DIGEST_SIZE]); /** * struct hmac_sha224_key - Prepared key for HMAC-SHA224 @@ -287,7 +287,8 @@ static inline void hmac_sha224_update(struct hmac_sha224_ctx *ctx, * * Context: Any context. */ -void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); +void hmac_sha224_final(struct hmac_sha224_ctx *ctx, + u8 out[at_least SHA224_DIGEST_SIZE]); /** * hmac_sha224() - Compute HMAC-SHA224 in one shot, using a prepared key @@ -301,7 +302,8 @@ void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]); * Context: Any context. 
*/ void hmac_sha224(const struct hmac_sha224_key *key, - const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA224_DIGEST_SIZE]); /** * hmac_sha224_usingrawkey() - Compute HMAC-SHA224 in one shot, using a raw key @@ -318,7 +320,7 @@ void hmac_sha224(const struct hmac_sha224_key *key, */ void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA224_DIGEST_SIZE]); + u8 out[at_least SHA224_DIGEST_SIZE]); /** * struct sha256_ctx - Context for hashing a message with SHA-256 @@ -363,7 +365,7 @@ static inline void sha256_update(struct sha256_ctx *ctx, * * Context: Any context. */ -void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); +void sha256_final(struct sha256_ctx *ctx, u8 out[at_least SHA256_DIGEST_SIZE]); /** * sha256() - Compute SHA-256 message digest in one shot @@ -373,7 +375,7 @@ void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); * * Context: Any context. */ -void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); +void sha256(const u8 *data, size_t len, u8 out[at_least SHA256_DIGEST_SIZE]); /** * sha256_finup_2x() - Compute two SHA-256 digests from a common initial @@ -390,8 +392,9 @@ void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]); * Context: Any context. */ void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, - const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], - u8 out2[SHA256_DIGEST_SIZE]); + const u8 *data2, size_t len, + u8 out1[at_least SHA256_DIGEST_SIZE], + u8 out2[at_least SHA256_DIGEST_SIZE]); /** * sha256_finup_2x_is_optimized() - Check if sha256_finup_2x() is using a real @@ -488,7 +491,8 @@ static inline void hmac_sha256_update(struct hmac_sha256_ctx *ctx, * * Context: Any context. 
*/ -void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); +void hmac_sha256_final(struct hmac_sha256_ctx *ctx, + u8 out[at_least SHA256_DIGEST_SIZE]); /** * hmac_sha256() - Compute HMAC-SHA256 in one shot, using a prepared key @@ -502,7 +506,8 @@ void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha256(const struct hmac_sha256_key *key, - const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA256_DIGEST_SIZE]); /** * hmac_sha256_usingrawkey() - Compute HMAC-SHA256 in one shot, using a raw key @@ -519,7 +524,7 @@ void hmac_sha256(const struct hmac_sha256_key *key, */ void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA256_DIGEST_SIZE]); + u8 out[at_least SHA256_DIGEST_SIZE]); /* State for the SHA-512 (and SHA-384) compression function */ struct sha512_block_state { @@ -598,7 +603,7 @@ static inline void sha384_update(struct sha384_ctx *ctx, * * Context: Any context. */ -void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); +void sha384_final(struct sha384_ctx *ctx, u8 out[at_least SHA384_DIGEST_SIZE]); /** * sha384() - Compute SHA-384 message digest in one shot @@ -608,7 +613,7 @@ void sha384_final(struct sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); * * Context: Any context. */ -void sha384(const u8 *data, size_t len, u8 out[SHA384_DIGEST_SIZE]); +void sha384(const u8 *data, size_t len, u8 out[at_least SHA384_DIGEST_SIZE]); /** * struct hmac_sha384_key - Prepared key for HMAC-SHA384 @@ -695,7 +700,8 @@ static inline void hmac_sha384_update(struct hmac_sha384_ctx *ctx, * * Context: Any context. 
*/ -void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); +void hmac_sha384_final(struct hmac_sha384_ctx *ctx, + u8 out[at_least SHA384_DIGEST_SIZE]); /** * hmac_sha384() - Compute HMAC-SHA384 in one shot, using a prepared key @@ -709,7 +715,8 @@ void hmac_sha384_final(struct hmac_sha384_ctx *ctx, u8 out[SHA384_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha384(const struct hmac_sha384_key *key, - const u8 *data, size_t data_len, u8 out[SHA384_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA384_DIGEST_SIZE]); /** * hmac_sha384_usingrawkey() - Compute HMAC-SHA384 in one shot, using a raw key @@ -726,7 +733,7 @@ void hmac_sha384(const struct hmac_sha384_key *key, */ void hmac_sha384_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA384_DIGEST_SIZE]); + u8 out[at_least SHA384_DIGEST_SIZE]); /** * struct sha512_ctx - Context for hashing a message with SHA-512 @@ -771,7 +778,7 @@ static inline void sha512_update(struct sha512_ctx *ctx, * * Context: Any context. */ -void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); +void sha512_final(struct sha512_ctx *ctx, u8 out[at_least SHA512_DIGEST_SIZE]); /** * sha512() - Compute SHA-512 message digest in one shot @@ -781,7 +788,7 @@ void sha512_final(struct sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); * * Context: Any context. */ -void sha512(const u8 *data, size_t len, u8 out[SHA512_DIGEST_SIZE]); +void sha512(const u8 *data, size_t len, u8 out[at_least SHA512_DIGEST_SIZE]); /** * struct hmac_sha512_key - Prepared key for HMAC-SHA512 @@ -868,7 +875,8 @@ static inline void hmac_sha512_update(struct hmac_sha512_ctx *ctx, * * Context: Any context. 
*/ -void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); +void hmac_sha512_final(struct hmac_sha512_ctx *ctx, + u8 out[at_least SHA512_DIGEST_SIZE]); /** * hmac_sha512() - Compute HMAC-SHA512 in one shot, using a prepared key @@ -882,7 +890,8 @@ void hmac_sha512_final(struct hmac_sha512_ctx *ctx, u8 out[SHA512_DIGEST_SIZE]); * Context: Any context. */ void hmac_sha512(const struct hmac_sha512_key *key, - const u8 *data, size_t data_len, u8 out[SHA512_DIGEST_SIZE]); + const u8 *data, size_t data_len, + u8 out[at_least SHA512_DIGEST_SIZE]); /** * hmac_sha512_usingrawkey() - Compute HMAC-SHA512 in one shot, using a raw key @@ -899,6 +908,6 @@ void hmac_sha512(const struct hmac_sha512_key *key, */ void hmac_sha512_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, - u8 out[SHA512_DIGEST_SIZE]); + u8 out[at_least SHA512_DIGEST_SIZE]); #endif /* _CRYPTO_SHA2_H */ -- cgit v1.2.3 From 441244d4273a8037b265fd254dfdaca5fa736ee2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 4 Nov 2025 17:29:25 -0500 Subject: SUNRPC: cleanup common code in backchannel request Create a helper function for common code between rdma and tcp backchannel handling of the backchannel request. Make sure that access is protected by the bc_pa_lock lock. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/bc_xprt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index f22bf915dcf6..178f34ad8db6 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -25,6 +25,7 @@ void xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task, void xprt_free_bc_request(struct rpc_rqst *req); int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); void xprt_destroy_backchannel(struct rpc_xprt *, unsigned int max_reqs); +void xprt_enqueue_bc_request(struct rpc_rqst *req); /* Socket backchannel transport methods */ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs); -- cgit v1.2.3 From 6f8b26c90a4d645fd5c944c41a6f0fd61ec27c50 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 4 Nov 2025 17:29:26 -0500 Subject: SUNRPC: new helper function for stopping backchannel server Create a new backchannel function to stop the backchannel server and clear the bc_serv in transport protected under the bc_pa_lock. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/bc_xprt.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h index 178f34ad8db6..98939cb664cf 100644 --- a/include/linux/sunrpc/bc_xprt.h +++ b/include/linux/sunrpc/bc_xprt.h @@ -32,6 +32,7 @@ int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs); void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs); void xprt_free_bc_rqst(struct rpc_rqst *req); unsigned int xprt_bc_max_slots(struct rpc_xprt *xprt); +void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv); /* * Determine if a shared backchannel is in use @@ -69,5 +70,10 @@ static inline void set_bc_enabled(struct svc_serv *serv) static inline void xprt_free_bc_request(struct rpc_rqst *req) { } + +static inline void xprt_svc_destroy_nullify_bc(struct rpc_xprt *xprt, struct svc_serv **serv) +{ + svc_destroy(serv); +} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ #endif /* _LINUX_SUNRPC_BC_XPRT_H */ -- cgit v1.2.3 From 130ae65c01862e1ed30ef5ff2258990d7628f360 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 4 Nov 2025 10:06:41 -0500 Subject: NFS: Add support for sending GDD_GETATTR I add this to the existing GETATTR compound as an optional extra step that we can send if the "dir_deleg" flag is set to 'true'. Actually enabling this value will happen in a later patch.
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 31463286402f..8bf6cba96c46 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1092,12 +1092,19 @@ struct nfs4_getattr_arg { struct nfs4_sequence_args seq_args; const struct nfs_fh * fh; const u32 * bitmask; + bool get_dir_deleg; +}; + +struct nfs4_gdd_res { + u32 status; + nfs4_stateid deleg; }; struct nfs4_getattr_res { struct nfs4_sequence_res seq_res; const struct nfs_server * server; struct nfs_fattr * fattr; + struct nfs4_gdd_res * gdd_res; }; struct nfs4_link_arg { -- cgit v1.2.3 From 156b0948293362b036caf49e6e4d97cae30201de Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 4 Nov 2025 10:06:42 -0500 Subject: NFS: Request a directory delegation on ACCESS, CREATE, and UNLINK This patch adds a new flag: NFS_INO_REQ_DIR_DELEG to signal that a directory wants to request a directory delegation the next time it does a GETATTR. I have the client request a directory delegation when doing an access, create, or unlink call since these calls indicate that a user is working with a directory. 
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 + include/linux/nfs_fs_sb.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c585939b6cd6..a6624edb7226 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -344,6 +344,7 @@ struct nfs4_copy_state { #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ #define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ +#define NFS_INO_REQ_DIR_DELEG (13) /* Request a directory delegation */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index d30c0245031c..4ba04de6b1ca 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -305,6 +305,7 @@ struct nfs_server { #define NFS_CAP_REBOOT_LAYOUTRETURN (1U << 8) #define NFS_CAP_OFFLOAD_STATUS (1U << 9) #define NFS_CAP_ZERO_RANGE (1U << 10) +#define NFS_CAP_DIR_DELEG (1U << 11) #define NFS_CAP_OPEN_XOR (1U << 12) #define NFS_CAP_DELEGTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) -- cgit v1.2.3 From 2da211670782637fd2d4fbba06f91d1e7c70dc0c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 4 Nov 2025 10:06:43 -0500 Subject: NFS: Request a directory delegation during RENAME If we notice that we're renaming a file within a directory then we take that as a sign that the user is working with the current directory and may want a delegation to avoid extra revalidations when possible. The nfs_request_directory_delegation() function exists within the NFS v4 module, so I add an extra flag to rename_setup() to indicate if a dentry is being renamed within the same parent directory. 
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 8bf6cba96c46..79fe2dfb470f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1808,7 +1808,8 @@ struct nfs_rpc_ops { int (*unlink_done) (struct rpc_task *, struct inode *); void (*rename_setup) (struct rpc_message *msg, struct dentry *old_dentry, - struct dentry *new_dentry); + struct dentry *new_dentry, + struct inode *same_parent); void (*rename_rpc_prepare)(struct rpc_task *task, struct nfs_renamedata *); int (*rename_done) (struct rpc_task *task, struct inode *old_dir, struct inode *new_dir); int (*link) (struct inode *, struct inode *, const struct qstr *); -- cgit v1.2.3 From 84898f8e9cea06f8178fc5ca53f068180f7bfba0 Mon Sep 17 00:00:00 2001 From: Finley Xiao Date: Fri, 21 Nov 2025 15:53:49 +0800 Subject: dt-bindings: clock: rockchip: Add RK3506 clock and reset unit Add device tree bindings for clock and reset unit on RK3506 SoC. Add clock and reset IDs for RK3506 SoC. 
Signed-off-by: Finley Xiao Signed-off-by: Elaine Zhang Reviewed-by: Conor Dooley Link: https://patch.msgid.link/20251121075350.2564860-2-zhangqing@rock-chips.com Signed-off-by: Heiko Stuebner --- include/dt-bindings/clock/rockchip,rk3506-cru.h | 285 ++++++++++++++++++++++++ include/dt-bindings/reset/rockchip,rk3506-cru.h | 211 ++++++++++++++++++ 2 files changed, 496 insertions(+) create mode 100644 include/dt-bindings/clock/rockchip,rk3506-cru.h create mode 100644 include/dt-bindings/reset/rockchip,rk3506-cru.h (limited to 'include') diff --git a/include/dt-bindings/clock/rockchip,rk3506-cru.h b/include/dt-bindings/clock/rockchip,rk3506-cru.h new file mode 100644 index 000000000000..71d7dda23cc9 --- /dev/null +++ b/include/dt-bindings/clock/rockchip,rk3506-cru.h @@ -0,0 +1,285 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2023-2025 Rockchip Electronics Co., Ltd. + * Author: Finley Xiao + */ + +#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RK3506_H +#define _DT_BINDINGS_CLK_ROCKCHIP_RK3506_H + +/* cru plls */ +#define PLL_GPLL 0 +#define PLL_V0PLL 1 +#define PLL_V1PLL 2 + +/* cru-clocks indices */ +#define ARMCLK 3 +#define CLK_DDR 4 +#define XIN24M_GATE 5 +#define CLK_GPLL_GATE 6 +#define CLK_V0PLL_GATE 7 +#define CLK_V1PLL_GATE 8 +#define CLK_GPLL_DIV 9 +#define CLK_GPLL_DIV_100M 10 +#define CLK_V0PLL_DIV 11 +#define CLK_V1PLL_DIV 12 +#define CLK_INT_VOICE_MATRIX0 13 +#define CLK_INT_VOICE_MATRIX1 14 +#define CLK_INT_VOICE_MATRIX2 15 +#define CLK_FRAC_UART_MATRIX0_MUX 16 +#define CLK_FRAC_UART_MATRIX1_MUX 17 +#define CLK_FRAC_VOICE_MATRIX0_MUX 18 +#define CLK_FRAC_VOICE_MATRIX1_MUX 19 +#define CLK_FRAC_COMMON_MATRIX0_MUX 20 +#define CLK_FRAC_COMMON_MATRIX1_MUX 21 +#define CLK_FRAC_COMMON_MATRIX2_MUX 22 +#define CLK_FRAC_UART_MATRIX0 23 +#define CLK_FRAC_UART_MATRIX1 24 +#define CLK_FRAC_VOICE_MATRIX0 25 +#define CLK_FRAC_VOICE_MATRIX1 26 +#define CLK_FRAC_COMMON_MATRIX0 27 +#define CLK_FRAC_COMMON_MATRIX1 28 +#define 
CLK_FRAC_COMMON_MATRIX2 29 +#define CLK_REF_USBPHY_TOP 30 +#define CLK_REF_DPHY_TOP 31 +#define ACLK_CORE_ROOT 32 +#define PCLK_CORE_ROOT 33 +#define PCLK_DBG 34 +#define PCLK_CORE_GRF 35 +#define PCLK_CORE_CRU 36 +#define CLK_CORE_EMA_DETECT 37 +#define CLK_REF_PVTPLL_CORE 38 +#define PCLK_GPIO1 39 +#define DBCLK_GPIO1 40 +#define ACLK_CORE_PERI_ROOT 41 +#define HCLK_CORE_PERI_ROOT 42 +#define PCLK_CORE_PERI_ROOT 43 +#define CLK_DSMC 44 +#define ACLK_DSMC 45 +#define PCLK_DSMC 46 +#define CLK_FLEXBUS_TX 47 +#define CLK_FLEXBUS_RX 48 +#define ACLK_FLEXBUS 49 +#define HCLK_FLEXBUS 50 +#define ACLK_DSMC_SLV 51 +#define HCLK_DSMC_SLV 52 +#define ACLK_BUS_ROOT 53 +#define HCLK_BUS_ROOT 54 +#define PCLK_BUS_ROOT 55 +#define ACLK_SYSRAM 56 +#define HCLK_SYSRAM 57 +#define ACLK_DMAC0 58 +#define ACLK_DMAC1 59 +#define HCLK_M0 60 +#define PCLK_BUS_GRF 61 +#define PCLK_TIMER 62 +#define CLK_TIMER0_CH0 63 +#define CLK_TIMER0_CH1 64 +#define CLK_TIMER0_CH2 65 +#define CLK_TIMER0_CH3 66 +#define CLK_TIMER0_CH4 67 +#define CLK_TIMER0_CH5 68 +#define PCLK_WDT0 69 +#define TCLK_WDT0 70 +#define PCLK_WDT1 71 +#define TCLK_WDT1 72 +#define PCLK_MAILBOX 73 +#define PCLK_INTMUX 74 +#define PCLK_SPINLOCK 75 +#define PCLK_DDRC 76 +#define HCLK_DDRPHY 77 +#define PCLK_DDRMON 78 +#define CLK_DDRMON_OSC 79 +#define PCLK_STDBY 80 +#define HCLK_USBOTG0 81 +#define HCLK_USBOTG0_PMU 82 +#define CLK_USBOTG0_ADP 83 +#define HCLK_USBOTG1 84 +#define HCLK_USBOTG1_PMU 85 +#define CLK_USBOTG1_ADP 86 +#define PCLK_USBPHY 87 +#define ACLK_DMA2DDR 88 +#define PCLK_DMA2DDR 89 +#define STCLK_M0 90 +#define CLK_DDRPHY 91 +#define CLK_DDRC_SRC 92 +#define ACLK_DDRC_0 93 +#define ACLK_DDRC_1 94 +#define CLK_DDRC 95 +#define CLK_DDRMON 96 +#define HCLK_LSPERI_ROOT 97 +#define PCLK_LSPERI_ROOT 98 +#define PCLK_UART0 99 +#define PCLK_UART1 100 +#define PCLK_UART2 101 +#define PCLK_UART3 102 +#define PCLK_UART4 103 +#define SCLK_UART0 104 +#define SCLK_UART1 105 +#define SCLK_UART2 106 +#define SCLK_UART3 107 
+#define SCLK_UART4 108 +#define PCLK_I2C0 109 +#define CLK_I2C0 110 +#define PCLK_I2C1 111 +#define CLK_I2C1 112 +#define PCLK_I2C2 113 +#define CLK_I2C2 114 +#define PCLK_PWM1 115 +#define CLK_PWM1 116 +#define CLK_OSC_PWM1 117 +#define CLK_RC_PWM1 118 +#define CLK_FREQ_PWM1 119 +#define CLK_COUNTER_PWM1 120 +#define PCLK_SPI0 121 +#define CLK_SPI0 122 +#define PCLK_SPI1 123 +#define CLK_SPI1 124 +#define PCLK_GPIO2 125 +#define DBCLK_GPIO2 126 +#define PCLK_GPIO3 127 +#define DBCLK_GPIO3 128 +#define PCLK_GPIO4 129 +#define DBCLK_GPIO4 130 +#define HCLK_CAN0 131 +#define CLK_CAN0 132 +#define HCLK_CAN1 133 +#define CLK_CAN1 134 +#define HCLK_PDM 135 +#define MCLK_PDM 136 +#define CLKOUT_PDM 137 +#define MCLK_SPDIFTX 138 +#define HCLK_SPDIFTX 139 +#define HCLK_SPDIFRX 140 +#define MCLK_SPDIFRX 141 +#define MCLK_SAI0 142 +#define HCLK_SAI0 143 +#define MCLK_OUT_SAI0 144 +#define MCLK_SAI1 145 +#define HCLK_SAI1 146 +#define MCLK_OUT_SAI1 147 +#define HCLK_ASRC0 148 +#define CLK_ASRC0 149 +#define HCLK_ASRC1 150 +#define CLK_ASRC1 151 +#define PCLK_CRU 152 +#define PCLK_PMU_ROOT 153 +#define MCLK_ASRC0 154 +#define MCLK_ASRC1 155 +#define MCLK_ASRC2 156 +#define MCLK_ASRC3 157 +#define LRCK_ASRC0_SRC 158 +#define LRCK_ASRC0_DST 159 +#define LRCK_ASRC1_SRC 160 +#define LRCK_ASRC1_DST 161 +#define ACLK_HSPERI_ROOT 162 +#define HCLK_HSPERI_ROOT 163 +#define PCLK_HSPERI_ROOT 164 +#define CCLK_SRC_SDMMC 165 +#define HCLK_SDMMC 166 +#define HCLK_FSPI 167 +#define SCLK_FSPI 168 +#define PCLK_SPI2 169 +#define ACLK_MAC0 170 +#define ACLK_MAC1 171 +#define PCLK_MAC0 172 +#define PCLK_MAC1 173 +#define CLK_MAC_ROOT 174 +#define CLK_MAC0 175 +#define CLK_MAC1 176 +#define MCLK_SAI2 177 +#define HCLK_SAI2 178 +#define MCLK_OUT_SAI2 179 +#define MCLK_SAI3_SRC 180 +#define HCLK_SAI3 181 +#define MCLK_SAI3 182 +#define MCLK_OUT_SAI3 183 +#define MCLK_SAI4_SRC 184 +#define HCLK_SAI4 185 +#define MCLK_SAI4 186 +#define HCLK_DSM 187 +#define MCLK_DSM 188 +#define PCLK_AUDIO_ADC 189 
+#define MCLK_AUDIO_ADC 190 +#define MCLK_AUDIO_ADC_DIV4 191 +#define PCLK_SARADC 192 +#define CLK_SARADC 193 +#define PCLK_OTPC_NS 194 +#define CLK_SBPI_OTPC_NS 195 +#define CLK_USER_OTPC_NS 196 +#define PCLK_UART5 197 +#define SCLK_UART5 198 +#define PCLK_GPIO234_IOC 199 +#define CLK_MAC_PTP_ROOT 200 +#define CLK_MAC0_PTP 201 +#define CLK_MAC1_PTP 202 +#define CLK_SPI2 203 +#define ACLK_VIO_ROOT 204 +#define HCLK_VIO_ROOT 205 +#define PCLK_VIO_ROOT 206 +#define HCLK_RGA 207 +#define ACLK_RGA 208 +#define CLK_CORE_RGA 209 +#define ACLK_VOP 210 +#define HCLK_VOP 211 +#define DCLK_VOP 212 +#define PCLK_DPHY 213 +#define PCLK_DSI_HOST 214 +#define PCLK_TSADC 215 +#define CLK_TSADC 216 +#define CLK_TSADC_TSEN 217 +#define PCLK_GPIO1_IOC 218 +#define PCLK_OTPC_S 219 +#define CLK_SBPI_OTPC_S 220 +#define CLK_USER_OTPC_S 221 +#define PCLK_OTP_MASK 222 +#define PCLK_KEYREADER 223 +#define HCLK_BOOTROM 224 +#define PCLK_DDR_SERVICE 225 +#define HCLK_CRYPTO_S 226 +#define HCLK_KEYLAD 227 +#define CLK_CORE_CRYPTO 228 +#define CLK_PKA_CRYPTO 229 +#define CLK_CORE_CRYPTO_S 230 +#define CLK_PKA_CRYPTO_S 231 +#define ACLK_CRYPTO_S 232 +#define HCLK_RNG_S 233 +#define CLK_CORE_CRYPTO_NS 234 +#define CLK_PKA_CRYPTO_NS 235 +#define ACLK_CRYPTO_NS 236 +#define HCLK_CRYPTO_NS 237 +#define HCLK_RNG 238 +#define CLK_PMU 239 +#define PCLK_PMU 240 +#define CLK_PMU_32K 241 +#define PCLK_PMU_CRU 242 +#define PCLK_PMU_GRF 243 +#define PCLK_GPIO0_IOC 244 +#define PCLK_GPIO0 245 +#define DBCLK_GPIO0 246 +#define PCLK_GPIO1_SHADOW 247 +#define DBCLK_GPIO1_SHADOW 248 +#define PCLK_PMU_HP_TIMER 249 +#define CLK_PMU_HP_TIMER 250 +#define CLK_PMU_HP_TIMER_32K 251 +#define PCLK_PWM0 252 +#define CLK_PWM0 253 +#define CLK_OSC_PWM0 254 +#define CLK_RC_PWM0 255 +#define CLK_MAC_OUT 256 +#define CLK_REF_OUT0 257 +#define CLK_REF_OUT1 258 +#define CLK_32K_FRAC 259 +#define CLK_32K_RC 260 +#define CLK_32K 261 +#define CLK_32K_PMU 262 +#define PCLK_TOUCH_KEY 263 +#define CLK_TOUCH_KEY 264 +#define 
CLK_REF_PHY_PLL 265 +#define CLK_REF_PHY_PMU_MUX 266 +#define CLK_WIFI_OUT 267 +#define CLK_V0PLL_REF 268 +#define CLK_V1PLL_REF 269 +#define CLK_32K_FRAC_MUX 270 + +#endif diff --git a/include/dt-bindings/reset/rockchip,rk3506-cru.h b/include/dt-bindings/reset/rockchip,rk3506-cru.h new file mode 100644 index 000000000000..31c0d4aa410f --- /dev/null +++ b/include/dt-bindings/reset/rockchip,rk3506-cru.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* + * Copyright (c) 2023-2025 Rockchip Electronics Co., Ltd. + * Author: Finley Xiao + */ + +#ifndef _DT_BINDINGS_REST_ROCKCHIP_RK3506_H +#define _DT_BINDINGS_REST_ROCKCHIP_RK3506_H + +/* CRU-->SOFTRST_CON00 */ +#define SRST_NCOREPORESET0_AC 0 +#define SRST_NCOREPORESET1_AC 1 +#define SRST_NCOREPORESET2_AC 2 +#define SRST_NCORESET0_AC 3 +#define SRST_NCORESET1_AC 4 +#define SRST_NCORESET2_AC 5 +#define SRST_NL2RESET_AC 6 +#define SRST_A_CORE_BIU_AC 7 +#define SRST_H_M0_AC 8 + +/* CRU-->SOFTRST_CON02 */ +#define SRST_NDBGRESET 9 +#define SRST_P_CORE_BIU 10 +#define SRST_PMU 11 + +/* CRU-->SOFTRST_CON03 */ +#define SRST_P_DBG 12 +#define SRST_POT_DBG 13 +#define SRST_P_CORE_GRF 14 +#define SRST_CORE_EMA_DETECT 15 +#define SRST_REF_PVTPLL_CORE 16 +#define SRST_P_GPIO1 17 +#define SRST_DB_GPIO1 18 + +/* CRU-->SOFTRST_CON04 */ +#define SRST_A_CORE_PERI_BIU 19 +#define SRST_A_DSMC 20 +#define SRST_P_DSMC 21 +#define SRST_FLEXBUS 22 +#define SRST_A_FLEXBUS 23 +#define SRST_H_FLEXBUS 24 +#define SRST_A_DSMC_SLV 25 +#define SRST_H_DSMC_SLV 26 +#define SRST_DSMC_SLV 27 + +/* CRU-->SOFTRST_CON05 */ +#define SRST_A_BUS_BIU 28 +#define SRST_H_BUS_BIU 29 +#define SRST_P_BUS_BIU 30 +#define SRST_A_SYSRAM 31 +#define SRST_H_SYSRAM 32 +#define SRST_A_DMAC0 33 +#define SRST_A_DMAC1 34 +#define SRST_H_M0 35 +#define SRST_M0_JTAG 36 +#define SRST_H_CRYPTO 37 + +/* CRU-->SOFTRST_CON06 */ +#define SRST_H_RNG 38 +#define SRST_P_BUS_GRF 39 +#define SRST_P_TIMER0 40 +#define SRST_TIMER0_CH0 41 +#define 
SRST_TIMER0_CH1 42 +#define SRST_TIMER0_CH2 43 +#define SRST_TIMER0_CH3 44 +#define SRST_TIMER0_CH4 45 +#define SRST_TIMER0_CH5 46 +#define SRST_P_WDT0 47 +#define SRST_T_WDT0 48 +#define SRST_P_WDT1 49 +#define SRST_T_WDT1 50 +#define SRST_P_MAILBOX 51 +#define SRST_P_INTMUX 52 +#define SRST_P_SPINLOCK 53 + +/* CRU-->SOFTRST_CON07 */ +#define SRST_P_DDRC 54 +#define SRST_H_DDRPHY 55 +#define SRST_P_DDRMON 56 +#define SRST_DDRMON_OSC 57 +#define SRST_P_DDR_LPC 58 +#define SRST_H_USBOTG0 59 +#define SRST_USBOTG0_ADP 60 +#define SRST_H_USBOTG1 61 +#define SRST_USBOTG1_ADP 62 +#define SRST_P_USBPHY 63 +#define SRST_USBPHY_POR 64 +#define SRST_USBPHY_OTG0 65 +#define SRST_USBPHY_OTG1 66 + +/* CRU-->SOFTRST_CON08 */ +#define SRST_A_DMA2DDR 67 +#define SRST_P_DMA2DDR 68 + +/* CRU-->SOFTRST_CON09 */ +#define SRST_USBOTG0_UTMI 69 +#define SRST_USBOTG1_UTMI 70 + +/* CRU-->SOFTRST_CON10 */ +#define SRST_A_DDRC_0 71 +#define SRST_A_DDRC_1 72 +#define SRST_A_DDR_BIU 73 +#define SRST_DDRC 74 +#define SRST_DDRMON 75 + +/* CRU-->SOFTRST_CON11 */ +#define SRST_H_LSPERI_BIU 76 +#define SRST_P_UART0 77 +#define SRST_P_UART1 78 +#define SRST_P_UART2 79 +#define SRST_P_UART3 80 +#define SRST_P_UART4 81 +#define SRST_UART0 82 +#define SRST_UART1 83 +#define SRST_UART2 84 +#define SRST_UART3 85 +#define SRST_UART4 86 +#define SRST_P_I2C0 87 +#define SRST_I2C0 88 + +/* CRU-->SOFTRST_CON12 */ +#define SRST_P_I2C1 89 +#define SRST_I2C1 90 +#define SRST_P_I2C2 91 +#define SRST_I2C2 92 +#define SRST_P_PWM1 93 +#define SRST_PWM1 94 +#define SRST_P_SPI0 95 +#define SRST_SPI0 96 +#define SRST_P_SPI1 97 +#define SRST_SPI1 98 +#define SRST_P_GPIO2 99 +#define SRST_DB_GPIO2 100 + +/* CRU-->SOFTRST_CON13 */ +#define SRST_P_GPIO3 101 +#define SRST_DB_GPIO3 102 +#define SRST_P_GPIO4 103 +#define SRST_DB_GPIO4 104 +#define SRST_H_CAN0 105 +#define SRST_CAN0 106 +#define SRST_H_CAN1 107 +#define SRST_CAN1 108 +#define SRST_H_PDM 109 +#define SRST_M_PDM 110 +#define SRST_PDM 111 +#define SRST_SPDIFTX 
112 +#define SRST_H_SPDIFTX 113 +#define SRST_H_SPDIFRX 114 +#define SRST_SPDIFRX 115 +#define SRST_M_SAI0 116 + +/* CRU-->SOFTRST_CON14 */ +#define SRST_H_SAI0 117 +#define SRST_M_SAI1 118 +#define SRST_H_SAI1 119 +#define SRST_H_ASRC0 120 +#define SRST_ASRC0 121 +#define SRST_H_ASRC1 122 +#define SRST_ASRC1 123 + +/* CRU-->SOFTRST_CON17 */ +#define SRST_H_HSPERI_BIU 124 +#define SRST_H_SDMMC 125 +#define SRST_H_FSPI 126 +#define SRST_S_FSPI 127 +#define SRST_P_SPI2 128 +#define SRST_A_MAC0 129 +#define SRST_A_MAC1 130 + +/* CRU-->SOFTRST_CON18 */ +#define SRST_M_SAI2 131 +#define SRST_H_SAI2 132 +#define SRST_H_SAI3 133 +#define SRST_M_SAI3 134 +#define SRST_H_SAI4 135 +#define SRST_M_SAI4 136 +#define SRST_H_DSM 137 +#define SRST_M_DSM 138 +#define SRST_P_AUDIO_ADC 139 +#define SRST_M_AUDIO_ADC 140 + +/* CRU-->SOFTRST_CON19 */ +#define SRST_P_SARADC 141 +#define SRST_SARADC 142 +#define SRST_SARADC_PHY 143 +#define SRST_P_OTPC_NS 144 +#define SRST_SBPI_OTPC_NS 145 +#define SRST_USER_OTPC_NS 146 +#define SRST_P_UART5 147 +#define SRST_UART5 148 +#define SRST_P_GPIO234_IOC 149 + +/* CRU-->SOFTRST_CON21 */ +#define SRST_A_VIO_BIU 150 +#define SRST_H_VIO_BIU 151 +#define SRST_H_RGA 152 +#define SRST_A_RGA 153 +#define SRST_CORE_RGA 154 +#define SRST_A_VOP 155 +#define SRST_H_VOP 156 +#define SRST_VOP 157 +#define SRST_P_DPHY 158 +#define SRST_P_DSI_HOST 159 +#define SRST_P_TSADC 160 +#define SRST_TSADC 161 + +/* CRU-->SOFTRST_CON22 */ +#define SRST_P_GPIO1_IOC 162 + +#endif -- cgit v1.2.3 From 0f1f9b5e47cec229dc2127481807823b75e933b0 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Thu, 20 Nov 2025 17:15:15 +0200 Subject: RDMA/core: Add new IB rate for XDR (8x) support Add the new rates as defined in the Infiniband spec for XDR and 8x link width support. Furthermore, modify the utility conversion methods accordingly. 
Reference: IB Spec Release 1.8 Reviewed-by: Michael Guralnik Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/20251120-speed-8-v1-1-e6a7efef8cb8@nvidia.com Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0a85af610b6b..6aad66bc5dd7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -859,6 +859,7 @@ enum ib_rate { IB_RATE_400_GBPS = 21, IB_RATE_600_GBPS = 22, IB_RATE_800_GBPS = 23, + IB_RATE_1600_GBPS = 25, }; /** -- cgit v1.2.3 From e950d1f84d3c16e86dd1b6066c3ac3958099fa79 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 19 Nov 2025 15:37:56 +0100 Subject: s390/percpu: Get rid of ARCH_MODULE_NEEDS_WEAK_PER_CPU Since the rework of the kernel virtual address space [1] the module area and the kernel image are within the same 4GB area. Therefore there is no need for the weak per cpu workaround for modules anymore. Remove it. [1] commit c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- include/linux/percpu-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index 12d90360f6db..43c854a273c3 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -52,7 +52,7 @@ __section(".discard") __attribute__((unused)) /* - * s390 and alpha modules require percpu variables to be defined as + * alpha modules require percpu variables to be defined as * weak to force the compiler to generate GOT based external * references for them. This is necessary because percpu sections * will be located outside of the usually addressable area. 
-- cgit v1.2.3 From 385aab8fccd7a8746b9f1a17f3c1e38498a14bc7 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 8 Oct 2025 12:41:48 +0200 Subject: wifi: mt76: wed: use proper wed reference in mt76 wed driver callabacks MT7996 driver can use both wed and wed_hif2 devices to offload traffic from/to the wireless NIC. In the current codebase we assume to always use the primary wed device in wed callbacks resulting in the following crash if the hw runs wed_hif2 (e.g. 6GHz link). [ 297.455876] Unable to handle kernel read from unreadable memory at virtual address 000000000000080a [ 297.464928] Mem abort info: [ 297.467722] ESR = 0x0000000096000005 [ 297.471461] EC = 0x25: DABT (current EL), IL = 32 bits [ 297.476766] SET = 0, FnV = 0 [ 297.479809] EA = 0, S1PTW = 0 [ 297.482940] FSC = 0x05: level 1 translation fault [ 297.487809] Data abort info: [ 297.490679] ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 [ 297.496156] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 297.501196] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 297.506500] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000107480000 [ 297.512927] [000000000000080a] pgd=08000001097fb003, p4d=08000001097fb003, pud=08000001097fb003, pmd=0000000000000000 [ 297.523532] Internal error: Oops: 0000000096000005 [#1] SMP [ 297.715393] CPU: 2 UID: 0 PID: 45 Comm: kworker/u16:2 Tainted: G O 6.12.50 #0 [ 297.723908] Tainted: [O]=OOT_MODULE [ 297.727384] Hardware name: Banana Pi BPI-R4 (2x SFP+) (DT) [ 297.732857] Workqueue: nf_ft_offload_del nf_flow_rule_route_ipv6 [nf_flow_table] [ 297.740254] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 297.747205] pc : mt76_wed_offload_disable+0x64/0xa0 [mt76] [ 297.752688] lr : mtk_wed_flow_remove+0x58/0x80 [ 297.757126] sp : ffffffc080fe3ae0 [ 297.760430] x29: ffffffc080fe3ae0 x28: ffffffc080fe3be0 x27: 00000000deadbef7 [ 297.767557] x26: ffffff80c5ebca00 x25: 0000000000000001 x24: ffffff80c85f4c00 [ 297.774683] x23: ffffff80c1875b78 x22: ffffffc080d42cd0 x21: 
ffffffc080660018 [ 297.781809] x20: ffffff80c6a076d0 x19: ffffff80c6a043c8 x18: 0000000000000000 [ 297.788935] x17: 0000000000000000 x16: 0000000000000001 x15: 0000000000000000 [ 297.796060] x14: 0000000000000019 x13: ffffff80c0ad8ec0 x12: 00000000fa83b2da [ 297.803185] x11: ffffff80c02700c0 x10: ffffff80c0ad8ec0 x9 : ffffff81fef96200 [ 297.810311] x8 : ffffff80c02700c0 x7 : ffffff80c02700d0 x6 : 0000000000000002 [ 297.817435] x5 : 0000000000000400 x4 : 0000000000000000 x3 : 0000000000000000 [ 297.824561] x2 : 0000000000000001 x1 : 0000000000000800 x0 : ffffff80c6a063c8 [ 297.831686] Call trace: [ 297.834123] mt76_wed_offload_disable+0x64/0xa0 [mt76] [ 297.839254] mtk_wed_flow_remove+0x58/0x80 [ 297.843342] mtk_flow_offload_cmd+0x434/0x574 [ 297.847689] mtk_wed_setup_tc_block_cb+0x30/0x40 [ 297.852295] nf_flow_offload_ipv6_hook+0x7f4/0x964 [nf_flow_table] [ 297.858466] nf_flow_rule_route_ipv6+0x438/0x4a4 [nf_flow_table] [ 297.864463] process_one_work+0x174/0x300 [ 297.868465] worker_thread+0x278/0x430 [ 297.872204] kthread+0xd8/0xdc [ 297.875251] ret_from_fork+0x10/0x20 [ 297.878820] Code: 928b5ae0 8b000273 91400a60 f943fa61 (79401421) [ 297.884901] ---[ end trace 0000000000000000 ]--- Fix the issue by detecting the proper wed reference to use when running wed callbacks.
Fixes: 83eafc9251d6 ("wifi: mt76: mt7996: add wed tx support") Tested-by: Daniel Pawlik Tested-by: Matteo Croce Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20251008-wed-fixes-v1-1-8f7678583385@kernel.org Signed-off-by: Felix Fietkau --- include/linux/soc/mediatek/mtk_wed.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index c4ff6bab176d..3fa93bd65004 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -154,6 +154,7 @@ struct mtk_wed_device { bool wcid_512; bool hw_rro; bool msi; + bool hif2; u16 token_start; unsigned int nbuf; -- cgit v1.2.3 From 7fb554b1b623c7da845521604bd05fa9570d07bc Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 17 Oct 2025 10:50:32 +0200 Subject: wifi: mt76: Introduce the NPU generic layer Add the NPU generic layer in mt76 module. NPU will be used to enable traffic forward offloading between the MT76 NIC and the Airoha ethernet one available on the Airoha EN7581 SoC using Netfilter Flowtable APIs. 
Signed-off-by: Lorenzo Bianconi Link: https://patch.msgid.link/20251017-mt76-npu-devel-v2-4-ddaa90901723@kernel.org Signed-off-by: Felix Fietkau --- include/linux/soc/airoha/airoha_offload.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/soc/airoha/airoha_offload.h b/include/linux/soc/airoha/airoha_offload.h index 6f66eb339b3f..4d23cbb7d407 100644 --- a/include/linux/soc/airoha/airoha_offload.h +++ b/include/linux/soc/airoha/airoha_offload.h @@ -6,6 +6,7 @@ #ifndef AIROHA_OFFLOAD_H #define AIROHA_OFFLOAD_H +#include #include #include -- cgit v1.2.3 From cbbfba4847b8a5299d36e002bf864b21bb83295d Mon Sep 17 00:00:00 2001 From: James Clark Date: Tue, 11 Nov 2025 11:37:55 +0000 Subject: perf: Add perf_event_attr::config4 Arm FEAT_SPE_FDS adds the ability to filter on the data source of a packet using another 64-bits of event filtering control. As the existing perf_event_attr::configN fields are all used up for SPE PMU, an additional field is needed. Add a new 'config4' field. 
Reviewed-by: Leo Yan Tested-by: Leo Yan Reviewed-by: Ian Rogers Acked-by: Peter Zijlstra (Intel) Signed-off-by: James Clark Signed-off-by: Will Deacon --- include/uapi/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 78a362b80027..0d0ed85ad8cb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -382,6 +382,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER6 120 /* Add: aux_sample_size */ #define PERF_ATTR_SIZE_VER7 128 /* Add: sig_data */ #define PERF_ATTR_SIZE_VER8 136 /* Add: config3 */ +#define PERF_ATTR_SIZE_VER9 144 /* add: config4 */ /* * 'struct perf_event_attr' contains various attributes that define @@ -543,6 +544,7 @@ struct perf_event_attr { __u64 sig_data; __u64 config3; /* extension of config2 */ + __u64 config4; /* extension of config3 */ }; /* -- cgit v1.2.3 From 935419b9fb74ab2643583fce750cb774c9b5faa6 Mon Sep 17 00:00:00 2001 From: Dinh Nguyen Date: Fri, 14 Nov 2025 12:58:15 -0600 Subject: firmware: stratix10-svc: fix make htmldocs warning Stephen Rothwell reports htmldocs warnings when merging char-misc tree: WARNING: include/linux/firmware/intel/stratix10-svc-client.h:22 This comment starts with '/**', but isn't a kernel-doc comment. 
WARNING: include/linux/firmware/intel/stratix10-svc-client.h:184 Enum value 'COMMAND_HWMON_READTEMP' not described in enum 'stratix10_svc_command_code' WARNING: include/linux/firmware/intel/stratix10-svc-client.h:184 Enum value 'COMMAND_HWMON_READVOLT' not described in enum 'stratix10_svc_command_code' WARNING: include/linux/firmware/intel/stratix10-svc-client.h:307 function parameter 'cb_arg' not described in 'async_callback_t' Fixes: 4f49088c1625 ("firmware: stratix10-svc: Add definition for voltage and temperature sensor") Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/linux-next/20251114153920.1c5df700@canb.auug.org.au/ Signed-off-by: Dinh Nguyen Link: https://patch.msgid.link/20251114185815.358423-3-dinguyen@kernel.org Signed-off-by: Greg Kroah-Hartman --- include/linux/firmware/intel/stratix10-svc-client.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/firmware/intel/stratix10-svc-client.h b/include/linux/firmware/intel/stratix10-svc-client.h index 1bcc56d14080..d290060f4c73 100644 --- a/include/linux/firmware/intel/stratix10-svc-client.h +++ b/include/linux/firmware/intel/stratix10-svc-client.h @@ -19,7 +19,7 @@ #define SVC_CLIENT_FCS "fcs" #define SVC_CLIENT_HWMON "hwmon" -/** +/* * Status of the sent command, in bit number * * SVC_STATUS_OK: @@ -148,6 +148,12 @@ struct stratix10_svc_chan; * * @COMMAND_FCS_RANDOM_NUMBER_GEN: generate a random number, return status * is SVC_STATUS_OK, SVC_STATUS_ERROR + * + * @COMMAND_HWMON_READTEMP: query the temperature from the hardware monitor, + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR + * + * @COMMAND_HWMON_READVOLT: query the voltage from the hardware monitor, + * return status is SVC_STATUS_OK or SVC_STATUS_ERROR */ enum stratix10_svc_command_code { /* for FPGA */ @@ -303,7 +309,7 @@ void stratix10_svc_done(struct stratix10_svc_chan *chan); * The callback function takes a single argument, which is a pointer to * user-defined 
data. * - * @param cb_arg A pointer to user-defined data passed to the callback function. + * @cb_arg: Argument to be passed to the callback function. */ typedef void (*async_callback_t)(void *cb_arg); -- cgit v1.2.3 From e6ab504633e4c06e35377ecf3c8cbc304de79858 Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Mon, 17 Nov 2025 15:40:21 +0100 Subject: staging: gpib: Destage gpib Move the gpib drivers out of staging and into the "real" part of the kernel. This entails: - Remove the gpib Kconfig menu and Makefile build rule from staging. - Remove gpib/uapi from the header file search path in subdir-ccflags of the gpib Makefile - move the gpib/uapi files to include/uapi/linux - Move the gpib tree out of staging to drivers. - Remove the word "Linux" from the gpib Kconfig file. - Add the gpib Kconfig menu and Makefile build rule to drivers Signed-off-by: Dave Penkler Link: https://patch.msgid.link/20251117144021.23569-5-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/gpib.h | 104 +++++++++++++++++++++++++ include/uapi/linux/gpib_ioctl.h | 167 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 include/uapi/linux/gpib.h create mode 100644 include/uapi/linux/gpib_ioctl.h (limited to 'include') diff --git a/include/uapi/linux/gpib.h b/include/uapi/linux/gpib.h new file mode 100644 index 000000000000..2a7f5eeb9777 --- /dev/null +++ b/include/uapi/linux/gpib.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +/*************************************************************************** + * copyright : (C) 2002 by Frank Mori Hess + ***************************************************************************/ + +#ifndef _GPIB_H +#define _GPIB_H + +#define GPIB_MAX_NUM_BOARDS 16 +#define GPIB_MAX_NUM_DESCRIPTORS 0x1000 + +enum ibsta_bit_numbers { + DCAS_NUM = 0, + DTAS_NUM = 1, + LACS_NUM = 2, + TACS_NUM = 3, + ATN_NUM = 4, + CIC_NUM = 5, + REM_NUM = 6, + LOK_NUM = 7, + 
CMPL_NUM = 8, + EVENT_NUM = 9, + SPOLL_NUM = 10, + RQS_NUM = 11, + SRQI_NUM = 12, + END_NUM = 13, + TIMO_NUM = 14, + ERR_NUM = 15 +}; + +/* IBSTA status bits (returned by all functions) */ +enum ibsta_bits { + DCAS = (1 << DCAS_NUM), /* device clear state */ + DTAS = (1 << DTAS_NUM), /* device trigger state */ + LACS = (1 << LACS_NUM), /* GPIB interface is addressed as Listener */ + TACS = (1 << TACS_NUM), /* GPIB interface is addressed as Talker */ + ATN = (1 << ATN_NUM), /* Attention is asserted */ + CIC = (1 << CIC_NUM), /* GPIB interface is Controller-in-Charge */ + REM = (1 << REM_NUM), /* remote state */ + LOK = (1 << LOK_NUM), /* lockout state */ + CMPL = (1 << CMPL_NUM), /* I/O is complete */ + EVENT = (1 << EVENT_NUM), /* DCAS, DTAS, or IFC has occurred */ + SPOLL = (1 << SPOLL_NUM), /* board serial polled by busmaster */ + RQS = (1 << RQS_NUM), /* Device requesting service */ + SRQI = (1 << SRQI_NUM), /* SRQ is asserted */ + END = (1 << END_NUM), /* EOI or EOS encountered */ + TIMO = (1 << TIMO_NUM), /* Time limit on I/O or wait function exceeded */ + ERR = (1 << ERR_NUM), /* Function call terminated on error */ + + device_status_mask = ERR | TIMO | END | CMPL | RQS, + board_status_mask = ERR | TIMO | END | CMPL | SPOLL | + EVENT | LOK | REM | CIC | ATN | TACS | LACS | DTAS | DCAS | SRQI, +}; + +/* End-of-string (EOS) modes for use with ibeos */ + +enum eos_flags { + EOS_MASK = 0x1c00, + REOS = 0x0400, /* Terminate reads on EOS */ + XEOS = 0x800, /* assert EOI when EOS char is sent */ + BIN = 0x1000 /* Do 8-bit compare on EOS */ +}; + +/* GPIB Bus Control Lines bit vector */ +enum bus_control_line { + VALID_DAV = 0x01, + VALID_NDAC = 0x02, + VALID_NRFD = 0x04, + VALID_IFC = 0x08, + VALID_REN = 0x10, + VALID_SRQ = 0x20, + VALID_ATN = 0x40, + VALID_EOI = 0x80, + VALID_ALL = 0xff, + BUS_DAV = 0x0100, /* DAV line status bit */ + BUS_NDAC = 0x0200, /* NDAC line status bit */ + BUS_NRFD = 0x0400, /* NRFD line status bit */ + BUS_IFC = 0x0800, /* IFC line status 
bit */ + BUS_REN = 0x1000, /* REN line status bit */ + BUS_SRQ = 0x2000, /* SRQ line status bit */ + BUS_ATN = 0x4000, /* ATN line status bit */ + BUS_EOI = 0x8000 /* EOI line status bit */ +}; + +enum ppe_bits { + PPC_DISABLE = 0x10, + PPC_SENSE = 0x8, /* parallel poll sense bit */ + PPC_DIO_MASK = 0x7 +}; + +enum { + request_service_bit = 0x40, +}; + +enum gpib_events { + EVENT_NONE = 0, + EVENT_DEV_TRG = 1, + EVENT_DEV_CLR = 2, + EVENT_IFC = 3 +}; + +#endif /* _GPIB_H */ + diff --git a/include/uapi/linux/gpib_ioctl.h b/include/uapi/linux/gpib_ioctl.h new file mode 100644 index 000000000000..d544d8e4362c --- /dev/null +++ b/include/uapi/linux/gpib_ioctl.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +/*************************************************************************** + * copyright : (C) 2002 by Frank Mori Hess + ***************************************************************************/ + +#ifndef _GPIB_IOCTL_H +#define _GPIB_IOCTL_H + +#include +#include + +#define GPIB_CODE 160 + +struct gpib_board_type_ioctl { + char name[100]; +}; + +/* argument for read/write/command ioctls */ +struct gpib_read_write_ioctl { + __u64 buffer_ptr; + __u32 requested_transfer_count; + __u32 completed_transfer_count; + __s32 end; /* end flag return for reads, end io suppression request for cmd*/ + __s32 handle; +}; + +struct gpib_open_dev_ioctl { + __u32 handle; + __u32 pad; + __s32 sad; + __u32 is_board; +}; + +struct gpib_close_dev_ioctl { + __u32 handle; +}; + +struct gpib_serial_poll_ioctl { + __u32 pad; + __s32 sad; + __u8 status_byte; + __u8 padding[3]; /* align to 32 bit boundary */ +}; + +struct gpib_eos_ioctl { + __s32 eos; + __s32 eos_flags; +}; + +struct gpib_wait_ioctl { + __s32 handle; + __s32 wait_mask; + __s32 clear_mask; + __s32 set_mask; + __s32 ibsta; + __s32 pad; + __s32 sad; + __u32 usec_timeout; +}; + +struct gpib_online_ioctl { + __u64 init_data_ptr; + __s32 init_data_length; + __s32 online; +}; + +struct 
gpib_spoll_bytes_ioctl { + __u32 num_bytes; + __u32 pad; + __s32 sad; +}; + +struct gpib_board_info_ioctl { + __u32 pad; + __s32 sad; + __s32 parallel_poll_configuration; + __s32 autopolling; + __s32 is_system_controller; + __u32 t1_delay; + unsigned ist : 1; + unsigned no_7_bit_eos : 1; + unsigned padding :30; /* align to 32 bit boundary */ +}; + +struct gpib_select_pci_ioctl { + __s32 pci_bus; + __s32 pci_slot; +}; + +struct gpib_ppoll_config_ioctl { + __u8 config; + unsigned set_ist : 1; + unsigned clear_ist : 1; + unsigned padding :22; /* align to 32 bit boundary */ +}; + +struct gpib_pad_ioctl { + __u32 handle; + __u32 pad; +}; + +struct gpib_sad_ioctl { + __u32 handle; + __s32 sad; +}; + +/* select a piece of hardware to attach by its sysfs device path */ +struct gpib_select_device_path_ioctl { + char device_path[0x1000]; +}; + +/* update status byte and request service */ +struct gpib_request_service2 { + __u8 status_byte; + __u8 padding[3]; /* align to 32 bit boundary */ + __s32 new_reason_for_service; +}; + +/* Standard functions. 
*/ +enum gpib_ioctl { + IBRD = _IOWR(GPIB_CODE, 100, struct gpib_read_write_ioctl), + IBWRT = _IOWR(GPIB_CODE, 101, struct gpib_read_write_ioctl), + IBCMD = _IOWR(GPIB_CODE, 102, struct gpib_read_write_ioctl), + IBOPENDEV = _IOWR(GPIB_CODE, 3, struct gpib_open_dev_ioctl), + IBCLOSEDEV = _IOW(GPIB_CODE, 4, struct gpib_close_dev_ioctl), + IBWAIT = _IOWR(GPIB_CODE, 5, struct gpib_wait_ioctl), + IBRPP = _IOWR(GPIB_CODE, 6, __u8), + + IBSIC = _IOW(GPIB_CODE, 9, __u32), + IBSRE = _IOW(GPIB_CODE, 10, __s32), + IBGTS = _IO(GPIB_CODE, 11), + IBCAC = _IOW(GPIB_CODE, 12, __s32), + IBLINES = _IOR(GPIB_CODE, 14, __s16), + IBPAD = _IOW(GPIB_CODE, 15, struct gpib_pad_ioctl), + IBSAD = _IOW(GPIB_CODE, 16, struct gpib_sad_ioctl), + IBTMO = _IOW(GPIB_CODE, 17, __u32), + IBRSP = _IOWR(GPIB_CODE, 18, struct gpib_serial_poll_ioctl), + IBEOS = _IOW(GPIB_CODE, 19, struct gpib_eos_ioctl), + IBRSV = _IOW(GPIB_CODE, 20, __u8), + CFCBASE = _IOW(GPIB_CODE, 21, __u64), + CFCIRQ = _IOW(GPIB_CODE, 22, __u32), + CFCDMA = _IOW(GPIB_CODE, 23, __u32), + CFCBOARDTYPE = _IOW(GPIB_CODE, 24, struct gpib_board_type_ioctl), + + IBMUTEX = _IOW(GPIB_CODE, 26, __s32), + IBSPOLL_BYTES = _IOWR(GPIB_CODE, 27, struct gpib_spoll_bytes_ioctl), + IBPPC = _IOW(GPIB_CODE, 28, struct gpib_ppoll_config_ioctl), + IBBOARD_INFO = _IOR(GPIB_CODE, 29, struct gpib_board_info_ioctl), + + IBQUERY_BOARD_RSV = _IOR(GPIB_CODE, 31, __s32), + IBSELECT_PCI = _IOWR(GPIB_CODE, 32, struct gpib_select_pci_ioctl), + IBEVENT = _IOR(GPIB_CODE, 33, __s16), + IBRSC = _IOW(GPIB_CODE, 34, __s32), + IB_T1_DELAY = _IOW(GPIB_CODE, 35, __u32), + IBLOC = _IO(GPIB_CODE, 36), + + IBAUTOSPOLL = _IOW(GPIB_CODE, 38, __s16), + IBONL = _IOW(GPIB_CODE, 39, struct gpib_online_ioctl), + IBPP2_SET = _IOW(GPIB_CODE, 40, __s16), + IBPP2_GET = _IOR(GPIB_CODE, 41, __s16), + IBSELECT_DEVICE_PATH = _IOW(GPIB_CODE, 43, struct gpib_select_device_path_ioctl), + /* 44 was IBSELECT_SERIAL_NUMBER */ + IBRSV2 = _IOW(GPIB_CODE, 45, struct gpib_request_service2) +}; + 
+#endif /* _GPIB_IOCTL_H */ -- cgit v1.2.3 From 25e4e3565d45f567f78089f38822fa64abee5230 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 18 Nov 2025 20:36:29 +0800 Subject: ftrace: Introduce FTRACE_OPS_FL_JMP For now, the "nop" will be replaced with a "call" instruction when a function is hooked by ftrace. However, sometimes the "call" can break the RSB and introduce extra overhead. Therefore, introduce the flag FTRACE_OPS_FL_JMP, which indicates that the ftrace_ops should be called with a "jmp" instead of "call". For now, it is only used by the direct call case. When a direct ftrace_ops is marked with FTRACE_OPS_FL_JMP, the last bit of the ops->direct_call will be set to 1. Therefore, we can tell if we should use "jmp" for the callback in ftrace_call_replace(). Signed-off-by: Menglong Dong Acked-by: Steven Rostedt (Google) Link: https://lore.kernel.org/r/20251118123639.688444-2-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/ftrace.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 07f8c309e432..015dd1049bea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -359,6 +359,7 @@ enum { FTRACE_OPS_FL_DIRECT = BIT(17), FTRACE_OPS_FL_SUBOP = BIT(18), FTRACE_OPS_FL_GRAPH = BIT(19), + FTRACE_OPS_FL_JMP = BIT(20), }; #ifndef CONFIG_DYNAMIC_FTRACE_WITH_ARGS @@ -577,6 +578,38 @@ static inline void arch_ftrace_set_direct_caller(struct ftrace_regs *fregs, unsigned long addr) { } #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP +static inline bool ftrace_is_jmp(unsigned long addr) +{ + return addr & 1; +} + +static inline unsigned long ftrace_jmp_set(unsigned long addr) +{ + return addr | 1UL; +} + +static inline unsigned long ftrace_jmp_get(unsigned long addr) +{ + return addr & ~1UL; +} +#else +static inline bool ftrace_is_jmp(unsigned long addr) +{ + return false; +}
+ +static inline unsigned long ftrace_jmp_set(unsigned long addr) +{ + return addr; +} + +static inline unsigned long ftrace_jmp_get(unsigned long addr) +{ + return addr; +} +#endif /* CONFIG_DYNAMIC_FTRACE_WITH_JMP */ + #ifdef CONFIG_STACK_TRACER int stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer, -- cgit v1.2.3 From 373f2f44c300815c5f170e89560ac361c0053dfe Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 18 Nov 2025 20:36:32 +0800 Subject: bpf,x86: adjust the "jmp" mode for bpf trampoline In the origin call case, if BPF_TRAMP_F_SKIP_FRAME is not set, it means that the trampoline is not called, but "jmp". Introduce the function bpf_trampoline_use_jmp() to check if the trampoline is in "jmp" mode. Do some adjustment on the "jmp" mode for the x86_64. The main adjustment that we make is for the stack parameter passing case, as the stack alignment logic changes in the "jmp" mode without the "rip". What's more, the location of the parameters on the stack also changes. 
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20251118123639.688444-5-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30fb40421405..2f79afe81482 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1264,6 +1264,18 @@ typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); +#ifdef CONFIG_DYNAMIC_FTRACE_WITH_JMP +static inline bool bpf_trampoline_use_jmp(u64 flags) +{ + return flags & BPF_TRAMP_F_CALL_ORIG && !(flags & BPF_TRAMP_F_SKIP_FRAME); +} +#else +static inline bool bpf_trampoline_use_jmp(u64 flags) +{ + return false; +} +#endif + struct bpf_ksym { unsigned long start; unsigned long end; -- cgit v1.2.3 From ae4a3160d19cd16b874737ebc1798c7bc2fe3c9e Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 18 Nov 2025 20:36:33 +0800 Subject: bpf: specify the old and new poke_type for bpf_arch_text_poke In the origin logic, the bpf_arch_text_poke() assume that the old and new instructions have the same opcode. However, they can have different opcode if we want to replace a "call" insn with a "jmp" insn. Therefore, add the new function parameter "old_t" along with the "new_t", which are used to indicate the old and new poke type. Meanwhile, adjust the implement of bpf_arch_text_poke() for all the archs. "BPF_MOD_NOP" is added to make the code more readable. In bpf_arch_text_poke(), we still check if the new and old address is NULL to determine if nop insn should be used, which I think is more safe. 
Signed-off-by: Menglong Dong Link: https://lore.kernel.org/r/20251118123639.688444-6-dongml2@chinatelecom.cn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2f79afe81482..a9b788c7b4aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3710,12 +3710,14 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, #endif /* CONFIG_INET */ enum bpf_text_poke_type { + BPF_MOD_NOP, BPF_MOD_CALL, BPF_MOD_JUMP, }; -int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, - void *addr1, void *addr2); +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type old_t, + enum bpf_text_poke_type new_t, void *old_addr, + void *new_addr); void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke, struct bpf_prog *new, struct bpf_prog *old); -- cgit v1.2.3 From 7584edf15892e29190b2145294cc1680aa142586 Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Mon, 24 Nov 2025 17:15:35 +0000 Subject: firmware: cs_dsp: Store control length as 32-bit The architectures supported by this driver have a maximum of 32-bits of address, so we don't need more than 32-bits to store the length of control data. Change the length in struct cs_dsp_coeff_ctl to an unsigned int instead of a size_t. Also make a corresponding trivial change to wm_adsp.c to prevent a compiler warning. Tested on x86_64 builds this saves at least 4 bytes per control (another 4 bytes might be saved if the compiler was inserting padding to align the size_t). 
Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251124171536.78962-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/linux/firmware/cirrus/cs_dsp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/firmware/cirrus/cs_dsp.h b/include/linux/firmware/cirrus/cs_dsp.h index 69959032f8f5..0ec1cdc5585d 100644 --- a/include/linux/firmware/cirrus/cs_dsp.h +++ b/include/linux/firmware/cirrus/cs_dsp.h @@ -102,7 +102,7 @@ struct cs_dsp_coeff_ctl { const char *subname; unsigned int subname_len; unsigned int offset; - size_t len; + unsigned int len; unsigned int type; unsigned int flags; unsigned int set:1; -- cgit v1.2.3 From 2a6c045640c38a407a39cd40c3c4d8dd2fd89aa8 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 6 Nov 2025 14:34:00 +0100 Subject: bitfield: Add less-checking __FIELD_{GET,PREP}() The BUILD_BUG_ON_MSG() check against "~0ull" works only with "unsigned (long) long" _mask types. For constant masks, that condition is usually met, as GENMASK() yields an UL value. The few places where the constant mask is stored in an intermediate variable were fixed by changing the variable type to u64 (see e.g. [1] and [2]). However, for non-constant masks, smaller unsigned types should be valid, too, but currently lead to "result of comparison of constant 18446744073709551615 with expression of type ... is always false"-warnings with clang and W=1. Hence refactor the __BF_FIELD_CHECK() helper, and factor out __FIELD_{GET,PREP}(). The latter lack the single problematic check, but are otherwise identical to FIELD_{GET,PREP}(), and are intended to be used in the fully non-const variants later.
[1] commit 5c667d5a5a3ec166 ("clk: sp7021: Adjust width of _m in HWM_FIELD_PREP()") [2] commit cfd6fb45cfaf46fa ("crypto: ccree - avoid out-of-range warnings from clang") Signed-off-by: Geert Uytterhoeven Link: https://git.kernel.org/torvalds/c/5c667d5a5a3ec166 [1] Signed-off-by: Yury Norov (NVIDIA) --- include/linux/bitfield.h | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index 5355f8f806a9..bf8e0ae4b5b4 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -60,7 +60,7 @@ #define __bf_cast_unsigned(type, x) ((__unsigned_scalar_typeof(type))(x)) -#define __BF_FIELD_CHECK(_mask, _reg, _val, _pfx) \ +#define __BF_FIELD_CHECK_MASK(_mask, _val, _pfx) \ ({ \ BUILD_BUG_ON_MSG(!__builtin_constant_p(_mask), \ _pfx "mask is not constant"); \ @@ -69,13 +69,33 @@ ~((_mask) >> __bf_shf(_mask)) & \ (0 + (_val)) : 0, \ _pfx "value too large for the field"); \ - BUILD_BUG_ON_MSG(__bf_cast_unsigned(_mask, _mask) > \ - __bf_cast_unsigned(_reg, ~0ull), \ - _pfx "type of reg too small for mask"); \ __BUILD_BUG_ON_NOT_POWER_OF_2((_mask) + \ (1ULL << __bf_shf(_mask))); \ }) +#define __BF_FIELD_CHECK_REG(mask, reg, pfx) \ + BUILD_BUG_ON_MSG(__bf_cast_unsigned(mask, mask) > \ + __bf_cast_unsigned(reg, ~0ull), \ + pfx "type of reg too small for mask") + +#define __BF_FIELD_CHECK(mask, reg, val, pfx) \ + ({ \ + __BF_FIELD_CHECK_MASK(mask, val, pfx); \ + __BF_FIELD_CHECK_REG(mask, reg, pfx); \ + }) + +#define __FIELD_PREP(mask, val, pfx) \ + ({ \ + __BF_FIELD_CHECK_MASK(mask, val, pfx); \ + ((typeof(mask))(val) << __bf_shf(mask)) & (mask); \ + }) + +#define __FIELD_GET(mask, reg, pfx) \ + ({ \ + __BF_FIELD_CHECK_MASK(mask, 0U, pfx); \ + (typeof(mask))(((reg) & (mask)) >> __bf_shf(mask)); \ + }) + /** * FIELD_MAX() - produce the maximum value representable by a field * @_mask: shifted mask defining the field's length and position @@ -112,8 
+132,8 @@ */ #define FIELD_PREP(_mask, _val) \ ({ \ - __BF_FIELD_CHECK(_mask, 0ULL, _val, "FIELD_PREP: "); \ - ((typeof(_mask))(_val) << __bf_shf(_mask)) & (_mask); \ + __BF_FIELD_CHECK_REG(_mask, 0ULL, "FIELD_PREP: "); \ + __FIELD_PREP(_mask, _val, "FIELD_PREP: "); \ }) #define __BF_CHECK_POW2(n) BUILD_BUG_ON_ZERO(((n) & ((n) - 1)) != 0) @@ -152,8 +172,8 @@ */ #define FIELD_GET(_mask, _reg) \ ({ \ - __BF_FIELD_CHECK(_mask, _reg, 0U, "FIELD_GET: "); \ - (typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask)); \ + __BF_FIELD_CHECK_REG(_mask, _reg, "FIELD_GET: "); \ + __FIELD_GET(_mask, _reg, "FIELD_GET: "); \ }) /** -- cgit v1.2.3 From c1c6ab80b25c8db1e2ef5ae3ac8075d2c242ae13 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 6 Nov 2025 14:34:01 +0100 Subject: bitfield: Add non-constant field_{prep,get}() helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing FIELD_{GET,PREP}() macros are limited to compile-time constants. However, it is very common to prepare or extract bitfield elements where the bitfield mask is not a compile-time constant. To avoid this limitation, the AT91 clock driver and several other drivers already have their own non-const field_{prep,get}() macros. Make them available for general use by adding them to <linux/bitfield.h>, and improve them slightly: 1. Avoid evaluating macro parameters more than once, 2. Replace "ffs() - 1" by "__ffs()", 3. Support 64-bit use on 32-bit architectures, 4. Wire field_{get,prep}() to FIELD_{GET,PREP}() when mask is actually constant. This is deliberately not merged into the existing FIELD_{GET,PREP}() macros, as people expressed the desire to keep stricter variants for increased safety, or for performance critical paths. Yury: use __mask within new macros.
Signed-off-by: Geert Uytterhoeven Acked-by: Alexandre Belloni Acked-by: Jonathan Cameron Acked-by: Crt Mori Acked-by: Nuno Sá Acked-by: Richard Genoud Reviewed-by: Andy Shevchenko Reviewed-by: Yury Norov (NVIDIA) Signed-off-by: Yury Norov (NVIDIA) --- include/linux/bitfield.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include') diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h index bf8e0ae4b5b4..126dc5b380af 100644 --- a/include/linux/bitfield.h +++ b/include/linux/bitfield.h @@ -17,6 +17,7 @@ * FIELD_{GET,PREP} macros take as first parameter shifted mask * from which they extract the base mask and shift amount. * Mask must be a compilation time constant. + * field_{get,prep} are variants that take a non-const mask. * * Example: * @@ -240,4 +241,62 @@ __MAKE_OP(64) #undef __MAKE_OP #undef ____MAKE_OP +#define __field_prep(mask, val) \ + ({ \ + __auto_type __mask = (mask); \ + typeof(__mask) __val = (val); \ + unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ? \ + __ffs(__mask) : __ffs64(__mask); \ + (__val << __shift) & __mask; \ + }) + +#define __field_get(mask, reg) \ + ({ \ + __auto_type __mask = (mask); \ + typeof(__mask) __reg = (reg); \ + unsigned int __shift = BITS_PER_TYPE(__mask) <= 32 ? \ + __ffs(__mask) : __ffs64(__mask); \ + (__reg & __mask) >> __shift; \ + }) + +/** + * field_prep() - prepare a bitfield element + * @mask: shifted mask defining the field's length and position, must be + * non-zero + * @val: value to put in the field + * + * Return: field value masked and shifted to its final destination + * + * field_prep() masks and shifts up the value. The result should be + * combined with other fields of the bitfield using logical OR. + * Unlike FIELD_PREP(), @mask is not limited to a compile-time constant. + * Typical usage patterns are a value stored in a table, or calculated by + * shifting a constant by a variable number of bits. 
+ * If you want to ensure that @mask is a compile-time constant, please use + * FIELD_PREP() directly instead. + */ +#define field_prep(mask, val) \ + (__builtin_constant_p(mask) ? __FIELD_PREP(mask, val, "field_prep: ") \ + : __field_prep(mask, val)) + +/** + * field_get() - extract a bitfield element + * @mask: shifted mask defining the field's length and position, must be + * non-zero + * @reg: value of entire bitfield + * + * Return: extracted field value + * + * field_get() extracts the field specified by @mask from the + * bitfield passed in as @reg by masking and shifting it down. + * Unlike FIELD_GET(), @mask is not limited to a compile-time constant. + * Typical usage patterns are a value stored in a table, or calculated by + * shifting a constant by a variable number of bits. + * If you want to ensure that @mask is a compile-time constant, please use + * FIELD_GET() directly instead. + */ +#define field_get(mask, reg) \ + (__builtin_constant_p(mask) ? __FIELD_GET(mask, reg, "field_get: ") \ + : __field_get(mask, reg)) + #endif -- cgit v1.2.3 From 4f1b701f24bea0900e349aa1c860db24ba0150aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Nov 2025 10:24:09 +0100 Subject: x86/bug: Use BUG_FORMAT for DEBUG_BUGVERBOSE_DETAILED Since we have an explicit format string, use it for the condition string instead of frobbing it in the file string. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115758.097401406@infradead.org --- include/asm-generic/bug.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 21d2c8f88d49..e512071216be 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -18,11 +18,13 @@ #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif +#ifndef WARN_CONDITION_STR #ifdef CONFIG_DEBUG_BUGVERBOSE_DETAILED -# define WARN_CONDITION_STR(cond_str) cond_str +# define WARN_CONDITION_STR(cond_str) "[" cond_str "] " #else # define WARN_CONDITION_STR(cond_str) #endif +#endif /* WARN_CONDITION_STR */ #ifndef __ASSEMBLY__ #include @@ -107,7 +109,7 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define WARN_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS("["#condition"] ", \ + __WARN_FLAGS(#condition, \ BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ }) @@ -117,7 +119,7 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); #define WARN_ON_ONCE(condition) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ - __WARN_FLAGS("["#condition"] ", \ + __WARN_FLAGS(#condition, \ BUGFLAG_ONCE | \ BUGFLAG_TAINT(TAINT_WARN)); \ unlikely(__ret_warn_on); \ -- cgit v1.2.3 From 11bb4944f014d756f35261f5afcb346901ef1efa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 Jun 2025 15:08:30 +0200 Subject: x86/bug: Implement WARN_ONCE() Implement WARN_ONCE like WARN using BUGFLAG_ONCE. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20251110115758.339309119@infradead.org --- include/asm-generic/bug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index e512071216be..09e8eccee8ed 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -180,8 +180,10 @@ extern __printf(1, 2) void __warn_printk(const char *fmt, ...); DO_ONCE_LITE_IF(condition, WARN_ON, 1) #endif +#ifndef WARN_ONCE #define WARN_ONCE(condition, format...) \ DO_ONCE_LITE_IF(condition, WARN, 1, format) +#endif #define WARN_TAINT_ONCE(condition, taint, format...) \ DO_ONCE_LITE_IF(condition, WARN_TAINT, 1, taint, format) -- cgit v1.2.3 From 645b9ad2dc6b2d6d31e2944bd7f680f3f9d827ea Mon Sep 17 00:00:00 2001 From: Kriish Sharma Date: Tue, 18 Nov 2025 18:48:28 +0000 Subject: string: Add missing kernel-doc return descriptions While running kernel-doc validation on linux-next, warnings were emitted for functions in include/linux/string.h due to missing return value documentation: Warning: include/linux/string.h:375 No description found for return value of 'kbasename' Warning: include/linux/string.h:560 No description found for return value of 'strstarts' This patch adds the missing return value descriptions for both functions and clears the related kernel-doc warnings. Signed-off-by: Kriish Sharma Reviewed-by: Andy Shevchenko Link: https://patch.msgid.link/20251118184828.2621595-1-kriish.sharma2006@gmail.com Signed-off-by: Kees Cook --- include/linux/string.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index fdd3442c6bcb..434b152df66a 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -371,6 +371,10 @@ static inline void memzero_explicit(void *s, size_t count) * kbasename - return the last part of a pathname. * * @path: path to extract the filename from. 
+ * + * Returns: + * Pointer to the filename portion inside @path. If no '/' exists, + * returns @path unchanged. */ static inline const char *kbasename(const char *path) { @@ -556,6 +560,9 @@ static __always_inline size_t str_has_prefix(const char *str, const char *prefix * strstarts - does @str start with @prefix? * @str: string to examine * @prefix: prefix to look for. + * + * Returns: + * True if @str begins with @prefix. False in all other cases. */ static inline bool strstarts(const char *str, const char *prefix) { -- cgit v1.2.3 From 6b1ac78dd0f29fe66421c460c12ec15e45af38c3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Oct 2025 10:22:04 +1030 Subject: btrfs: implement shutdown ioctl The shutdown ioctl should follow the XFS one, which use magic number 'X', and ioctl number 125, with a uint32 as flags. For now btrfs don't distinguish DEFAULT and LOGFLUSH flags (just like f2fs), both will freeze the fs first (implies committing the current transaction), setting the SHUTDOWN flag and finally thaw the fs. For NOLOGFLUSH flag, the freeze/thaw part is skipped thus the current transaction is aborted. The new shutdown ioctl is hidden behind experimental features for more testing. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Tested-by: Anand Jain Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 8e710bbb688e..e8fd92789423 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -1099,6 +1099,12 @@ enum btrfs_err_code { BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, }; +/* Flags for IOC_SHUTDOWN, must match XFS_FSOP_GOING_FLAGS_* flags. 
*/ +#define BTRFS_SHUTDOWN_FLAGS_DEFAULT 0x0 +#define BTRFS_SHUTDOWN_FLAGS_LOGFLUSH 0x1 +#define BTRFS_SHUTDOWN_FLAGS_NOLOGFLUSH 0x2 +#define BTRFS_SHUTDOWN_FLAGS_LAST 0x3 + #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ struct btrfs_ioctl_vol_args) #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ @@ -1220,6 +1226,9 @@ enum btrfs_err_code { #define BTRFS_IOC_SUBVOL_SYNC_WAIT _IOW(BTRFS_IOCTL_MAGIC, 65, \ struct btrfs_ioctl_subvol_wait) +/* Shutdown ioctl should follow XFS's interfaces, thus not using btrfs magic. */ +#define BTRFS_IOC_SHUTDOWN _IOR('X', 125, __u32) + #ifdef __cplusplus } #endif -- cgit v1.2.3 From 4bd68e475300bc97b33a7f1ef9bd112970018789 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Nov 2025 21:39:59 +0100 Subject: cpumask: Don't use "proxy" headers Update header inclusions to follow IWYU (Include What You Use) principle. Note that kernel.h is discouraged to be included as it's written at the top of that file. Signed-off-by: Andy Shevchenko Signed-off-by: Yury Norov (NVIDIA) --- include/linux/cpumask.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index ff8f41ab7ce6..df89eedc6e91 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -7,14 +7,16 @@ * set of CPUs in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ -#include -#include +#include #include +#include #include -#include -#include #include #include +#include +#include + +#include /** * cpumask_pr_args - printf args to output a cpumask -- cgit v1.2.3 From 8cb4ecec5e366b7dbbf200629a22624ad2340af5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:51 +0000 Subject: irqchip/gic: Add missing GICH_HCR control bits The GICH_HCR description is missing a bunch of control bits that control the maintenance interrupt. Add them. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-2-maz@kernel.org Signed-off-by: Oliver Upton --- include/linux/irqchip/arm-gic.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 2223f95079ce..d45fa19f9e47 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -86,7 +86,13 @@ #define GICH_HCR_EN (1 << 0) #define GICH_HCR_UIE (1 << 1) +#define GICH_HCR_LRENPIE (1 << 2) #define GICH_HCR_NPIE (1 << 3) +#define GICH_HCR_VGrp0EIE (1 << 4) +#define GICH_HCR_VGrp0DIE (1 << 5) +#define GICH_HCR_VGrp1EIE (1 << 6) +#define GICH_HCR_VGrp1DIE (1 << 7) +#define GICH_HCR_EOICOUNT GENMASK(31, 27) #define GICH_LR_VIRTUALID (0x3ff << 0) #define GICH_LR_PHYSID_CPUID_SHIFT (10) -- cgit v1.2.3 From fa8f11e8e18383d234c77ba08d347aed7883d39a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:52 +0000 Subject: irqchip/gic: Expose CPU interface VA to KVM Future changes will require KVM to be able to perform deactivations by writing to the physical CPU interface. Add the corresponding VA to the kvm_info structure, and let KVM stash it. 
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-3-maz@kernel.org Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 3 +++ include/linux/irqchip/arm-vgic-info.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 7a0b972eb1b1..577723f5599b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -59,6 +59,9 @@ struct vgic_global { /* virtual control interface mapping, HYP VA */ void __iomem *vctrl_hyp; + /* Physical CPU interface, kernel VA */ + void __iomem *gicc_base; + /* Number of implemented list registers */ int nr_lr; diff --git a/include/linux/irqchip/arm-vgic-info.h b/include/linux/irqchip/arm-vgic-info.h index a470a73a805a..67d9d960273b 100644 --- a/include/linux/irqchip/arm-vgic-info.h +++ b/include/linux/irqchip/arm-vgic-info.h @@ -24,6 +24,8 @@ struct gic_kvm_info { enum gic_type type; /* Virtual CPU interface */ struct resource vcpu; + /* GICv2 GICC VA */ + void __iomem *gicc_base; /* Interrupt number */ unsigned int maint_irq; /* No interrupt mask, no need to use the above field */ -- cgit v1.2.3 From a4413a7c31cfca49d3f4830cf8a45edf4a713f63 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:57 +0000 Subject: KVM: arm64: Repack struct vgic_irq fields struct vgic_irq has grown over the years, in a rather bad way. Repack it using bitfields for the individual flags, and move things around a bit so that it is a bit smaller.
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-8-maz@kernel.org Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 577723f5599b..e84a1bc5cf17 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -123,6 +123,7 @@ struct irq_ops { struct vgic_irq { raw_spinlock_t irq_lock; /* Protects the content of the struct */ + u32 intid; /* Guest visible INTID */ struct rcu_head rcu; struct list_head ap_list; @@ -137,17 +138,17 @@ struct vgic_irq { * affinity reg (v3). */ - u32 intid; /* Guest visible INTID */ - bool line_level; /* Level only */ - bool pending_latch; /* The pending latch state used to calculate - * the pending state for both level - * and edge triggered IRQs. */ - bool active; - bool pending_release; /* Used for LPIs only, unreferenced IRQ + bool pending_release:1; /* Used for LPIs only, unreferenced IRQ * pending a release */ - bool enabled; - bool hw; /* Tied to HW IRQ */ + bool pending_latch:1; /* The pending latch state used to calculate + * the pending state for both level + * and edge triggered IRQs. 
*/ + enum vgic_irq_config config:1; /* Level or edge */ + bool line_level:1; /* Level only */ + bool enabled:1; + bool active:1; + bool hw:1; /* Tied to HW IRQ */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ @@ -159,7 +160,6 @@ struct vgic_irq { u8 active_source; /* GICv2 SGIs only */ u8 priority; u8 group; /* 0 == group 0, 1 == group 1 */ - enum vgic_irq_config config; /* Level or edge */ struct irq_ops *ops; -- cgit v1.2.3 From 879a7fd4fd64656d953f887e6a18e13e0b9a9f8f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:24:58 +0000 Subject: KVM: arm64: Add tracking of vgic_irq being present in a LR We currently cannot identify whether an interrupt is queued into a LR. It wasn't needed until now, but that's about to change. Add yet another flag to track that state. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-9-maz@kernel.org Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index e84a1bc5cf17..ec349c5a4a8b 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -149,6 +149,7 @@ struct vgic_irq { bool enabled:1; bool active:1; bool hw:1; /* Tied to HW IRQ */ + bool on_lr:1; /* Present in a CPU LR */ refcount_t refcount; /* Used for LPIs */ u32 hwintid; /* HW INTID number */ unsigned int host_irq; /* linux irq corresponding to hwintid */ -- cgit v1.2.3 From cd4f6ee99b28f10692c2444c8dc0bab77357a25e Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:18 +0000 Subject: KVM: arm64: GICv3: Handle deactivation via ICV_DIR_EL1 traps Deactivation via ICV_DIR_EL1 is both relatively straightforward (we have the interrupt that needs deactivation) and really awkward. 
The main issue is that the interrupt may either be in an LR on another CPU, or outside of any LR. In the former case, we process the deactivation as if it was a write to GICD_CACTIVERn, which is already implemented as a big hammer IPI'ing all vcpus. In the latter case, we just perform a normal deactivation, similar to what we do for EOImode==0. Another annoying aspect is that we need to tell the CPU owning the interrupt that its ap_list needs laundering. We use a brand new vcpu request to that effect. Note that this doesn't address deactivation via the GICV MMIO view, which will be taken care of in a later change. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-29-maz@kernel.org Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index ec349c5a4a8b..b798546755a3 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -421,6 +421,7 @@ bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu); void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid); +void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu); void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1); -- cgit v1.2.3 From 1c3b3cadcd69f7415e8b3b1b1e81459e0e8c9f33 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:21 +0000 Subject: KVM: arm64: GICv3: Add SPI tracking to handle asymmetric deactivation SPIs are especially annoying, as they can be activated on a CPU and deactivated on another. Which means that when an SPI is in flight anywhere, all CPUs need to have their TDIR trap bit set.
This translates into broadcasting an IPI across all CPUs to make sure they set their trap bit. The number of in-flight SPIs is kept in an atomic variable so that CPUs can turn the trap bit off as soon as possible. Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-32-maz@kernel.org Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index b798546755a3..6a4d3d205596 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,6 +263,9 @@ struct vgic_dist { /* The GIC maintenance IRQ for nested hypervisors. */ u32 mi_intid; + /* Track the number of in-flight active SPIs */ + atomic_t active_spis; + /* base addresses in guest physical address space: */ gpa_t vgic_dist_base; /* distributor */ union { -- cgit v1.2.3 From 255de897e7fb918a34845167c572b5bf8e1d9d79 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 20 Nov 2025 17:25:28 +0000 Subject: KVM: arm64: GICv2: Handle deactivation via GICV_DIR traps Add the plumbing of GICv2 interrupt deactivation via GICV_DIR. This requires adding a new device so that we can easily decode the DIR address. The deactivation itself is very similar to the GICv3 version.
Tested-by: Fuad Tabba Signed-off-by: Marc Zyngier Tested-by: Mark Brown Link: https://msgid.link/20251120172540.2267180-39-maz@kernel.org Signed-off-by: Oliver Upton --- include/kvm/arm_vgic.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 6a4d3d205596..b261fb3968d0 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -287,6 +287,7 @@ struct vgic_dist { struct vgic_irq *spis; struct vgic_io_device dist_iodev; + struct vgic_io_device cpuif_iodev; bool has_its; bool table_write_in_progress; -- cgit v1.2.3 From d245f9b4ab806733a77e51a218ca7b8bc3135cd9 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:52 +1000 Subject: mm/zone_device: support large zone device private folios MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: support device-private THP", v7. This patch series introduces support for Transparent Huge Page (THP) migration in zone device-private memory. The implementation enables efficient migration of large folios between system memory and device-private memory Background Current zone device-private memory implementation only supports PAGE_SIZE granularity, leading to: - Increased TLB pressure - Inefficient migration between CPU and device memory This series extends the existing zone device-private infrastructure to support THP, leading to: - Reduced page table overhead - Improved memory bandwidth utilization - Seamless fallback to base pages when needed In my local testing (using lib/test_hmm) and a throughput test, the series shows a 350% improvement in data transfer throughput and a 80% improvement in latency These patches build on the earlier posts by Ralph Campbell [1] Two new flags are added in vma_migration to select and mark compound pages. 
migrate_vma_setup(), migrate_vma_pages() and migrate_vma_finalize() support migration of these pages when MIGRATE_VMA_SELECT_COMPOUND is passed in as arguments. The series also adds zone device awareness to (m)THP pages along with fault handling of large zone device private pages. page vma walk and the rmap code is also zone device aware. Support has also been added for folios that might need to be split in the middle of migration (when the src and dst do not agree on MIGRATE_PFN_COMPOUND), that occurs when src side of the migration can migrate large pages, but the destination has not been able to allocate large pages. The code supported and used folio_split() when migrating THP pages, this is used when MIGRATE_VMA_SELECT_COMPOUND is not passed as an argument to migrate_vma_setup(). The test infrastructure lib/test_hmm.c has been enhanced to support THP migration. A new ioctl to emulate failure of large page allocations has been added to test the folio split code path. hmm-tests.c has new test cases for huge page migration and to test the folio split path. A new throughput test has been added as well. The nouveau dmem code has been enhanced to use the new THP migration capability. mTHP support: The patches hard code, HPAGE_PMD_NR in a few places, but the code has been kept generic to support various order sizes. With additional refactoring of the code support of different order sizes should be possible. The future plan is to post enhancements to support mTHP with a rough design as follows: 1. Add the notion of allowable thp orders to the HMM based test driver 2. For non PMD based THP paths in migrate_device.c, check to see if a suitable order is found and supported by the driver 3. Iterate across orders to check the highest supported order for migration 4. Migrate and finalize The mTHP patches can be built on top of this series, the key design elements that need to be worked out are infrastructure and driver support for multiple ordered pages and their migration. 
HMM support for large folios was added in 10b9feee2d0d ("mm/hmm: populate PFNs from PMD swap entry"). This patch (of 16) Add routines to support allocation of large order zone device folios and helper functions for zone device folios, to check if a folio is device private and helpers for setting zone device data. When large folios are used, the existing page_free() callback in pgmap is called when the folio is freed, this is true for both PAGE_SIZE and higher order pages. Zone device private large folios do not support deferred split and scan like normal THP folios. Link: https://lkml.kernel.org/r/20251001065707.920170-1-balbirs@nvidia.com Link: https://lkml.kernel.org/r/20251001065707.920170-2-balbirs@nvidia.com Link: https://lore.kernel.org/linux-mm/20201106005147.20113-1-rcampbell@nvidia.com/ [1] Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Cc: Madhavan Srinivasan Cc: Christophe Leroy Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index e5951ba12a28..d2487a19cba2 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -206,7 +206,7 @@ static inline bool is_fsdax_page(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void zone_device_page_init(struct page *page); +void zone_device_page_init(struct page *page, unsigned int order); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); @@ -215,6 +215,14 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); + +static inline void zone_device_folio_init(struct folio *folio, unsigned int order) +{ + zone_device_page_init(&folio->page, order); + if (order) + folio_set_large_rmappable(folio); +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) -- cgit v1.2.3 From 3a5a06554566fcc9f7de7327cfc365ed384d396c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:53 +1000 Subject: mm/zone_device: rename page_free callback to folio_free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change page_free to folio_free to make the folio support for zone device-private more consistent. The PCI P2PDMA callback has also been updated and changed to folio_free() as a result. 
For drivers that do not support folios (yet), the folio is converted back into page via &folio->page and the page is used as is, in the current callback implementation. Link: https://lkml.kernel.org/r/20251001065707.920170-3-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Cc: Madhavan Srinivasan Cc: Christophe Leroy Cc: Felix Kuehling Cc: Alex Deucher Cc: "Christian König" Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d2487a19cba2..cd28d1666801 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -77,11 +77,11 @@ enum memory_type { struct dev_pagemap_ops { /* - * Called once the page refcount reaches 0. The reference count will be + * Called once the folio refcount reaches 0. The reference count will be * reset to one by the core code after the method is called to prepare - * for handing out the page again. + * for handing out the folio again. */ - void (*page_free)(struct page *page); + void (*folio_free)(struct folio *folio); /* * Used for private (un-addressable) device memory only. 
Must migrate -- cgit v1.2.3 From 368076f52ebeecd33e10a9f80905d7508b6b6149 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:54 +1000 Subject: mm/huge_memory: add device-private THP support to PMD operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend core huge page management functions to handle device-private THP entries. This enables proper handling of large device-private folios in fundamental MM operations. The following functions have been updated: - copy_huge_pmd(): Handle device-private entries during fork/clone - zap_huge_pmd(): Properly free device-private THP during munmap - change_huge_pmd(): Support protection changes on device-private THP - __pte_offset_map(): Add device-private entry awareness Link: https://lkml.kernel.org/r/20251001065707.920170-4-balbirs@nvidia.com Signed-off-by: Matthew Brost Signed-off-by: Balbir Singh Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/swapops.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 64ea151a7ae3..2687928a8146 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -594,10 +594,42 @@ static inline int is_pmd_migration_entry(pmd_t pmd) } #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry + * @pmd: The PMD to check + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. 
+ * + * Return: 1 if PMD contains device private entry, 0 otherwise + */ +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline int is_pmd_device_private_entry(pmd_t pmd) +{ + return 0; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } +static inline int is_pmd_non_present_folio_entry(pmd_t pmd) +{ + return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ -- cgit v1.2.3 From a30b48bf1b244f11bf9b6d20cdccfe0c2264130c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:58 +1000 Subject: mm/migrate_device: implement THP migration of zone device pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MIGRATE_VMA_SELECT_COMPOUND will be used to select THP pages during migrate_vma_setup() and MIGRATE_PFN_COMPOUND will make migrating device pages as compound pages during device pfn migration. migrate_device code paths go through the collect, setup and finalize phases of migration. The entries in src and dst arrays passed to these functions still remain at a PAGE_SIZE granularity. When a compound page is passed, the first entry has the PFN along with MIGRATE_PFN_COMPOUND and other flags set (MIGRATE_PFN_MIGRATE, MIGRATE_PFN_VALID), the remaining entries (HPAGE_PMD_NR - 1) are filled with 0's. This representation allows for the compound page to be split into smaller page sizes. migrate_vma_collect_hole(), migrate_vma_collect_pmd() are now THP page aware. Two new helper functions migrate_vma_collect_huge_pmd() and migrate_vma_insert_huge_pmd_page() have been added. 
migrate_vma_collect_huge_pmd() can collect THP pages, but if for some reason this fails, there is fallback support to split the folio and migrate it. migrate_vma_insert_huge_pmd_page() closely follows the logic of migrate_vma_insert_page() Support for splitting pages as needed for migration will follow in later patches in this series. Link: https://lkml.kernel.org/r/20251001065707.920170-8-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 1f0ac122c3bf..41b4cc05a450 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -125,6 +125,7 @@ static inline int migrate_misplaced_folio(struct folio *folio, int node) #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_WRITE (1UL << 3) +#define MIGRATE_PFN_COMPOUND (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) @@ -143,6 +144,7 @@ enum migrate_vma_direction { MIGRATE_VMA_SELECT_SYSTEM = 1 << 0, MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1, MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2, + MIGRATE_VMA_SELECT_COMPOUND = 1 << 3, }; struct migrate_vma { -- cgit v1.2.3 From 4964099163d0524a769d039ffa886bb4515136d0 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:56:59 +1000 Subject: mm/memory/fault: add THP fault handling for zone device private pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Implement CPU fault handling for zone device THP entries through do_huge_pmd_device_private(), enabling transparent migration of device-private large pages back to system memory on CPU access. When the CPU accesses a zone device THP entry, the fault handler calls the device driver's migrate_to_ram() callback to migrate the entire large page back to system memory. Link: https://lkml.kernel.org/r/20251001065707.920170-9-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index fee4cf7fa300..82408c90b396 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -481,6 +481,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio) vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); +vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf); + extern struct folio *huge_zero_folio; extern unsigned long huge_zero_pfn; @@ -662,6 +664,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) return 0; } +static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf) +{ + return 0; +} + static inline bool is_huge_zero_folio(const struct folio *folio) { return false; -- cgit v1.2.3 From 775465fd26a325359887f9c3129444fcc76c6298 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:00 +1000 Subject: lib/test_hmm: add zone device private THP test infrastructure MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhance the hmm test driver (lib/test_hmm) with support for THP pages. A new pool of free_folios() has now been added to the dmirror device, which can be allocated when a request for a THP zone device private page is made. Add compound page awareness to the allocation function during normal migration and fault based migration. These routines also copy folio_nr_pages() when moving data between system memory and device memory. args.src and args.dst used to hold migration entries are now dynamically allocated (as they need to hold HPAGE_PMD_NR entries or more). Split and migrate support will be added in future patches in this series. Link: https://lkml.kernel.org/r/20251001065707.920170-10-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/memremap.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index cd28d1666801..7df4dd037b69 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -177,6 +177,18 @@ static inline bool folio_is_pci_p2pdma(const struct folio *folio) folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } +static inline void *folio_zone_device_data(const struct folio *folio) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + return folio->page.zone_device_data; +} + +static inline void folio_set_zone_device_data(struct folio *folio, void *data) +{ + VM_WARN_ON_FOLIO(!folio_is_device_private(folio), folio); + folio->page.zone_device_data = data; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && -- cgit v1.2.3 From 56ef398996435a0021569b86293d376649f12540 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:01 +1000 Subject: mm/memremap: add driver callback support for folio splitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a zone device page is split (via huge pmd folio split), the driver callback for folio_split is invoked to let the device driver know that the folio size has been split into a smaller order. Provide a default implementation for drivers that do not provide this callback that copies the pgmap and mapping fields for the split folios. Update the HMM test driver to handle the split.
Link: https://lkml.kernel.org/r/20251001065707.920170-11-balbirs@nvidia.com Signed-off-by: Balbir Singh Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/memremap.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 7df4dd037b69..aca2b16d6889 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -100,6 +100,13 @@ struct dev_pagemap_ops { */ int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn, unsigned long nr_pages, int mf_flags); + + /* + * Used for private (un-addressable) device memory only. 
+ * This callback is used when a folio is split into + * a smaller folio + */ + void (*folio_split)(struct folio *head, struct folio *tail); }; #define PGMAP_ALTMAP_VALID (1 << 0) @@ -235,6 +242,23 @@ static inline void zone_device_folio_init(struct folio *folio, unsigned int orde folio_set_large_rmappable(folio); } +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ + if (folio_is_device_private(original_folio)) { + if (!original_folio->pgmap->ops->folio_split) { + if (new_folio) { + new_folio->pgmap = original_folio->pgmap; + new_folio->page.mapping = + original_folio->page.mapping; + } + } else { + original_folio->pgmap->ops->folio_split(original_folio, + new_folio); + } + } +} + #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) @@ -268,6 +292,11 @@ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } + +static inline void zone_device_private_split_cb(struct folio *original_folio, + struct folio *new_folio) +{ +} #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) -- cgit v1.2.3 From 4265d67e405a41562634279ca1ededf79fdadcd7 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 1 Oct 2025 16:57:02 +1000 Subject: mm/migrate_device: add THP splitting during migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement migrate_vma_split_pages() to handle THP splitting during the migration process when destination cannot allocate compound pages. This addresses the common scenario where migrate_vma_setup() succeeds with MIGRATE_PFN_COMPOUND pages, but the destination device cannot allocate large pages during the migration phase. 
Key changes: - migrate_vma_split_pages(): Split already-isolated pages during migration - Enhanced folio_split() and __split_unmapped_folio() with isolated parameter to avoid redundant unmap/remap operations. This provides a fallback mechanism to ensure migration succeeds even when large page allocation fails at the destination. [matthew.brost@intel.com: add THP splitting during migration] Link: https://lkml.kernel.org/r/20251120230825.181072-2-matthew.brost@intel.com Link: https://lkml.kernel.org/r/20251001065707.920170-12-balbirs@nvidia.com Signed-off-by: Balbir Singh Signed-off-by: Matthew Brost Cc: David Hildenbrand Cc: Zi Yan Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 82408c90b396..ed99e6bd31ac 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -365,8 +365,8 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add vm_flags_t vm_flags); bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); -int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order); +int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order, bool unmapped); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); bool uniform_split_supported(struct folio *folio, unsigned int new_order, @@ -375,6 +375,13 @@ bool non_uniform_split_supported(struct
folio *folio, unsigned int new_order, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); + +static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, + unsigned int new_order) +{ + return __split_huge_page_to_list_to_order(page, list, new_order, false); +} + /* * try_folio_split_to_order - try to split a @folio at @page to @new_order using * non uniform split. -- cgit v1.2.3 From ac7756771a34f19c9a757eb86efe028e51f57b23 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 8 Oct 2025 09:54:53 +0000 Subject: mm/khugepaged: unify pmd folio installation with map_anon_folio_pmd() Currently we install pmd folio with map_anon_folio_pmd() in __do_huge_pmd_anonymous_page() and do_huge_zero_wp_pmd(). While in collapse_huge_page(), it is done with identical code except statistics adjustment. Unify the process with map_anon_folio_pmd() to install pmd folio. Split it to map_anon_folio_pmd_pf() and map_anon_folio_pmd_nopf() to be used in page fault or not respectively. No functional change is intended. 
[akpm@linux-foundation.org: remove unneeded map_anon_folio_pmd_nopf() stub, per Wei & David] Link: https://lkml.kernel.org/r/20251008095453.18772-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Lance Yang Cc: David Hildenbrand Cc: Lance Yang Cc: Dev Jain Cc: Zi Yan Cc: Usama Arif Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ed99e6bd31ac..396d9e3d1d46 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -533,6 +533,8 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, bool freeze); bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio); +void map_anon_folio_pmd_nopf(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.2.3 From a7ef12c64fd991c0f42b2e1bf0c4f09068575864 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:19:59 -0400 Subject: mm/huge_memory: add split_huge_page_to_order() Patch series "Optimize folio split in memory failure", v5. This patchset optimizes folio split operations in memory failure code by always splitting a folio to min_order_for_split() to minimize unusable pages, even if min_order_for_split() is non zero and memory failure code would take the failed path eventually for a successfully split folio. This means instead of making the entire original folio unusable memory failure code would only make its after-split folio, which has order of min_order_for_split() and contains HWPoison page, unusable. For soft offline case, since the original folio is still accessible, no split is performed if the folio cannot be split to order-0 to prevent potential performance loss. 
In addition, add split_huge_page_to_order() to improve code readability and fix kernel-doc comment format for folio_split() and other related functions. Background ========== This patchset is a follow-up of "[PATCH v3] mm/huge_memory: do not change split_huge_page*() target order silently."[1] and [PATCH v4] mm/huge_memory: preserve PG_has_hwpoisoned if a folio is split to >0 order[2], since both are separated out as hotfixes. It improves how memory failure code handles large block size(LBS) folios with min_order_for_split() > 0. By splitting a large folio containing HW poisoned pages to min_order_for_split(), the after-split folios without HW poisoned pages could be freed for reuse. To achieve this, folio split code needs to set has_hwpoisoned on after-split folios containing HW poisoned pages and it is done in the hotfix in [2]. This patchset includes: 1. A patch adds split_huge_page_to_order(), 2. Patch 2 and Patch 3 of "[PATCH v2 0/3] Do not change split folio target order"[3], This patch (of 3): When the caller does not supply a list to split_huge_page_to_list_to_order(), use split_huge_page_to_order() instead. 
Link: https://lkml.kernel.org/r/20251031162001.670503-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20251031162001.670503-2-ziy@nvidia.com Link: https://lore.kernel.org/all/20251017013630.139907-1-ziy@nvidia.com/ [1] Link: https://lore.kernel.org/all/20251023030521.473097-1-ziy@nvidia.com/ [2] Link: https://lore.kernel.org/all/20251016033452.125479-1-ziy@nvidia.com/ [3] Signed-off-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Reviewed-by: Miaohe Lin Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 396d9e3d1d46..a06924cf4065 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -381,6 +381,10 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct lis { return __split_huge_page_to_list_to_order(page, list, new_order, false); } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + return split_huge_page_to_list_to_order(page, NULL, new_order); +} /* * try_folio_split_to_order - try to split a @folio at @page to @new_order using @@ -400,8 +404,7 @@ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) - return split_huge_page_to_list_to_order(&folio->page, NULL, - new_order); + return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } static inline int split_huge_page(struct page *page) @@ -587,6 +590,11 @@ split_huge_page_to_list_to_order(struct page *page, struct 
list_head *list, VM_WARN_ON_ONCE_PAGE(1, page); return -EINVAL; } +static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) +{ + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; +} static inline int split_huge_page(struct page *page) { VM_WARN_ON_ONCE_PAGE(1, page); -- cgit v1.2.3 From 50d0598cf2c9d33e1f08c3b1a357752ea8a9b94a Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 31 Oct 2025 12:20:01 -0400 Subject: mm/huge_memory: fix kernel-doc comments for folio_split() and related try_folio_split_to_order(), folio_split, __folio_split(), and __split_unmapped_folio() do not have correct kernel-doc comment format. Fix them. [ziy@nvidia.com: kernel-doc fixup] Link: https://lkml.kernel.org/r/BE7AC5F3-9E64-4923-861D-C2C4E0CB91EB@nvidia.com [ziy@nvidia.com: add newline to fix an error and a warning from docutils] Link: https://lkml.kernel.org/r/040B38C0-23C6-4AEA-B069-69AE6DAA828B@nvidia.com Link: https://lkml.kernel.org/r/20251031162001.670503-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Lance Yang Reviewed-by: Barry Song Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Dev Jain Cc: Jane Chu Cc: Liam Howlett Cc: Luis Chamberalin Cc: Matthew Wilcox (Oracle) Cc: Naoya Horiguchi Cc: Nico Pache Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a06924cf4065..9f7f7d772fe5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -386,9 +386,9 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o return split_huge_page_to_list_to_order(page, NULL, new_order); } -/* - * try_folio_split_to_order - try to split a @folio at @page to @new_order using - * non uniform split. 
+/** + * try_folio_split_to_order() - try to split a @folio at @page to @new_order + * using non uniform split. * @folio: folio to be split * @page: split to @new_order at the given page * @new_order: the target split order @@ -398,7 +398,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o * folios are put back to LRU list. Use min_order_for_split() to get the lower * bound of @new_order. * - * Return: 0: split is successful, otherwise split failed. + * Return: 0 - split is successful, otherwise split failed. */ static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) @@ -483,6 +483,8 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, /** * folio_test_pmd_mappable - Can we map this folio with a PMD? * @folio: The folio to test + * + * Return: true - @folio can be mapped, false - @folio cannot be mapped. */ static inline bool folio_test_pmd_mappable(struct folio *folio) { -- cgit v1.2.3 From c467061fbb6eb483d59f546c145b2ff2249455e4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Nov 2025 03:41:54 +0000 Subject: mm/huge_memory: introduce enum split_type for clarity Patch series "mm/huge_memory: Define split_type and consolidate split support checks", v3. This two-patch series focuses on improving code clarity and removing redundancy in the huge memory handling logic related to folio splitting. The series is based on an original proposal to merge two significantly identical functions that check folio split support[1]. During this process, we found an opportunity to improve readability by explicitly defining the split types. Patch 1: define split_type and use it Patch 2: merge uniform_split_supported() and non_uniform_split_supported() This patch (of 2): We currently handle two distinct types of large folio splitting: * uniform split * non-uniform split Differentiating between these types using a simple boolean variable is not obvious and can harm code readability. 
This commit introduces enum split_type to explicitly define these two types. Replacing the existing boolean variable with this enumeration significantly improves code clarity and expressiveness when dealing with folio splitting logic. No functional change is expected. [akpm@linux-foundation.org: tweak layout, per David] Link: https://lkml.kernel.org/r/20251106034155.21398-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20251106034155.21398-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: "David Hildenbrand (Red Hat)" Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9f7f7d772fe5..b74708dc5b5f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -364,6 +364,11 @@ unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long add unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); +enum split_type { + SPLIT_TYPE_UNIFORM, + SPLIT_TYPE_NON_UNIFORM, +}; + bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order, bool unmapped); -- cgit v1.2.3 From 8a0e4bdddd1c998b894d879a1d22f1e745606215 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 6 Nov 2025 03:41:55 +0000 Subject: mm/huge_memory: merge uniform_split_supported() and non_uniform_split_supported() uniform_split_supported() and non_uniform_split_supported() share significantly similar logic. The only functional difference is that uniform_split_supported() includes an additional check on the requested @new_order. 
The reason for this check comes from the following two aspects: * some file system or swap cache just supports order-0 folio * the behavioral difference between uniform/non-uniform split The behavioral difference between uniform split and non-uniform: * uniform split splits folio directly to @new_order * non-uniform split creates after-split folios with orders from folio_order(folio) - 1 to new_order. This means for non-uniform split or !new_order split we should check the file system and swap cache respectively. This commit unifies the logic and merge the two functions into a single combined helper, removing redundant code and simplifying the split support checking mechanism. Link: https://lkml.kernel.org/r/20251106034155.21398-3-richard.weiyang@gmail.com Fixes: c010d47f107f ("mm: thp: split huge page to any lower order pages") Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: Zi Yan Cc: "David Hildenbrand (Red Hat)" Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nico Pache Cc: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b74708dc5b5f..19d4a5f52ca2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -374,10 +374,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list unsigned int new_order, bool unmapped); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); -bool uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); -bool non_uniform_split_supported(struct folio *folio, unsigned int new_order, - bool warns); +bool folio_split_supported(struct folio *folio, unsigned int new_order, + enum split_type split_type, bool warns); int folio_split(struct folio *folio, unsigned int new_order, struct page 
*page, struct list_head *list); @@ -408,7 +406,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { - if (!non_uniform_split_supported(folio, new_order, /* warns= */ false)) + if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false)) return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } -- cgit v1.2.3 From c093cf451094a9a03c4d4929bc30122a53038b7b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:19 +0000 Subject: mm: correctly handle UFFD PTE markers Patch series "mm: remove is_swap_[pte, pmd]() + non-swap entries, introduce leaf entries", v3. There's an established convention in the kernel that we treat leaf page tables (so far at the PTE, PMD level) as containing 'swap entries' should they be neither empty (i.e. p**_none() evaluating true) nor present (i.e. p**_present() evaluating true). However, at the same time we also have helper predicates - is_swap_pte(), is_swap_pmd() - which are inconsistently used. This is problematic, as it is logical to assume that should somebody wish to operate upon a page table swap entry they should first check to see if it is in fact one. It also implies that perhaps, in future, we might introduce a non-present, non-none page table entry that is not a swap entry. This series resolves this issue by systematically eliminating all use of the is_swap_pte() and is_swap_pmd() predicates so we retain only the convention that should a leaf page table entry be neither none nor present it is a swap entry. We also have the further issue that 'swap entry' is unfortunately a really rather overloaded term and in fact refers to both entries for swap and for other information such as migration entries, page table markers, and device private entries.
We therefore have the rather 'unique' concept of a 'non-swap' swap entry. This series therefore introduces the concept of 'software leaf entries', of type softleaf_t, to eliminate this confusion. A software leaf entry in this sense is any page table entry which is non-present, and represented by the softleaf_t type. That is - page table leaf entries which are software-controlled by the kernel. This includes 'none' or empty entries, which are simply represented by a zero leaf entry value. In order to maintain compatibility as we transition the kernel to this new type, we simply typedef swp_entry_t to softleaf_t. We introduce a number of predicates and helpers to interact with software leaf entries in include/linux/leafops.h which, as it imports swapops.h, can be treated as a drop-in replacement for swapops.h wherever leaf entry helpers are used. Since softleaf_from_[pte, pmd]() treats present entries as if they were empty/none leaf entries, this allows for a great deal of simplification of code throughout the code base, which this series utilises a great deal. We additionally change from swap entry to software leaf entry handling where it makes sense to and eliminate functions from swapops.h where software leaf entries obviate the need for the functions. This patch (of 16): PTE markers were previously only concerned with UFFD-specific logic - that is, PTE entries with the UFFD WP marker set or those marked via UFFDIO_POISON. However since the introduction of guard markers in commit 7c53dfbdb024 ("mm: add PTE_MARKER_GUARD PTE marker"), this has no longer been the case. Issues have been avoided as guard regions are not permitted in conjunction with UFFD, but it still leaves very confusing logic in place, most notably the misleading and poorly named pte_none_mostly() and huge_pte_none_mostly().
This predicate returns true for PTE entries that ought to be treated as none, but only in certain circumstances, and on the assumption we are dealing with H/W poison markers or UFFD WP markers. This patch removes these functions and makes each invocation of these functions instead explicitly check what it needs to check. As part of this effort it introduces is_uffd_pte_marker() to explicitly determine if a marker in fact is used as part of UFFD or not. In the HMM logic we note that the only time we would need to check for a fault is in the case of a UFFD WP marker, otherwise we simply encounter a fault error (VM_FAULT_HWPOISON for H/W poisoned marker, VM_FAULT_SIGSEGV for a guard marker), so only check for the UFFD WP case. While we're here we also refactor code to make it easier to understand. [akpm@linux-foundation.org: fix comment typo, per Mike] Link: https://lkml.kernel.org/r/cover.1762812360.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/c38625fd9a1c1f1cf64ae8a248858e45b3dcdf11.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven 
Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/asm-generic/hugetlb.h | 8 -------- include/linux/swapops.h | 18 ------------------ include/linux/userfaultfd_k.h | 21 +++++++++++++++++++++ 3 files changed, 21 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index dcb8727f2b82..e1a2e1b7c8e7 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -97,14 +97,6 @@ static inline int huge_pte_none(pte_t pte) } #endif -/* Please refer to comments above pte_none_mostly() for the usage */ -#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY -static inline int huge_pte_none_mostly(pte_t pte) -{ - return huge_pte_none(pte) || is_pte_marker(pte); -} -#endif - #ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 2687928a8146..d1f665935cfc 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -469,24 +469,6 @@ static inline int is_guard_swp_entry(swp_entry_t entry) (pte_marker_get(entry) & PTE_MARKER_GUARD); } -/* - * This is a special version to check pte_none() just to cover the case when - * the pte is a pte marker. It existed because in many cases the pte marker - * should be seen as a none pte; it's just that we have stored some information - * onto the none pte so it becomes not-none any more. - * - * It should be used when the pte is file-backed, ram-based and backing - * userspace pages, like shmem. It is not needed upon pgtables that do not - * support pte markers at all. For example, it's not needed on anonymous - * memory, kernel-only memory (including when the system is during-boot), - * non-ram based generic file-system. It's fine to be used even there, but the - * extra pte marker check will be pure overhead. 
- */ -static inline int pte_none_mostly(pte_t pte) -{ - return pte_none(pte) || is_pte_marker(pte); -} - static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c0e716aec26a..da0b4fcc566f 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -479,4 +479,25 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte) return false; } + +static inline bool is_uffd_pte_marker(pte_t pte) +{ + swp_entry_t entry; + + if (pte_present(pte)) + return false; + + entry = pte_to_swp_entry(pte); + if (!is_pte_marker_entry(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD handled. */ + if (pte_marker_entry_uffd_wp(entry)) + return true; + if (is_poisoned_swp_entry(entry)) + return true; + + return false; +} + #endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.2.3 From 68aa2fdbf57f769e552f472ddb762aba028a207e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:20 +0000 Subject: mm: introduce leaf entry type and use to simplify leaf entry logic The kernel maintains leaf page table entries which contain either: - Nothing ('none' entries) - Present entries* - Everything else that will cause a fault which the kernel handles * Present entries are either entries the hardware can navigate without page fault or special cases like NUMA hint protnone or PMD with cleared present bit which contain hardware-valid entries modulo the present bit. In the 'everything else' group we include swap entries, but we also include a number of other things such as migration entries, device private entries and marker entries. Unfortunately this 'everything else' group expresses everything through a swp_entry_t type, and these entries are referred to as swap entries even though they may well not contain a... swap entry.
This is compounded by the rather mind-boggling concept of a non-swap swap entry (checked via non_swap_entry()) and the means by which we twist and turn to satisfy this. This patch lays the foundation for reducing this confusion. We refer to 'everything else' as a 'software-defined leaf entry' or 'softleaf' for short. And in fact we scoop up the 'none' entries into this concept also so we are left with: - Present entries. - Softleaf entries (which may be empty). This allows for radical simplification across the board - one can simply convert any leaf page table entry to a leaf entry via softleaf_from_pte(). If the entry is present, we return an empty leaf entry, so it is assumed the caller is aware that they must differentiate between the two categories of page table entries, checking for the former via pte_present(). As a result, we can eliminate a number of places where we would otherwise need to use predicates to see if we can proceed with leaf page table entry conversion and instead just go ahead and do it unconditionally. We do so where we can, adjusting surrounding logic as necessary to integrate the new softleaf_t logic as far as seems reasonable at this stage. We typedef swp_entry_t to softleaf_t for the time being until the conversion can be complete, meaning everything remains compatible regardless of which type is used. We will eventually remove swp_entry_t when the conversion is complete. We introduce a new header file to keep things clear - leafops.h - this imports swapops.h so can directly replace swapops imports without issue, and we do so in all the files that require it. Additionally, add new leafops.h file to core mm maintainers entry.
Link: https://lkml.kernel.org/r/c879383aac77d96a03e4d38f7daba893cd35fc76.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/leafops.h | 387 ++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_inline.h | 6 +- include/linux/mm_types.h | 25 +++ include/linux/swapops.h | 28 --- include/linux/userfaultfd_k.h | 51 +----- 5 files changed, 417 insertions(+), 80 deletions(-) create mode 100644 include/linux/leafops.h (limited to 'include') diff --git a/include/linux/leafops.h b/include/linux/leafops.h new file mode 100644 index 000000000000..cff9d94fd5d1 --- /dev/null +++ b/include/linux/leafops.h @@ -0,0 +1,387 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Describes operations that can be performed on software-defined page table + * leaf entries. These are abstracted from the hardware page table entries + * themselves by the softleaf_t type, see mm_types.h. 
+ */ +#ifndef _LINUX_LEAFOPS_H +#define _LINUX_LEAFOPS_H + +#include +#include +#include + +#ifdef CONFIG_MMU + +/* Temporary until swp_entry_t eliminated. */ +#define LEAF_TYPE_SHIFT SWP_TYPE_SHIFT + +enum softleaf_type { + /* Fundamental types. */ + SOFTLEAF_NONE, + SOFTLEAF_SWAP, + /* Migration types. */ + SOFTLEAF_MIGRATION_READ, + SOFTLEAF_MIGRATION_READ_EXCLUSIVE, + SOFTLEAF_MIGRATION_WRITE, + /* Device types. */ + SOFTLEAF_DEVICE_PRIVATE_READ, + SOFTLEAF_DEVICE_PRIVATE_WRITE, + SOFTLEAF_DEVICE_EXCLUSIVE, + /* H/W poison types. */ + SOFTLEAF_HWPOISON, + /* Marker types. */ + SOFTLEAF_MARKER, +}; + +/** + * softleaf_mk_none() - Create an empty ('none') leaf entry. + * Returns: empty leaf entry. + */ +static inline softleaf_t softleaf_mk_none(void) +{ + return ((softleaf_t) { 0 }); +} + +/** + * softleaf_from_pte() - Obtain a leaf entry from a PTE entry. + * @pte: PTE entry. + * + * If @pte is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pte(pte_t pte) +{ + if (pte_present(pte) || pte_none(pte)) + return softleaf_mk_none(); + + /* Temporary until swp_entry_t eliminated. */ + return pte_to_swp_entry(pte); +} + +/** + * softleaf_is_none() - Is the leaf entry empty? + * @entry: Leaf entry. + * + * Empty entries are typically the result of a 'none' page table leaf entry + * being converted to a leaf entry. + * + * Returns: true if the entry is empty, false otherwise. + */ +static inline bool softleaf_is_none(softleaf_t entry) +{ + return entry.val == 0; +} + +/** + * softleaf_type() - Identify the type of leaf entry. + * @entry: Leaf entry. + * + * Returns: the leaf entry type associated with @entry.
+ */ +static inline enum softleaf_type softleaf_type(softleaf_t entry) +{ + unsigned int type_num; + + if (softleaf_is_none(entry)) + return SOFTLEAF_NONE; + + type_num = entry.val >> LEAF_TYPE_SHIFT; + + if (type_num < MAX_SWAPFILES) + return SOFTLEAF_SWAP; + + switch (type_num) { +#ifdef CONFIG_MIGRATION + case SWP_MIGRATION_READ: + return SOFTLEAF_MIGRATION_READ; + case SWP_MIGRATION_READ_EXCLUSIVE: + return SOFTLEAF_MIGRATION_READ_EXCLUSIVE; + case SWP_MIGRATION_WRITE: + return SOFTLEAF_MIGRATION_WRITE; +#endif +#ifdef CONFIG_DEVICE_PRIVATE + case SWP_DEVICE_WRITE: + return SOFTLEAF_DEVICE_PRIVATE_WRITE; + case SWP_DEVICE_READ: + return SOFTLEAF_DEVICE_PRIVATE_READ; + case SWP_DEVICE_EXCLUSIVE: + return SOFTLEAF_DEVICE_EXCLUSIVE; +#endif +#ifdef CONFIG_MEMORY_FAILURE + case SWP_HWPOISON: + return SOFTLEAF_HWPOISON; +#endif + case SWP_PTE_MARKER: + return SOFTLEAF_MARKER; + } + + /* Unknown entry type. */ + VM_WARN_ON_ONCE(1); + return SOFTLEAF_NONE; +} + +/** + * softleaf_is_swap() - Is this leaf entry a swap entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a swap entry, otherwise false. + */ +static inline bool softleaf_is_swap(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_SWAP; +} + +/** + * softleaf_is_migration() - Is this leaf entry a migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a migration entry, otherwise false. + */ +static inline bool softleaf_is_migration(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_MIGRATION_READ: + case SOFTLEAF_MIGRATION_READ_EXCLUSIVE: + case SOFTLEAF_MIGRATION_WRITE: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_private() - Is this leaf entry a device private entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private entry, otherwise false. 
+ */ +static inline bool softleaf_is_device_private(softleaf_t entry) +{ + switch (softleaf_type(entry)) { + case SOFTLEAF_DEVICE_PRIVATE_WRITE: + case SOFTLEAF_DEVICE_PRIVATE_READ: + return true; + default: + return false; + } +} + +/** + * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device exclusive entry, otherwise false. + */ +static inline bool softleaf_is_device_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_EXCLUSIVE; +} + +/** + * softleaf_is_hwpoison() - Is this leaf entry a hardware poison entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a hardware poison entry, otherwise false. + */ +static inline bool softleaf_is_hwpoison(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_HWPOISON; +} + +/** + * softleaf_is_marker() - Is this leaf entry a marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a marker entry, otherwise false. + */ +static inline bool softleaf_is_marker(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MARKER; +} + +/** + * softleaf_to_marker() - Obtain marker associated with leaf entry. + * @entry: Leaf entry, softleaf_is_marker(@entry) must return true. + * + * Returns: Marker associated with the leaf entry. + */ +static inline pte_marker softleaf_to_marker(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_marker(entry)); + + return swp_offset(entry) & PTE_MARKER_MASK; +} + +/** + * softleaf_has_pfn() - Does this leaf entry encode a valid PFN number? + * @entry: Leaf entry. + * + * A pfn swap entry is a special type of swap entry that always has a pfn stored + * in the swap offset. They can either be used to represent unaddressable device + * memory, to restrict access to a page undergoing migration or to represent a + * pfn which has been hwpoisoned and unmapped. 
+ * + * Returns: true if the leaf entry encodes a PFN, otherwise false. + */ +static inline bool softleaf_has_pfn(softleaf_t entry) +{ + /* Make sure the swp offset can always store the needed fields. */ + BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); + + if (softleaf_is_migration(entry)) + return true; + if (softleaf_is_device_private(entry)) + return true; + if (softleaf_is_device_exclusive(entry)) + return true; + if (softleaf_is_hwpoison(entry)) + return true; + + return false; +} + +/** + * softleaf_to_pfn() - Obtain PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: The PFN associated with the leaf entry. + */ +static inline unsigned long softleaf_to_pfn(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return swp_offset_pfn(entry); +} + +/** + * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct page associated with the leaf entry's PFN. + */ +static inline struct page *softleaf_to_page(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return pfn_swap_entry_to_page(entry); +} + +/** + * softleaf_to_folio() - Obtains struct folio for PFN encoded within leaf entry. + * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. + * + * Returns: Pointer to the struct folio associated with the leaf entry's PFN. + */ +static inline struct folio *softleaf_to_folio(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + + /* Temporary until swp_entry_t eliminated. */ + return pfn_swap_entry_folio(entry); +} + +/** + * softleaf_is_poison_marker() - Is this leaf entry a poison marker? + * @entry: Leaf entry. + * + * The poison marker is set via UFFDIO_POISON. Userfaultfd-specific. 
+ * + * Returns: true if the leaf entry is a poison marker, otherwise false. + */ +static inline bool softleaf_is_poison_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_POISONED; +} + +/** + * softleaf_is_guard_marker() - Is this leaf entry a guard region marker? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a guard marker, otherwise false. + */ +static inline bool softleaf_is_guard_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_GUARD; +} + +/** + * softleaf_is_uffd_wp_marker() - Is this leaf entry a userfaultfd write protect + * marker? + * @entry: Leaf entry. + * + * Userfaultfd-specific. + * + * Returns: true if the leaf entry is a UFFD WP marker, otherwise false. + */ +static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) +{ + if (!softleaf_is_marker(entry)) + return false; + + return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; +} + +/** + * pte_is_marker() - Does the PTE entry encode a marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a marker leaf entry, otherwise false. + */ +static inline bool pte_is_marker(pte_t pte) +{ + return softleaf_is_marker(softleaf_from_pte(pte)); +} + +/** + * pte_is_uffd_wp_marker() - Does this PTE entry encode a userfaultfd write + * protect marker leaf entry? + * @pte: PTE entry. + * + * Returns: true if this PTE is a UFFD WP marker leaf entry, otherwise false. + */ +static inline bool pte_is_uffd_wp_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + return softleaf_is_uffd_wp_marker(entry); +} + +/** + * pte_is_uffd_marker() - Does this PTE entry encode a userfault-specific marker + * leaf entry? + * @pte: PTE entry. + * + * It's useful to be able to determine which leaf entries encode UFFD-specific + * markers so we can handle these correctly.
+ * + * Returns: true if this PTE entry is a UFFD-specific marker, otherwise false. + */ +static inline bool pte_is_uffd_marker(pte_t pte) +{ + const softleaf_t entry = softleaf_from_pte(pte); + + if (!softleaf_is_marker(entry)) + return false; + + /* UFFD WP, poisoned swap entries are UFFD-handled. */ + if (softleaf_is_uffd_wp_marker(entry)) + return true; + if (softleaf_is_poison_marker(entry)) + return true; + + return false; +} + +#endif /* CONFIG_MMU */ +#endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f6a2b2d20016..ca7a18351797 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -541,9 +541,9 @@ static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( - swp_entry_t entry, struct vm_area_struct *dst_vma) + softleaf_t entry, struct vm_area_struct *dst_vma) { - pte_marker srcm = pte_marker_get(entry); + const pte_marker srcm = softleaf_to_marker(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5021047485a9..4f66a3206a63 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -285,6 +285,31 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * typedef softleaf_t - Describes a page table software leaf entry, abstracted + * from its architecture-specific encoding. + * + * Page table leaf entries are those which do not reference any descendent page + * tables but rather either reference a data page, are an empty (or 'none' + * entry), or contain a non-present entry. 
+ * + * If referencing another page table or a data page then the page table entry is + * pertinent to hardware - that is it tells the hardware how to decode the page + * table entry. + * + * Otherwise it is a software-defined leaf page table entry, which this type + * describes. See leafops.h and specifically @softleaf_type for a list of all + * possible kinds of software leaf entry. + * + * A softleaf_t entry is abstracted from the hardware page table entry, so is + * not architecture-specific. + * + * NOTE: While we transition from the confusing swp_entry_t type used for this + * purpose, we simply alias this type. This will be removed once the + * transition is complete. + */ +typedef swp_entry_t softleaf_t; + #if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT) /* We have some extra room after the refcount in tail pages. */ #define NR_PAGES_IN_LARGE_FOLIO diff --git a/include/linux/swapops.h b/include/linux/swapops.h index d1f665935cfc..0a4b3f51ecf5 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -426,21 +426,6 @@ static inline swp_entry_t make_pte_marker_entry(pte_marker marker) return swp_entry(SWP_PTE_MARKER, marker); } -static inline bool is_pte_marker_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_PTE_MARKER; -} - -static inline pte_marker pte_marker_get(swp_entry_t entry) -{ - return swp_offset(entry) & PTE_MARKER_MASK; -} - -static inline bool is_pte_marker(pte_t pte) -{ - return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte)); -} - static inline pte_t make_pte_marker(pte_marker marker) { return swp_entry_to_pte(make_pte_marker_entry(marker)); @@ -451,24 +436,11 @@ static inline swp_entry_t make_poisoned_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_poisoned_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_POISONED); - -} - static inline swp_entry_t make_guard_swp_entry(void) { return 
make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline int is_guard_swp_entry(swp_entry_t entry) -{ - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_GUARD); -} - static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index da0b4fcc566f..983c860a00f1 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include @@ -434,32 +434,6 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) return userfaultfd_wp_unpopulated(vma); } -static inline bool pte_marker_entry_uffd_wp(swp_entry_t entry) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_UFFD_WP); -#else - return false; -#endif -} - -static inline bool pte_marker_uffd_wp(pte_t pte) -{ -#ifdef CONFIG_PTE_MARKER_UFFD_WP - swp_entry_t entry; - - if (!is_swap_pte(pte)) - return false; - - entry = pte_to_swp_entry(pte); - - return pte_marker_entry_uffd_wp(entry); -#else - return false; -#endif -} - /* * Returns true if this is a swap pte and was uffd-wp wr-protected in either * forms (pte marker or a normal swap pte), false otherwise. @@ -473,31 +447,10 @@ static inline bool pte_swp_uffd_wp_any(pte_t pte) if (pte_swp_uffd_wp(pte)) return true; - if (pte_marker_uffd_wp(pte)) + if (pte_is_uffd_wp_marker(pte)) return true; #endif return false; } - -static inline bool is_uffd_pte_marker(pte_t pte) -{ - swp_entry_t entry; - - if (pte_present(pte)) - return false; - - entry = pte_to_swp_entry(pte); - if (!is_pte_marker_entry(entry)) - return false; - - /* UFFD WP, poisoned swap entries are UFFD handled. 
*/ - if (pte_marker_entry_uffd_wp(entry)) - return true; - if (is_poisoned_swp_entry(entry)) - return true; - - return false; -} - #endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.2.3 From fb888710e26a8a8a37dc0f8ed09a3c908c63eb71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:21 +0000 Subject: mm: avoid unnecessary uses of is_swap_pte() There's an established convention in the kernel that we treat PTEs as containing swap entries (and the unfortunately named non-swap swap entries) should they be neither empty (i.e. pte_none() evaluating true) nor present (i.e. pte_present() evaluating true). However, there is some inconsistency in how this is applied, as we also have the is_swap_pte() helper which explicitly performs this check: /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { return !pte_none(pte) && !pte_present(pte); } As this represents a predicate, and it's logical to assume that in order to establish that a PTE entry can correctly be manipulated as a swap/non-swap entry, this predicate seems as if it must first be checked. But instead, we far more often utilise the established convention of checking pte_none() / pte_present() before operating on entries as if they were swap/non-swap. This patch works towards correcting this inconsistency by removing all uses of is_swap_pte() where we are already in a position where we perform pte_none()/pte_present() checks anyway or otherwise it is clearly logical to do so. We also take advantage of the fact that pte_swp_uffd_wp() is only set on swap entries. Additionally, update comments referencing is_swap_pte() and non_swap_entry(). No functional change intended.
Link: https://lkml.kernel.org/r/17fd6d7f46a846517fd455fadd640af47fcd7c55.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/userfaultfd_k.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 983c860a00f1..96b089dff4ef 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -441,9 +441,8 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) static inline bool pte_swp_uffd_wp_any(pte_t pte) { #ifdef CONFIG_PTE_MARKER_UFFD_WP - if (!is_swap_pte(pte)) + if (pte_present(pte)) return false; - if (pte_swp_uffd_wp(pte)) return true; -- cgit v1.2.3 From fb410d8b89e89ef61b18326f07c477f563b631f6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:23 +0000 Subject: mm: use leaf entries in debug pgtable + remove is_swap_pte() Remove invocations of is_swap_pte() in 
mm/debug_vm_pgtable.c and use softleaf_from_pte() and softleaf_is_swap() as necessary to replace this usage. We update the test code to use a 'true' swap entry throughout so we are guaranteed this is not a non-swap entry, so all asserts continue to operate correctly. With this change in place, we no longer use is_swap_pte() anywhere, so remove it. Link: https://lkml.kernel.org/r/222f352e7a99191b4bdfa77e835f2fc0dd83fa72.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 0a4b3f51ecf5..a66ac4f2105c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -120,12 +120,6 @@ static inline unsigned long swp_offset_pfn(swp_entry_t entry) return swp_offset(entry) & SWP_PFN_MASK; } -/* check whether a pte points to a swap entry */ -static inline int is_swap_pte(pte_t pte) -{ - return !pte_none(pte) && 
!pte_present(pte); -} - /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. -- cgit v1.2.3 From aa62204cb680d8ff32497181fc9e0dac4956f7e5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:25 +0000 Subject: mm: avoid unnecessary use of is_swap_pmd() PMD 'non-swap' swap entries are currently used for PMD-level migration entries and device private entries. To add to the confusion in this terminology we use is_swap_pmd() in an inconsistent way similar to how is_swap_pte() was being used - sometimes adopting the convention that !pmd_none(), !pmd_present() implies PMD 'swap' entry, sometimes not. This patch handles the low-hanging fruit of cases where we can simply substitute other predicates for is_swap_pmd(). No functional change intended. Link: https://lkml.kernel.org/r/8a1704b36a009c18032d5bea4cb68e71448fbbe5.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- 
include/linux/swapops.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index a66ac4f2105c..3e8dd6ea94dd 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -509,7 +509,13 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) static inline int is_pmd_migration_entry(pmd_t pmd) { - return is_swap_pmd(pmd) && is_migration_entry(pmd_to_swp_entry(pmd)); + swp_entry_t entry; + + if (pmd_present(pmd)) + return 0; + + entry = pmd_to_swp_entry(pmd); + return is_migration_entry(entry); } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, @@ -557,7 +563,13 @@ static inline int is_pmd_migration_entry(pmd_t pmd) */ static inline int is_pmd_device_private_entry(pmd_t pmd) { - return is_swap_pmd(pmd) && is_device_private_entry(pmd_to_swp_entry(pmd)); + swp_entry_t entry; + + if (pmd_present(pmd)) + return 0; + + entry = pmd_to_swp_entry(pmd); + return is_device_private_entry(entry); } #else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ -- cgit v1.2.3 From 0ac881efe16468503e8c1e7d8a7210b75f027ce3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:28 +0000 Subject: mm: replace pmd_to_swp_entry() with softleaf_from_pmd() Introduce softleaf_from_pmd() to do the equivalent operation for PMDs that softleaf_from_pte() fulfils, and cascade changes through code base accordingly, introducing helpers as necessary. We are then able to eliminate pmd_to_swp_entry(), is_pmd_migration_entry(), is_pmd_device_private_entry() and is_pmd_non_present_folio_entry(). This further establishes the use of leaf operations throughout the code base and further establishes the foundations for eliminating is_swap_pmd(). No functional change intended. 
[lorenzo.stoakes@oracle.com: check writable, not readable/writable, per Vlastimil] Link: https://lkml.kernel.org/r/cd97b6ec-00f9-45a4-9ae0-8f009c212a94@lucifer.local Link: https://lkml.kernel.org/r/3fb431699639ded8fdc63d2210aa77a38c8891f1.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: SeongJae Park \ Reviewed-by: Vlastimil Babka Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 218 +++++++++++++++++++++++++++++++++++++++++++++++- include/linux/migrate.h | 2 +- include/linux/swapops.h | 100 ---------------------- 3 files changed, 217 insertions(+), 103 deletions(-) (limited to 'include') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index cff9d94fd5d1..f5ea9b0385ff 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -61,6 +61,57 @@ static inline softleaf_t softleaf_from_pte(pte_t pte) return pte_to_swp_entry(pte); } +/** + * softleaf_to_pte() - Obtain a PTE entry from a leaf entry. + * @entry: Leaf entry. 
+ * + * This generates an architecture-specific PTE entry that can be utilised to + * encode the metadata the leaf entry encodes. + * + * Returns: Architecture-specific PTE entry encoding leaf entry. + */ +static inline pte_t softleaf_to_pte(softleaf_t entry) +{ + /* Temporary until swp_entry_t eliminated. */ + return swp_entry_to_pte(entry); +} + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +/** + * softleaf_from_pmd() - Obtain a leaf entry from a PMD entry. + * @pmd: PMD entry. + * + * If @pmd is present (therefore not a leaf entry) the function returns an empty + * leaf entry. Otherwise, it returns a leaf entry. + * + * Returns: Leaf entry. + */ +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + softleaf_t arch_entry; + + if (pmd_present(pmd) || pmd_none(pmd)) + return softleaf_mk_none(); + + if (pmd_swp_soft_dirty(pmd)) + pmd = pmd_swp_clear_soft_dirty(pmd); + if (pmd_swp_uffd_wp(pmd)) + pmd = pmd_swp_clear_uffd_wp(pmd); + arch_entry = __pmd_to_swp_entry(pmd); + + /* Temporary until swp_entry_t eliminated. */ + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); +} + +#else + +static inline softleaf_t softleaf_from_pmd(pmd_t pmd) +{ + return softleaf_mk_none(); +} + +#endif + /** * softleaf_is_none() - Is the leaf entry empty? * @entry: Leaf entry. @@ -134,6 +185,43 @@ static inline bool softleaf_is_swap(softleaf_t entry) return softleaf_type(entry) == SOFTLEAF_SWAP; } +/** + * softleaf_is_migration_write() - Is this leaf entry a writable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a writable migration entry, otherwise + * false. + */ +static inline bool softleaf_is_migration_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_WRITE; +} + +/** + * softleaf_is_migration_read() - Is this leaf entry a readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a readable migration entry, otherwise + * false. 
+ */ +static inline bool softleaf_is_migration_read(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ; +} + +/** + * softleaf_is_migration_read_exclusive() - Is this leaf entry an exclusive + * readable migration entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is an exclusive readable migration entry, + * otherwise false. + */ +static inline bool softleaf_is_migration_read_exclusive(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_MIGRATION_READ_EXCLUSIVE; +} + /** * softleaf_is_migration() - Is this leaf entry a migration entry? * @entry: Leaf entry. @@ -152,6 +240,19 @@ static inline bool softleaf_is_migration(softleaf_t entry) } } +/** + * softleaf_is_device_private_write() - Is this leaf entry a device private + * writable entry? + * @entry: Leaf entry. + * + * Returns: true if the leaf entry is a device private writable entry, otherwise + * false. + */ +static inline bool softleaf_is_device_private_write(softleaf_t entry) +{ + return softleaf_type(entry) == SOFTLEAF_DEVICE_PRIVATE_WRITE; +} + /** * softleaf_is_device_private() - Is this leaf entry a device private entry? * @entry: Leaf entry. @@ -170,10 +271,10 @@ static inline bool softleaf_is_device_private(softleaf_t entry) } /** - * softleaf_is_device_exclusive() - Is this leaf entry a device exclusive entry? + * softleaf_is_device_exclusive() - Is this leaf entry a device-exclusive entry? * @entry: Leaf entry. * - * Returns: true if the leaf entry is a device exclusive entry, otherwise false. + * Returns: true if the leaf entry is a device-exclusive entry, otherwise false. */ static inline bool softleaf_is_device_exclusive(softleaf_t entry) { @@ -332,6 +433,61 @@ static inline bool softleaf_is_uffd_wp_marker(softleaf_t entry) return softleaf_to_marker(entry) & PTE_MARKER_UFFD_WP; } +#ifdef CONFIG_MIGRATION + +/** + * softleaf_is_migration_young() - Does this migration entry contain an accessed + * bit? + * @entry: Leaf entry. 
+ * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the accessed (or 'young') bit was set on the migrated page + * table entry. + * + * Returns: true if the entry contains an accessed bit, otherwise false. + */ +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_YOUNG; + /* Keep the old behavior of aging page after migration */ + return false; +} + +/** + * softleaf_is_migration_dirty() - Does this migration entry contain a dirty bit? + * @entry: Leaf entry. + * + * If the architecture can support storing A/D bits in migration entries, this + * determines whether the dirty bit was set on the migrated page table entry. + * + * Returns: true if the entry contains a dirty bit, otherwise false. + */ +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + VM_WARN_ON_ONCE(!softleaf_is_migration(entry)); + + if (migration_entry_supports_ad()) + return swp_offset(entry) & SWP_MIG_DIRTY; + /* Keep the old behavior of clean page after migration */ + return false; +} + +#else /* CONFIG_MIGRATION */ + +static inline bool softleaf_is_migration_young(softleaf_t entry) +{ + return false; +} + +static inline bool softleaf_is_migration_dirty(softleaf_t entry) +{ + return false; +} +#endif /* CONFIG_MIGRATION */ + /** * pte_is_marker() - Does the PTE entry encode a marker leaf entry? * @pte: PTE entry. @@ -383,5 +539,63 @@ static inline bool pte_is_uffd_marker(pte_t pte) return false; } +#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) + +/** + * pmd_is_device_private_entry() - Check if PMD contains a device private swap + * entry. + * @pmd: The PMD to check. + * + * Returns true if the PMD contains a swap entry that represents a device private + * page mapping. 
This is used for zone device private pages that have been + * swapped out but still need special handling during various memory management + * operations. + * + * Return: true if PMD contains device private entry, false otherwise + */ +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return softleaf_is_device_private(softleaf_from_pmd(pmd)); +} + +#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +static inline bool pmd_is_device_private_entry(pmd_t pmd) +{ + return false; +} + +#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ + +/** + * pmd_is_migration_entry() - Does this PMD entry encode a migration entry? + * @pmd: PMD entry. + * + * Returns: true if the PMD encodes a migration entry, otherwise false. + */ +static inline bool pmd_is_migration_entry(pmd_t pmd) +{ + return softleaf_is_migration(softleaf_from_pmd(pmd)); +} + +/** + * pmd_is_valid_softleaf() - Is this PMD entry a valid leaf entry? + * @pmd: PMD entry. + * + * PMD leaf entries are valid only if they are device private or migration + * entries. This function asserts that a PMD leaf entry is valid in this + * respect. + * + * Returns: true if the PMD entry is a valid leaf entry, otherwise false. + */ +static inline bool pmd_is_valid_softleaf(pmd_t pmd) +{ + const softleaf_t entry = softleaf_from_pmd(pmd); + + /* Only device private, migration entries valid for PMD. 
*/ + return softleaf_is_device_private(entry) || + softleaf_is_migration(entry); +} + #endif /* CONFIG_MMU */ #endif /* _LINUX_LEAFOPS_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 41b4cc05a450..26ca00c325d9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -65,7 +65,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl) +void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl) __releases(ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); int folio_migrate_mapping(struct address_space *mapping, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3e8dd6ea94dd..f1277647262d 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -283,14 +283,6 @@ static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_YOUNG; - /* Keep the old behavior of aging page after migration */ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { if (migration_entry_supports_ad()) @@ -299,14 +291,6 @@ static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - if (migration_entry_supports_ad()) - return swp_offset(entry) & SWP_MIG_DIRTY; - /* Keep the old behavior of clean page after migration */ - return false; -} - extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte); @@ -349,20 +333,11 @@ static inline swp_entry_t 
make_migration_entry_young(swp_entry_t entry) return entry; } -static inline bool is_migration_entry_young(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry) { return entry; } -static inline bool is_migration_entry_dirty(swp_entry_t entry) -{ - return false; -} #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_MEMORY_FAILURE @@ -487,18 +462,6 @@ extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd); -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - swp_entry_t arch_entry; - - if (pmd_swp_soft_dirty(pmd)) - pmd = pmd_swp_clear_soft_dirty(pmd); - if (pmd_swp_uffd_wp(pmd)) - pmd = pmd_swp_clear_uffd_wp(pmd); - arch_entry = __pmd_to_swp_entry(pmd); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { swp_entry_t arch_entry; @@ -507,23 +470,7 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) return __swp_entry_to_pmd(arch_entry); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - swp_entry_t entry; - - if (pmd_present(pmd)) - return 0; - - entry = pmd_to_swp_entry(pmd); - return is_migration_entry(entry); -} #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, - struct page *page) -{ - BUILD_BUG(); -} - static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { @@ -532,64 +479,17 @@ static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { } -static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd) -{ - return swp_entry(0, 0); -} - static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) { return __pmd(0); } -static inline int is_pmd_migration_entry(pmd_t pmd) -{ - return 0; -} #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -#if 
defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_ARCH_ENABLE_THP_MIGRATION) - -/** - * is_pmd_device_private_entry() - Check if PMD contains a device private swap entry - * @pmd: The PMD to check - * - * Returns true if the PMD contains a swap entry that represents a device private - * page mapping. This is used for zone device private pages that have been - * swapped out but still need special handling during various memory management - * operations. - * - * Return: 1 if PMD contains device private entry, 0 otherwise - */ -static inline int is_pmd_device_private_entry(pmd_t pmd) -{ - swp_entry_t entry; - - if (pmd_present(pmd)) - return 0; - - entry = pmd_to_swp_entry(pmd); - return is_device_private_entry(entry); -} - -#else /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ - -static inline int is_pmd_device_private_entry(pmd_t pmd) -{ - return 0; -} - -#endif /* CONFIG_ZONE_DEVICE && CONFIG_ARCH_ENABLE_THP_MIGRATION */ - static inline int non_swap_entry(swp_entry_t entry) { return swp_type(entry) >= MAX_SWAPFILES; } -static inline int is_pmd_non_present_folio_entry(pmd_t pmd) -{ - return is_pmd_migration_entry(pmd) || is_pmd_device_private_entry(pmd); -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ -- cgit v1.2.3 From 15eabc898dc58c9e97eb9ddd56dc6b893e7d0d0e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:29 +0000 Subject: mm: introduce pmd_is_huge() and use where appropriate The leaf entry PMD case is confusing as only migration entries and device private entries are valid at PMD level, not true swap entries. We repeatedly perform checks of the form is_swap_pmd() || pmd_trans_huge() which is itself confusing - it implies that leaf entries at PMD level exist and are different from huge entries. Address this confusion by introducing pmd_is_huge() which checks for either case. 
Sadly due to header dependency issues (huge_mm.h is included very early on in headers and cannot really rely on much else) we cannot use pmd_is_valid_softleaf() here. However since these are the only valid, handled cases the function is still achieving what it intends to do. We then replace all instances of is_swap_pmd() || pmd_trans_huge() with pmd_is_huge() invocations and adjust logic accordingly to accommodate this. No functional change intended. Link: https://lkml.kernel.org/r/00f79db3b15293cac8f7040a48d69c52d00117e4.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 39 +++++++++++++++++++++++++++++++++++---- include/linux/swapops.h | 6 ++++++ 2 files changed, 41 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 19d4a5f52ca2..5ab240d61dcc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -419,10 +419,36 @@ void deferred_split_folio(struct folio *folio, 
bool partially_mapped); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); +/** + * pmd_is_huge() - Is this PMD either a huge PMD entry or a software leaf entry? + * @pmd: The PMD to check. + * + * A huge PMD entry is a non-empty entry which is present and marked huge or a + * software leaf entry. This check may be performed without the appropriate locks + * held, in which case the condition should be rechecked after they are + * acquired. + * + * Returns: true if this PMD is huge, false otherwise. + */ +static inline bool pmd_is_huge(pmd_t pmd) +{ + if (pmd_present(pmd)) { + return pmd_trans_huge(pmd); + } else if (!pmd_none(pmd)) { + /* + * Non-present PMDs must be valid huge non-present entries. We + * cannot assert that here due to header dependency issues. + */ + return true; + } + + return false; +} + #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ - if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)) \ + if (pmd_is_huge(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false); \ } while (0) @@ -469,10 +495,10 @@ static inline int is_swap_pmd(pmd_t pmd) static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_is_huge(*pmd)) return __pmd_trans_huge_lock(pmd, vma); - else - return NULL; + + return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) @@ -743,6 +769,11 @@ static inline struct folio *get_persistent_huge_zero_folio(void) { return NULL; } + +static inline bool pmd_is_huge(pmd_t pmd) +{ + return false; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/include/linux/swapops.h b/include/linux/swapops.h index f1277647262d..41cfc6d59054 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -471,6 +471,12 @@ static inline pmd_t 
swp_entry_to_pmd(swp_entry_t entry) } #else /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ +static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, + struct page *page) +{ + BUILD_BUG(); +} + static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) { -- cgit v1.2.3 From c0a80c2ce68d3a04daa52497fbf524ffb3a376e0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:30 +0000 Subject: mm: remove remaining is_swap_pmd() users and is_swap_pmd() Update copy_huge_pmd() and change_huge_pmd() to use pmd_is_valid_softleaf() - as this checks for the only valid non-present huge PMD states. Also update mm/debug_vm_pgtable.c to explicitly test for a valid leaf PMD entry (which it was not before, which was incorrect), and have it test against pmd_is_huge() and pmd_is_valid_softleaf() rather than is_swap_pmd(). With these changes done there are no further users of is_swap_pmd(), so remove it. Link: https://lkml.kernel.org/r/1628b00b00c8498bbd2c20b82117ee87845fb738.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: 
Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 5ab240d61dcc..525624c285a6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -486,11 +486,6 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); -static inline int is_swap_pmd(pmd_t pmd) -{ - return !pmd_none(pmd) && !pmd_present(pmd); -} - /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) @@ -692,10 +687,6 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, struct vm_area_struct *next) { } -static inline int is_swap_pmd(pmd_t pmd) -{ - return 0; -} static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { -- cgit v1.2.3 From 9ff30bb9ab40b34908eefd661f12f99aa00d04c3 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:31 +0000 Subject: mm: remove non_swap_entry() and use softleaf helpers instead There is simply no need for the hugely confusing concept of 'non-swap' swap entries now we have the concept of softleaf entries and relevant softleaf_xxx() helpers. Adjust all callers to use these instead and remove non_swap_entry() altogether. No functional change intended. 
Link: https://lkml.kernel.org/r/2562093f37f4a9cffea0447058014485eb50aaaf.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swapops.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 41cfc6d59054..c8e6f927da48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -492,10 +492,5 @@ static inline pmd_t swp_entry_to_pmd(swp_entry_t entry) #endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ -static inline int non_swap_entry(swp_entry_t entry) -{ - return swp_type(entry) >= MAX_SWAPFILES; -} - #endif /* CONFIG_MMU */ #endif /* _LINUX_SWAPOPS_H */ -- cgit v1.2.3 From 03bfbc3ad6e496fb576ca9ace08211943232fdf9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:32 +0000 Subject: mm: remove is_hugetlb_entry_[migration, hwpoisoned]() We do not need to have explicit helper functions for these, it adds a level of confusion and indirection when we can simply 
use software leaf entry logic here instead and spell out the special huge_pte_none() case we must consider. No functional change intended. Link: https://lkml.kernel.org/r/0e92d6924d3de88cd014ce1c53e20edc08fc152e.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2387513d6ae5..457d48ac7bcd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -274,8 +274,6 @@ void hugetlb_vma_lock_release(struct kref *kref); long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags); -bool is_hugetlb_entry_migration(pte_t pte); -bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); void hugetlb_split(struct vm_area_struct *vma, unsigned 
long addr); -- cgit v1.2.3 From 93976a20345b4aff1ac7598ec1223d65ca33d49c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:33 +0000 Subject: mm: eliminate further swapops predicates Having converted so much of the code base to software leaf entries, we can mop up some remaining cases. We replace is_pfn_swap_entry(), pfn_swap_entry_to_page(), is_writable_device_private_entry(), is_device_exclusive_entry(), is_migration_entry(), is_writable_migration_entry(), is_readable_migration_entry(), swp_offset_pfn() and pfn_swap_entry_folio() with softleaf equivalents. No functional change intended. Link: https://lkml.kernel.org/r/956bc9c031604811c0070d2f4bf2f1373f230213.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 25 +++++++--- include/linux/swapops.h | 121 +----------------------------------------------- 2 files changed, 20 insertions(+), 126 deletions(-) (limited to 'include') diff --git a/include/linux/leafops.h 
b/include/linux/leafops.h index f5ea9b0385ff..d282fab866a1 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -355,7 +355,7 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); /* Temporary until swp_entry_t eliminated. */ - return swp_offset_pfn(entry); + return swp_offset(entry) & SWP_PFN_MASK; } /** @@ -366,10 +366,16 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) */ static inline struct page *softleaf_to_page(softleaf_t entry) { + struct page *page = pfn_to_page(softleaf_to_pfn(entry)); + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); - /* Temporary until swp_entry_t eliminated. */ - return pfn_swap_entry_to_page(entry); + return page; } /** @@ -380,10 +386,17 @@ static inline struct page *softleaf_to_page(softleaf_t entry) */ static inline struct folio *softleaf_to_folio(softleaf_t entry) { - VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); - /* Temporary until swp_entry_t eliminated. */ - return pfn_swap_entry_folio(entry); + VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding folio is locked. + */ + VM_WARN_ON_ONCE(softleaf_is_migration(entry) && + !folio_test_locked(folio)); + + return folio; } /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c8e6f927da48..3d02b288c15e 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -28,7 +28,7 @@ #define SWP_OFFSET_MASK ((1UL << SWP_TYPE_SHIFT) - 1) /* - * Definitions only for PFN swap entries (see is_pfn_swap_entry()). To + * Definitions only for PFN swap entries (see leafeant_has_pfn()). To * store PFN, we only need SWP_PFN_BITS bits. 
Each of the pfn swap entries * can use the extra bits to store other information besides PFN. */ @@ -66,8 +66,6 @@ #define SWP_MIG_YOUNG BIT(SWP_MIG_YOUNG_BIT) #define SWP_MIG_DIRTY BIT(SWP_MIG_DIRTY_BIT) -static inline bool is_pfn_swap_entry(swp_entry_t entry); - /* Clear all flags but only keep swp_entry_t related information */ static inline pte_t pte_swp_clear_flags(pte_t pte) { @@ -109,17 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -/* - * This should only be called upon a pfn swap entry to get the PFN stored - * in the swap entry. Please refers to is_pfn_swap_entry() for definition - * of pfn swap entry. - */ -static inline unsigned long swp_offset_pfn(swp_entry_t entry) -{ - VM_BUG_ON(!is_pfn_swap_entry(entry)); - return swp_offset(entry) & SWP_PFN_MASK; -} - /* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. @@ -169,27 +156,11 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(SWP_DEVICE_WRITE, offset); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - int type = swp_type(entry); - return type == SWP_DEVICE_READ || type == SWP_DEVICE_WRITE; -} - -static inline bool is_writable_device_private_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_DEVICE_WRITE); -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(SWP_DEVICE_EXCLUSIVE, offset); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return swp_type(entry) == SWP_DEVICE_EXCLUSIVE; -} - #else /* CONFIG_DEVICE_PRIVATE */ static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset) { @@ -201,50 +172,14 @@ static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline bool is_device_private_entry(swp_entry_t entry) -{ - return false; -} - -static inline bool 
is_writable_device_private_entry(swp_entry_t entry) -{ - return false; -} - static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset) { return swp_entry(0, 0); } -static inline bool is_device_exclusive_entry(swp_entry_t entry) -{ - return false; -} - #endif /* CONFIG_DEVICE_PRIVATE */ #ifdef CONFIG_MIGRATION -static inline int is_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ || - swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE || - swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); -} - -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ); -} - -static inline int is_readable_exclusive_migration_entry(swp_entry_t entry) -{ - return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE); -} static inline swp_entry_t make_readable_migration_entry(pgoff_t offset) { @@ -310,23 +245,10 @@ static inline swp_entry_t make_writable_migration_entry(pgoff_t offset) return swp_entry(0, 0); } -static inline int is_migration_entry(swp_entry_t swp) -{ - return 0; -} - static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { } -static inline int is_writable_migration_entry(swp_entry_t entry) -{ - return 0; -} -static inline int is_readable_migration_entry(swp_entry_t entry) -{ - return 0; -} static inline swp_entry_t make_migration_entry_young(swp_entry_t entry) { @@ -410,47 +332,6 @@ static inline swp_entry_t make_guard_swp_entry(void) return make_pte_marker_entry(PTE_MARKER_GUARD); } -static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) -{ - struct page *p = pfn_to_page(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * 
corresponding page is locked - */ - BUG_ON(is_migration_entry(entry) && !PageLocked(p)); - - return p; -} - -static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry) -{ - struct folio *folio = pfn_folio(swp_offset_pfn(entry)); - - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked - */ - BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio)); - - return folio; -} - -/* - * A pfn swap entry is a special type of swap entry that always has a pfn stored - * in the swap offset. They can either be used to represent unaddressable device - * memory, to restrict access to a page undergoing migration or to represent a - * pfn which has been hwpoisoned and unmapped. - */ -static inline bool is_pfn_swap_entry(swp_entry_t entry) -{ - /* Make sure the swp offset can always store the needed fields */ - BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS); - - return is_migration_entry(entry) || is_device_private_entry(entry) || - is_device_exclusive_entry(entry) || is_hwpoison_entry(entry); -} - struct page_vma_mapped_walk; #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION -- cgit v1.2.3 From a3a3e215c9c140c08760d4d96ba4e8bc485d0f14 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 10 Nov 2025 22:21:34 +0000 Subject: mm: replace remaining pte_to_swp_entry() with softleaf_from_pte() There are straggler invocations of pte_to_swp_entry() lying around, replace all of these with the software leaf entry equivalent - softleaf_from_pte(). With those removed, eliminate pte_to_swp_entry() altogether. No functional change intended. 
Link: https://lkml.kernel.org/r/d8ee5ccefe4c42d7c4fe1a2e46f285ac40421cd3.1762812360.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Alexander Gordeev Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Christian Borntraeger Cc: Christian Brauner Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: Dev Jain Cc: Gerald Schaefer Cc: Gregory Price Cc: Heiko Carstens Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Janosch Frank Cc: Jason Gunthorpe Cc: Joshua Hahn Cc: Kairui Song Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Mike Rapoport Cc: Muchun Song Cc: Naoya Horiguchi Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Pasha Tatashin Cc: Peter Xu Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/leafops.h | 7 ++++++- include/linux/swapops.h | 13 ------------- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/leafops.h b/include/linux/leafops.h index d282fab866a1..cfafe7a5e7b1 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -54,11 +54,16 @@ static inline softleaf_t softleaf_mk_none(void) */ static inline softleaf_t softleaf_from_pte(pte_t pte) { + softleaf_t arch_entry; + if (pte_present(pte) || pte_none(pte)) return softleaf_mk_none(); + pte = pte_swp_clear_flags(pte); + arch_entry = __pte_to_swp_entry(pte); + /* Temporary until swp_entry_t eliminated. 
*/ - return pte_to_swp_entry(pte); + return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); } /** diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 3d02b288c15e..8cfc966eae48 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -107,19 +107,6 @@ static inline pgoff_t swp_offset(swp_entry_t entry) return entry.val & SWP_OFFSET_MASK; } -/* - * Convert the arch-dependent pte representation of a swp_entry_t into an - * arch-independent swp_entry_t. - */ -static inline swp_entry_t pte_to_swp_entry(pte_t pte) -{ - swp_entry_t arch_entry; - - pte = pte_swp_clear_flags(pte); - arch_entry = __pte_to_swp_entry(pte); - return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry)); -} - /* * Convert the arch-independent representation of a swp_entry_t into the * arch-dependent pte representation. -- cgit v1.2.3 From ad7c7f4576a5977b4ec4ac5dd090ab3f81ca7c6f Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 10 Nov 2025 16:17:56 +0800 Subject: mm: thp: introduce folio_split_queue_lock and its variants In future memcg removal, the binding between a folio and a memcg may change, making the split lock within the memcg unstable when held. A new approach is required to reparent the split queue to its parent. This patch starts introducing a unified way to acquire the split lock for future work. It's a code-only refactoring with no functional changes. 
Link: https://lkml.kernel.org/r/a31a90bcac04dc754f775e87ae3205be3170b571.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Muchun Song Signed-off-by: Qi Zheng Acked-by: Johannes Weiner Reviewed-by: Zi Yan Acked-by: Shakeel Butt Acked-by: David Hildenbrand Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 966f7c1a0128..b0c6a4635c67 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1647,6 +1647,11 @@ int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return shrinker->id; +} #else #define mem_cgroup_sockets_enabled 0 @@ -1678,6 +1683,11 @@ static inline void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } + +static inline int shrinker_id(struct shrinker *shrinker) +{ + return -1; +} #endif #ifdef CONFIG_MEMCG -- cgit v1.2.3 From 46156dba32cb68537d36877a97d672227f3e8134 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 10 Nov 2025 16:17:58 +0800 Subject: mm: thp: reparent the split queue during memcg offline Similar to list_lru, the split queue is relatively independent and does not need to be reparented along with objcg and LRU folios (holding objcg lock and lru lock). So let's apply the similar mechanism as list_lru to reparent the split queue separately when memcg is offine. This is also a preparation for reparenting LRU folios. 
Link: https://lkml.kernel.org/r/8703f907c4d1f7e8a2ef2bfed3036a84fa53028b.1762762324.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Acked-by: Shakeel Butt Reviewed-by: Harry Yoo Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Hugh Dickins Cc: Johannes Weiner Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Nico Pache Cc: Roman Gushchin Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++++ include/linux/memcontrol.h | 11 +++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 525624c285a6..e2e91aa1a042 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -415,6 +415,9 @@ static inline int split_huge_page(struct page *page) return split_huge_page_to_list_to_order(page, NULL, 0); } void deferred_split_folio(struct folio *folio, bool partially_mapped); +#ifdef CONFIG_MEMCG +void reparent_deferred_split_queue(struct mem_cgroup *memcg); +#endif void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze); @@ -647,6 +650,7 @@ static inline int try_folio_split_to_order(struct folio *folio, } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} +static inline void reparent_deferred_split_queue(struct mem_cgroup *memcg) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b0c6a4635c67..cc6db20d7dca 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1775,6 +1775,12 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return memcg 
? css_is_dying(&memcg->css) : false; +} + #else static inline bool mem_cgroup_kmem_disabled(void) { @@ -1845,6 +1851,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) { } + +static inline bool memcg_is_dying(struct mem_cgroup *memcg) +{ + return false; +} #endif /* CONFIG_MEMCG */ #if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP) -- cgit v1.2.3 From 9e014077083753461938312d565e4ac7119570d1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 14 Nov 2025 03:00:28 +0000 Subject: mm/khugepaged: unify SCAN_PMD_NONE and SCAN_PMD_NULL into SCAN_NO_PTE_TABLE The current hugepage collapse scan results include two separate values, SCAN_PMD_NONE and SCAN_PMD_NULL, which are handled identically by the consuming code. To reduce confusion and improve long-term maintenance, this commit merges these two functionally equivalent states into a single, clearer identifier: SCAN_NO_PTE_TABLE Link: https://lkml.kernel.org/r/20251114030028.7035-4-richard.weiyang@gmail.com Suggested-by: "David Hildenbrand (Red Hat)" Signed-off-by: Wei Yang Reviewed-by: Dev Jain Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Baolin Wang Reviewed-by: Nico Pache Cc: Barry Song Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Ryan Roberts Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index dd94d14a2427..4cde53b45a85 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -10,8 +10,7 @@ #define SCAN_STATUS \ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ - EM( SCAN_PMD_NULL, "pmd_null") \ - EM( SCAN_PMD_NONE, "pmd_none") \ + EM( SCAN_NO_PTE_TABLE, "no_pte_table") \ EM( 
SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ -- cgit v1.2.3 From cab812d9c9642ec11b8961b7ea994f4bd0826159 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 14 Nov 2025 12:22:28 +1100 Subject: mm/huge_memory.c: introduce folio_split_unmapped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unmapped was added as a parameter to __folio_split() and related call sites to support splitting of folios already in the midst of a migration. This special case arose for device private folio migration since during migration there could be a disconnect between source and destination on the folio size. Introduce folio_split_unmapped() to handle this special case. Also refactor code and add __folio_freeze_and_split_unmapped() helper that is common to both __folio_split() and folio_split_unmapped(). This in turn removes the special casing introduced by the unmapped parameter in __folio_split(). [balbirs@nvidia.com: v2] Link: https://lkml.kernel.org/r/20251115084041.3914728-1-balbirs@nvidia.com [balbirs@nvidia.com: fix clang-20 build] Link: https://lkml.kernel.org/r/20251120134232.3588203-1-balbirs@nvidia.com [akpm@linux-foundation.org: add `inline' to shmem_uncharge() stub, per Balbir] Link: https://lkml.kernel.org/r/20251114012228.2634882-1-balbirs@nvidia.com Signed-off-by: Balbir Singh Suggested-by: Zi Yan Acked-by: Zi Yan Cc: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Byungchul Park Cc: Gregory Price Cc: Ying Huang Cc: Alistair Popple Cc: Oscar Salvador Cc: Lorenzo Stoakes Cc: Baolin Wang Cc: "Liam R. 
Howlett" Cc: Nico Pache Cc: Ryan Roberts Cc: Dev Jain Cc: Barry Song Cc: Lyude Paul Cc: Danilo Krummrich Cc: David Airlie Cc: Simona Vetter Cc: Ralph Campbell Cc: Mika Penttilä Cc: Matthew Brost Cc: Francois Dugast Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++-- include/linux/shmem_fs.h | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e2e91aa1a042..1d439de1ca2c 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -371,7 +371,8 @@ enum split_type { bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, - unsigned int new_order, bool unmapped); + unsigned int new_order); +int folio_split_unmapped(struct folio *folio, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); bool folio_split_supported(struct folio *folio, unsigned int new_order, @@ -382,7 +383,7 @@ int folio_split(struct folio *folio, unsigned int new_order, struct page *page, static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { - return __split_huge_page_to_list_to_order(page, list, new_order, false); + return __split_huge_page_to_list_to_order(page, list, new_order); } static inline int split_huge_page_to_order(struct page *page, unsigned int new_order) { diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 5b368f9549d6..d02270072a34 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -136,11 +136,16 @@ static inline bool shmem_hpage_pmd_enabled(void) #ifdef CONFIG_SHMEM extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); +extern void shmem_uncharge(struct inode *inode, long pages); #else static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma) 
{ return 0; } + +static inline void shmem_uncharge(struct inode *inode, long pages) +{ +} #endif extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); @@ -194,7 +199,6 @@ static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof) } extern bool shmem_charge(struct inode *inode, long pages); -extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_USERFAULTFD #ifdef CONFIG_SHMEM -- cgit v1.2.3 From 7e44d00a13ca5691caf4f7c46541ee60bf75b208 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:05 -0800 Subject: memcg: use mod_node_page_state to update stats Patch series "memcg: cleanup the memcg stats interfaces". The memcg stats are safe against irq (and nmi) context and thus does not require disabling irqs. However for some stats which are also maintained at node level, it is using irq unsafe interface and thus requiring the users to still disables irqs or use interfaces which explicitly disables irqs. Let's move memcg code to use irq safe node level stats function which is already optimized for architectures with HAVE_CMPXCHG_LOCAL (all major ones), so there will not be any performance penalty for its usage. This patch (of 4): The memcg stats are safe against irq (and nmi) context and thus does not require disabling irqs. However some code paths for memcg stats also update the node level stats and use irq unsafe interface and thus require the users to disable irqs. However node level stats, on architectures with HAVE_CMPXCHG_LOCAL (all major ones), has interface which does not require irq disabling. Let's move memcg stats code to start using that interface for node level stats. 
Link: https://lkml.kernel.org/r/20251110232008.1352063-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20251110232008.1352063-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- include/linux/vmstat.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cc6db20d7dca..1085d0460e66 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1408,7 +1408,7 @@ static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, { struct page *page = virt_to_head_page(p); - __mod_node_page_state(page_pgdat(page), idx, val); + mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c287998908bf..11a37aaa4dd9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -557,7 +557,7 @@ static inline void mod_lruvec_page_state(struct page *page, static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, @@ -569,7 +569,7 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, static inline void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { - __mod_node_page_state(folio_pgdat(folio), idx, val); + mod_node_page_state(folio_pgdat(folio), idx, val); } static inline void lruvec_stat_mod_folio(struct folio *folio, -- cgit v1.2.3 From 469241fe7657dbec9e2948287ab7412955d8b73a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:06 
-0800 Subject: memcg: remove __mod_lruvec_kmem_state __mod_lruvec_kmem_state() is already safe against irqs, so there is no need to have a separate interface (i.e. mod_lruvec_kmem_state) which wraps calls to it with irq disabling and reenabling. Let's rename __mod_lruvec_kmem_state() to mod_lruvec_kmem_state(). Link: https://lkml.kernel.org/r/20251110232008.1352063-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Reviewed-by: Qi Zheng Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1085d0460e66..d35390f9892a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -957,17 +957,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec, void mem_cgroup_flush_stats(struct mem_cgroup *memcg); void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg); -void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); - -static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_kmem_state(p, idx, val); - local_irq_restore(flags); -} +void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); @@ -1403,14 +1393,6 @@ static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) { } -static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, - int val) -{ - struct page *page = virt_to_head_page(p); - - mod_node_page_state(page_pgdat(page), idx, val); -} - static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1470,14 +1452,14 @@ struct slabobj_ext { 
#endif } __aligned(8); -static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) +static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_kmem_state(p, idx, 1); + mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) +static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_kmem_state(p, idx, -1); + mod_lruvec_kmem_state(p, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) -- cgit v1.2.3 From 5b3eb779a20cf30d74bb346d2a1e525bc9072685 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:07 -0800 Subject: memcg: remove __mod_lruvec_state __mod_lruvec_state() is already safe against irqs, so there is no need to have a separate interface (i.e. mod_lruvec_state) which wraps calls to it with irq disabling and reenabling. Let's rename __mod_lruvec_state() to mod_lruvec_state(). Link: https://lkml.kernel.org/r/20251110232008.1352063-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- include/linux/vmstat.h | 18 +----------------- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ca7a18351797..b58f34c4fe92 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -44,7 +44,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); - __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); + mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } diff --git a/include/linux/vmstat.h 
b/include/linux/vmstat.h index 11a37aaa4dd9..4eb7753e6e5c 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -520,19 +520,9 @@ static inline const char *vm_event_name(enum vm_event_item item) #ifdef CONFIG_MEMCG -void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, +void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -static inline void mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_lruvec_state(lruvec, idx, val); - local_irq_restore(flags); -} - void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); @@ -554,12 +544,6 @@ static inline void mod_lruvec_page_state(struct page *page, #else -static inline void __mod_lruvec_state(struct lruvec *lruvec, - enum node_stat_item idx, int val) -{ - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); -} - static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { -- cgit v1.2.3 From c1bd09994c4d5b897571671bed16581335e93242 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 10 Nov 2025 15:20:08 -0800 Subject: memcg: remove __lruvec_stat_mod_folio __lruvec_stat_mod_folio() is already safe against irqs, so there is no need to have a separate interface (i.e. lruvec_stat_mod_folio) which wraps calls to it with irq disabling and reenabling. Let's rename __lruvec_stat_mod_folio() to lruvec_stat_mod_folio(). 
Link: https://lkml.kernel.org/r/20251110232008.1352063-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Harry Yoo Acked-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/vmstat.h | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'include') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 4eb7753e6e5c..3398a345bda8 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -523,19 +523,9 @@ static inline const char *vm_event_name(enum vm_event_item item) void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __lruvec_stat_mod_folio(struct folio *folio, +void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val); -static inline void lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __lruvec_stat_mod_folio(folio, idx, val); - local_irq_restore(flags); -} - static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { @@ -550,12 +540,6 @@ static inline void mod_lruvec_state(struct lruvec *lruvec, mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } -static inline void __lruvec_stat_mod_folio(struct folio *folio, - enum node_stat_item idx, int val) -{ - mod_node_page_state(folio_pgdat(folio), idx, val); -} - static inline void lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, int val) { @@ -570,18 +554,6 @@ static inline void mod_lruvec_page_state(struct page *page, #endif /* CONFIG_MEMCG */ -static inline void __lruvec_stat_add_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); -} - -static inline void __lruvec_stat_sub_folio(struct folio *folio, - enum node_stat_item idx) -{ - __lruvec_stat_mod_folio(folio, idx, 
-folio_nr_pages(folio)); -} - static inline void lruvec_stat_add_folio(struct folio *folio, enum node_stat_item idx) { -- cgit v1.2.3 From 277a1ae3879a82a15a2e2d6741e38e31ea6487ee Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:01 +0800 Subject: mm: softdirty: add pgtable_supports_soft_dirty() Patch series "mm: Add soft-dirty and uffd-wp support for RISC-V", v15. This patchset adds support for the Svrsw60t59b [1] extension, which is now ratified, and also adds soft-dirty and userfaultfd write-protect tracking for RISC-V. Patches 1 and 2 add macros that allow architectures to define their own checks for whether the soft-dirty / uffd_wp PTE bits are available; in other words, for RISC-V, whether the Svrsw60t59b extension is supported on the device the kernel is running on. Patches 1-2 also remove "ifdef CONFIG_MEM_SOFT_DIRTY", "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of these checks; if they are not overridden by the architecture, no change in behavior is expected. This patchset has been tested with the kselftest mm suite, in which soft-dirty, madv_populate, test_unmerge_uffd_wp, and uffd-unit-tests run and pass, and no regressions are observed in any of the other tests. This patch (of 6): Some platforms can customize the PTE/PMD entry soft-dirty bit, making it unavailable even if the architecture provides the resource. Add an API for which architectures can define their own implementation to detect whether the soft-dirty bit is available on the device the kernel is running on. This patch removes "ifdef CONFIG_MEM_SOFT_DIRTY" in favor of pgtable_supports_soft_dirty() checks that default to IS_ENABLED(CONFIG_MEM_SOFT_DIRTY); if not overridden by the architecture, no change in behavior is expected. We make sure to never set VM_SOFTDIRTY if !pgtable_supports_soft_dirty(), so we will never run into VM_SOFTDIRTY checks.
[lorenzo.stoakes@oracle.com: fix VMA selftests] Link: https://lkml.kernel.org/r/dac6ddfe-773a-43d5-8f69-021b9ca4d24b@lucifer.local Link: https://lkml.kernel.org/r/20251113072806.795029-1-zhangchunyan@iscas.ac.cn Link: https://lkml.kernel.org/r/20251113072806.795029-2-zhangchunyan@iscas.ac.cn Link: https://github.com/riscv-non-isa/riscv-iommu/pull/543 [1] Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Al Viro Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Cc: Alexandre Ghiti Cc: Andrew Jones Cc: Conor Dooley Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 +++ include/linux/pgtable.h | 12 ++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf660d5b6e97..75f894c3f521 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -859,6 +859,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); ACCESS_PRIVATE(vma, __vm_flags) = flags; } @@ -870,6 +871,7 @@ static inline void vm_flags_init(struct vm_area_struct *vma, static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_assert_write_locked(vma); vm_flags_init(vma, flags); } @@ -891,6 +893,7 @@ static inline void vm_flags_set(struct vm_area_struct *vma, static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { + VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); ACCESS_PRIVATE(vma, 
__vm_flags) &= ~flags; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 32e8457ad535..b13b6f42be3c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1553,6 +1553,18 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define arch_start_context_switch(prev) do {} while (0) #endif +/* + * Some platforms can customize the PTE soft-dirty bit making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the CONFIG_MEM_SOFT_DIRTY + * is part of this macro. + */ +#ifndef pgtable_supports_soft_dirty +#define pgtable_supports_soft_dirty() IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) +#endif + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) -- cgit v1.2.3 From f59c0924d61aa2a2bb85936a593140f327112787 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Thu, 13 Nov 2025 15:28:02 +0800 Subject: mm: userfaultfd: add pgtable_supports_uffd_wp() Some platforms can customize the PTE/PMD entry uffd-wp bit, making it unavailable even if the architecture provides the resource. This patch adds a macro API, pgtable_supports_uffd_wp(), that allows architectures to define their own implementation to check whether the uffd-wp bit is available on the device the kernel is running on. This patch also removes "ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP" and "ifdef CONFIG_PTE_MARKER_UFFD_WP" in favor of pgtable_supports_uffd_wp() and uffd_supports_wp_marker() checks respectively, which default to IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) and "IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP)"; if not overridden by the architecture, no change in behavior is expected.
Link: https://lkml.kernel.org/r/20251113072806.795029-3-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Acked-by: David Hildenbrand Cc: Albert Ou Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Al Viro Cc: Andrew Jones Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Christian Brauner Cc: Conor Dooley Cc: Conor Dooley Cc: Deepak Gupta Cc: Jan Kara Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Rob Herring Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/asm-generic/pgtable_uffd.h | 17 ++++++++++ include/linux/mm_inline.h | 8 +++-- include/linux/userfaultfd_k.h | 69 ++++++++++++++++++++++---------------- 3 files changed, 63 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h index 828966d4c281..0d85791efdf7 100644 --- a/include/asm-generic/pgtable_uffd.h +++ b/include/asm-generic/pgtable_uffd.h @@ -1,6 +1,23 @@ #ifndef _ASM_GENERIC_PGTABLE_UFFD_H #define _ASM_GENERIC_PGTABLE_UFFD_H +/* + * Some platforms can customize the uffd-wp bit, making it unavailable + * even if the architecture provides the resource. + * Adding this API allows architectures to add their own checks for the + * devices on which the kernel is running. + * Note: When overriding it, please make sure the + * CONFIG_HAVE_ARCH_USERFAULTFD_WP is part of this macro. 
+ */ +#ifndef pgtable_supports_uffd_wp +#define pgtable_supports_uffd_wp() IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_WP) +#endif + +static inline bool uffd_supports_wp_marker(void) +{ + return pgtable_supports_uffd_wp() && IS_ENABLED(CONFIG_PTE_MARKER_UFFD_WP); +} + #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP static __always_inline int pte_uffd_wp(pte_t pte) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index b58f34c4fe92..fa2d6ba811b5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -553,7 +553,6 @@ static inline pte_marker copy_pte_marker( return dstm; } -#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to @@ -571,9 +570,11 @@ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; + if (!uffd_supports_wp_marker()) + return false; + /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); @@ -602,7 +603,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } -#endif + return false; } @@ -616,6 +617,7 @@ static inline bool vma_has_recency(const struct vm_area_struct *vma) return true; } +#endif /** * num_pages_contiguous() - determine the number of contiguous pages diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 96b089dff4ef..fd5f42765497 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -228,15 +228,14 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if (wp_async && (vm_flags == VM_UFFD_WP)) return true; -#ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for * uffd-wp, then shmem & hugetlbfs are not supported but only * anonymous. 
*/ - if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) + if (!uffd_supports_wp_marker() && (vm_flags & VM_UFFD_WP) && + !vma_is_anonymous(vma)) return false; -#endif /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || @@ -291,6 +290,43 @@ void userfaultfd_release_new(struct userfaultfd_ctx *ctx); void userfaultfd_release_all(struct mm_struct *mm, struct userfaultfd_ctx *ctx); +static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) +{ + /* Only wr-protect mode uses pte markers */ + if (!userfaultfd_wp(vma)) + return false; + + /* File-based uffd-wp always need markers */ + if (!vma_is_anonymous(vma)) + return true; + + /* + * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED + * enabled (to apply markers on zero pages). + */ + return userfaultfd_wp_unpopulated(vma); +} + +/* + * Returns true if this is a swap pte and was uffd-wp wr-protected in either + * forms (pte marker or a normal swap pte), false otherwise. + */ +static inline bool pte_swp_uffd_wp_any(pte_t pte) +{ + if (!uffd_supports_wp_marker()) + return false; + + if (pte_present(pte)) + return false; + + if (pte_swp_uffd_wp(pte)) + return true; + + if (pte_is_uffd_wp_marker(pte)) + return true; + + return false; +} #else /* CONFIG_USERFAULTFD */ /* mm helpers */ @@ -415,23 +451,9 @@ static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma) return false; } -#endif /* CONFIG_USERFAULTFD */ - static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) { - /* Only wr-protect mode uses pte markers */ - if (!userfaultfd_wp(vma)) - return false; - - /* File-based uffd-wp always need markers */ - if (!vma_is_anonymous(vma)) - return true; - - /* - * Anonymous uffd-wp only needs the markers if WP_UNPOPULATED - * enabled (to apply markers on zero pages). 
- */ - return userfaultfd_wp_unpopulated(vma); + return false; } /* @@ -440,16 +462,7 @@ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) */ static inline bool pte_swp_uffd_wp_any(pte_t pte) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP - if (pte_present(pte)) - return false; - if (pte_swp_uffd_wp(pte)) - return true; - - if (pte_is_uffd_wp_marker(pte)) - return true; -#endif return false; } - +#endif /* CONFIG_USERFAULTFD */ #endif /* _LINUX_USERFAULTFD_K_H */ -- cgit v1.2.3 From 31807483d3952059d395c2a73b1fa9625db9b366 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Wed, 19 Nov 2025 17:59:43 +0800 Subject: mm/memory-failure: remove the selection of RAS commit 97f0b13452198290799f ("tracing: add trace event for memory-failure") introduces the selection of RAS in memory-failure. This commit is just a tracing feature; in reality, there is no dependency between memory-failure and RAS. RAS increases the size of the bzImage image by 8k, which is very valuable for embedded devices. Move the memory-failure tracing code from ras_event.h to memory-failure.h and remove the selection of RAS.
Link: https://lkml.kernel.org/r/20251119095943.67125-1-xieyuanbin1@huawei.com Signed-off-by: Xie Yuanbin Acked-by: David Hildenbrand (Red Hat) Acked-by: Miaohe Lin Cc: Borislav Petkov Signed-off-by: Andrew Morton --- include/ras/ras_event.h | 87 ------------------------------- include/trace/events/memory-failure.h | 98 +++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 87 deletions(-) create mode 100644 include/trace/events/memory-failure.h (limited to 'include') diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index fecfeb7c8be7..1e5e87020eef 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -12,7 +12,6 @@ #include #include #include -#include /* * MCE Extended Error Log trace event @@ -339,92 +338,6 @@ TRACE_EVENT(aer_event, "Not available") ); #endif /* CONFIG_PCIEAER */ - -/* - * memory-failure recovery action result event - * - * unsigned long pfn - Page Frame Number of the corrupted page - * int type - Page types of the corrupted page - * int result - Result of recovery action - */ - -#ifdef CONFIG_MEMORY_FAILURE -#define MF_ACTION_RESULT \ - EM ( MF_IGNORED, "Ignored" ) \ - EM ( MF_FAILED, "Failed" ) \ - EM ( MF_DELAYED, "Delayed" ) \ - EMe ( MF_RECOVERED, "Recovered" ) - -#define MF_PAGE_TYPE \ - EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ - EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ - EM ( MF_MSG_HUGE, "huge page" ) \ - EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ - EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \ - EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ - EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ - EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ - EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ - EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ - EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ - EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ - EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) 
\ - EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ - EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ - EM ( MF_MSG_BUDDY, "free buddy page" ) \ - EM ( MF_MSG_DAX, "dax page" ) \ - EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ - EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ - EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ - EMe ( MF_MSG_UNKNOWN, "unknown page" ) - -/* - * First define the enums in MM_ACTION_RESULT to be exported to userspace - * via TRACE_DEFINE_ENUM(). - */ -#undef EM -#undef EMe -#define EM(a, b) TRACE_DEFINE_ENUM(a); -#define EMe(a, b) TRACE_DEFINE_ENUM(a); - -MF_ACTION_RESULT -MF_PAGE_TYPE - -/* - * Now redefine the EM() and EMe() macros to map the enums to the strings - * that will be printed in the output. - */ -#undef EM -#undef EMe -#define EM(a, b) { a, b }, -#define EMe(a, b) { a, b } - -TRACE_EVENT(memory_failure_event, - TP_PROTO(unsigned long pfn, - int type, - int result), - - TP_ARGS(pfn, type, result), - - TP_STRUCT__entry( - __field(unsigned long, pfn) - __field(int, type) - __field(int, result) - ), - - TP_fast_assign( - __entry->pfn = pfn; - __entry->type = type; - __entry->result = result; - ), - - TP_printk("pfn %#lx: recovery action for %s: %s", - __entry->pfn, - __print_symbolic(__entry->type, MF_PAGE_TYPE), - __print_symbolic(__entry->result, MF_ACTION_RESULT) - ) -); -#endif /* CONFIG_MEMORY_FAILURE */ #endif /* _TRACE_HW_EVENT_MC_H */ /* This part must be outside protection */ diff --git a/include/trace/events/memory-failure.h b/include/trace/events/memory-failure.h new file mode 100644 index 000000000000..aa57cc8f896b --- /dev/null +++ b/include/trace/events/memory-failure.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM memory_failure +#define TRACE_INCLUDE_FILE memory-failure + +#if !defined(_TRACE_MEMORY_FAILURE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MEMORY_FAILURE_H + +#include +#include + +/* + * memory-failure recovery action result 
event + * + * unsigned long pfn - Page Frame Number of the corrupted page + * int type - Page types of the corrupted page + * int result - Result of recovery action + */ + +#define MF_ACTION_RESULT \ + EM ( MF_IGNORED, "Ignored" ) \ + EM ( MF_FAILED, "Failed" ) \ + EM ( MF_DELAYED, "Delayed" ) \ + EMe ( MF_RECOVERED, "Recovered" ) + +#define MF_PAGE_TYPE \ + EM ( MF_MSG_KERNEL, "reserved kernel page" ) \ + EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ + EM ( MF_MSG_HUGE, "huge page" ) \ + EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ + EM ( MF_MSG_GET_HWPOISON, "get hwpoison page" ) \ + EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ + EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ + EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ + EM ( MF_MSG_DIRTY_MLOCKED_LRU, "dirty mlocked LRU page" ) \ + EM ( MF_MSG_CLEAN_MLOCKED_LRU, "clean mlocked LRU page" ) \ + EM ( MF_MSG_DIRTY_UNEVICTABLE_LRU, "dirty unevictable LRU page" ) \ + EM ( MF_MSG_CLEAN_UNEVICTABLE_LRU, "clean unevictable LRU page" ) \ + EM ( MF_MSG_DIRTY_LRU, "dirty LRU page" ) \ + EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ + EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ + EM ( MF_MSG_BUDDY, "free buddy page" ) \ + EM ( MF_MSG_DAX, "dax page" ) \ + EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ + EM ( MF_MSG_ALREADY_POISONED, "already poisoned" ) \ + EM ( MF_MSG_PFN_MAP, "non struct page pfn" ) \ + EMe ( MF_MSG_UNKNOWN, "unknown page" ) + +/* + * First define the enums in MM_ACTION_RESULT to be exported to userspace + * via TRACE_DEFINE_ENUM(). + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +MF_ACTION_RESULT +MF_PAGE_TYPE + +/* + * Now redefine the EM() and EMe() macros to map the enums to the strings + * that will be printed in the output. 
+ */ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +TRACE_EVENT(memory_failure_event, + TP_PROTO(unsigned long pfn, + int type, + int result), + + TP_ARGS(pfn, type, result), + + TP_STRUCT__entry( + __field(unsigned long, pfn) + __field(int, type) + __field(int, result) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->type = type; + __entry->result = result; + ), + + TP_printk("pfn %#lx: recovery action for %s: %s", + __entry->pfn, + __print_symbolic(__entry->type, MF_PAGE_TYPE), + __print_symbolic(__entry->result, MF_ACTION_RESULT) + ) +); +#endif /* _TRACE_MEMORY_FAILURE_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 348ced3da52b3161f5ceec8868e81973ce48e11d Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Fri, 21 Nov 2025 14:48:59 -0500 Subject: hugetlb: add __read_mostly to sysctl_hugetlb_shm_group sysctl bits are mostly-read values. Link: https://lkml.kernel.org/r/20251121194859.265259-2-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Andrew Morton Acked-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 457d48ac7bcd..019a1c5281e4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -171,7 +171,7 @@ bool hugetlbfs_pagecache_present(struct hstate *h, struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); -extern int sysctl_hugetlb_shm_group; +extern int sysctl_hugetlb_shm_group __read_mostly; extern struct list_head huge_boot_pages[MAX_NUMNODES]; void hugetlb_bootmem_alloc(void); -- cgit v1.2.3 From 48f014356698a3525959a9eb343dc67b5a5c6842 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 24 Nov 2025 17:37:40 +0200 Subject: PCI: Validate pci_rebar_size_supported() input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit According to Dan Carpenter, smatch detects an issue with the size parameter given to pci_rebar_size_supported(): drivers/pci/rebar.c:142 pci_rebar_size_supported() error: undefined (user controlled) shift '(((1))) << size' The problem is this call tree, which uses the 'size' from the user to shift in BIT() without validating it: __resource_resize_store # takes 'buf' from user sysfs write kstrtoul(buf, 0, &size) # converts to unsigned long pci_resize_resource # truncates to int pci_rebar_size_supported # BIT(size) without validation There could also be similar problems with pci_resize_resource() parameter values coming from drivers. Add 'size' validation to pci_rebar_size_supported(). There seems to be no SZ_128T prior to this, so add one to be able to specify the largest size supported by the kernel (the PCIe r7.0 spec already defines sizes even beyond 128TB but the kernel does not yet support them). The issue looks older than the introduction of pci_rebar_size_supported() by bb1fabd0d94e ("PCI: Add pci_rebar_size_supported() helper"). It would also be nice to convert 'size' to unsigned everywhere, maybe even u8, but that is left as further work.
Fixes: 8bb705e3e79d ("PCI: Add pci_resize_resource() for resizing BARs") Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/aSA1WiRG3RuhqZMY@stanley.mountain/ Signed-off-by: Ilpo Järvinen [bhelgaas: commit log, add report URL] Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20251124153740.2995-1-ilpo.jarvinen@linux.intel.com --- include/linux/sizes.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sizes.h b/include/linux/sizes.h index 49039494076f..f1f1a055b047 100644 --- a/include/linux/sizes.h +++ b/include/linux/sizes.h @@ -67,5 +67,6 @@ #define SZ_16T _AC(0x100000000000, ULL) #define SZ_32T _AC(0x200000000000, ULL) #define SZ_64T _AC(0x400000000000, ULL) +#define SZ_128T _AC(0x800000000000, ULL) #endif /* __LINUX_SIZES_H__ */ -- cgit v1.2.3 From 4fe5a00ec70717a7f1002d8913ec6143582b3c8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 15:41:00 +0000 Subject: net: sched: fix TCF_LAYER_TRANSPORT handling in tcf_get_base_ptr() syzbot reported that tcf_get_base_ptr() can be called while transport header is not set [1]. Instead of returning a dangling pointer, return NULL. Fix tcf_get_base_ptr() callers to handle this NULL value. 
[1] WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 skb_transport_header include/linux/skbuff.h:3071 [inline] WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 tcf_get_base_ptr include/net/pkt_cls.h:539 [inline] WARNING: CPU: 1 PID: 6019 at ./include/linux/skbuff.h:3071 em_nbyte_match+0x2d8/0x3f0 net/sched/em_nbyte.c:43 Modules linked in: CPU: 1 UID: 0 PID: 6019 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Call Trace: tcf_em_match net/sched/ematch.c:494 [inline] __tcf_em_tree_match+0x1ac/0x770 net/sched/ematch.c:520 tcf_em_tree_match include/net/pkt_cls.h:512 [inline] basic_classify+0x115/0x2d0 net/sched/cls_basic.c:50 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1764 [inline] tcf_classify+0x4cf/0x1140 net/sched/cls_api.c:1860 multiq_classify net/sched/sch_multiq.c:39 [inline] multiq_enqueue+0xfd/0x4c0 net/sched/sch_multiq.c:66 dev_qdisc_enqueue+0x4e/0x260 net/core/dev.c:4118 __dev_xmit_skb net/core/dev.c:4214 [inline] __dev_queue_xmit+0xe83/0x3b50 net/core/dev.c:4729 packet_snd net/packet/af_packet.c:3076 [inline] packet_sendmsg+0x3e33/0x5080 net/packet/af_packet.c:3108 sock_sendmsg_nosec net/socket.c:727 [inline] __sock_sendmsg+0x21c/0x270 net/socket.c:742 ____sys_sendmsg+0x505/0x830 net/socket.c:2630 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+f3a497f02c389d86ef16@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6920855a.a70a0220.2ea503.0058.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20251121154100.1616228-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/pkt_cls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index c64fd896b1f9..99ac747b7906 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -536,6 +536,8 @@ static inline unsigned char * tcf_get_base_ptr(struct sk_buff *skb, int layer) case 
TCF_LAYER_NETWORK: return skb_network_header(skb); case TCF_LAYER_TRANSPORT: + if (!skb_transport_header_was_set(skb)) + break; return skb_transport_header(skb); } -- cgit v1.2.3 From 075b19c211dfeea5f27075293ddf8795b78c9bd9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 21 Nov 2025 18:02:00 +0100 Subject: net: factor-out _sk_charge() helper Move out of __inet_accept() the code dealing with charging a newly accepted socket to memcg. MPTCP will soon use it on a per-subflow basis, in different contexts. No functional changes intended. Signed-off-by: Paolo Abeni Acked-by: Geliang Tang Acked-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251121-net-next-mptcp-memcg-backlog-imp-v1-1-1f34b6c1e0b1@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index a5f36ea9d46f..38d48cfe0741 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1631,6 +1631,8 @@ static inline void sk_mem_uncharge(struct sock *sk, int size) sk_mem_reclaim(sk); } +void __sk_charge(struct sock *sk, gfp_t gfp); + #if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES) static inline void sk_owner_set(struct sock *sk, struct module *owner) { -- cgit v1.2.3 From 73029e73ccd07b64905f441d4f474a9bb91e7027 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 24 Nov 2025 18:27:30 -0800 Subject: x86/cc: Fix enum spelling to fix kernel-doc warnings Make the enum name in kernel-doc match the code to prevent kernel-doc warnings: Warning: include/linux/cc_platform.h:106 Enum value 'CC_ATTR_GUEST_SEV_SNP' not described in enum 'cc_attr' Warning: include/linux/cc_platform.h:106 Excess enum value '%CC_ATTR_SEV_SNP' description in 'cc_attr' Fixes: f742b90e61bb ("x86/mm: Extend cc_attr to include AMD SEV-SNP") Signed-off-by: Randy Dunlap Signed-off-by: Borislav Petkov (AMD) Link:
https://patch.msgid.link/20251125022730.3163679-1-rdunlap@infradead.org --- include/linux/cc_platform.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index 7fcec025c5e0..559353ad64ac 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -74,7 +74,7 @@ enum cc_attr { CC_ATTR_GUEST_UNROLL_STRING_IO, /** - * @CC_ATTR_SEV_SNP: Guest SNP is active. + * @CC_ATTR_GUEST_SEV_SNP: Guest SNP is active. * * The platform/OS is running as a guest/virtual machine and actively * using AMD SEV-SNP features. -- cgit v1.2.3 From 37d369fa97cc0774ea4eab726d16bcb5fbe3a104 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 23 Nov 2025 22:05:15 +0000 Subject: fs: Add uoff_t In a recent commit, I inadvertently changed a comparison from being an unsigned comparison (on 64-bit systems) to being a signed comparison (which it had always been on 32-bit systems). This led to a sporadic fstests failure. To make sure this comparison is always unsigned, introduce a new type, uoff_t which is the unsigned version of loff_t. Generally file sizes are restricted to being a signed integer, but in these two places it is convenient to pass -1 to indicate "up to the end of the file". 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://patch.msgid.link/20251123220518.1447261-1-willy@infradead.org Signed-off-by: Christian Brauner --- include/linux/mm.h | 8 ++++---- include/linux/shmem_fs.h | 2 +- include/linux/types.h | 1 + include/uapi/asm-generic/posix_types.h | 1 + 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d16b33bacc32..2a36d1bcf491 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3495,10 +3495,10 @@ struct vm_unmapped_area_info { extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ -extern void truncate_inode_pages(struct address_space *, loff_t); -extern void truncate_inode_pages_range(struct address_space *, - loff_t lstart, loff_t lend); -extern void truncate_inode_pages_final(struct address_space *); +void truncate_inode_pages(struct address_space *mapping, loff_t lstart); +void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, + uoff_t lend); +void truncate_inode_pages_final(struct address_space *mapping); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..774efe592a9a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -111,7 +111,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int shmem_writeout(struct folio *folio, struct swap_iocb **plug, struct list_head *folio_list); -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); +void shmem_truncate_range(struct inode *inode, loff_t start, uoff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/types.h b/include/linux/types.h index 6dfdb8e8e4c3..d4437e9c452c 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ 
-50,6 +50,7 @@ typedef __kernel_old_gid_t old_gid_t; #if defined(__GNUC__) typedef __kernel_loff_t loff_t; +typedef __kernel_uoff_t uoff_t; #endif /* diff --git a/include/uapi/asm-generic/posix_types.h b/include/uapi/asm-generic/posix_types.h index b5f7594eee7a..0a90ad92dbf3 100644 --- a/include/uapi/asm-generic/posix_types.h +++ b/include/uapi/asm-generic/posix_types.h @@ -86,6 +86,7 @@ typedef struct { */ typedef __kernel_long_t __kernel_off_t; typedef long long __kernel_loff_t; +typedef unsigned long long __kernel_uoff_t; typedef __kernel_long_t __kernel_old_time_t; #ifndef __KERNEL__ typedef __kernel_long_t __kernel_time_t; -- cgit v1.2.3 From 54ca9e913e22e364292a484783efc4fcdb6fdc51 Mon Sep 17 00:00:00 2001 From: Askar Safin Date: Thu, 20 Nov 2025 19:51:40 +0000 Subject: include/linux/fs.h: trivial fix: regualr -> regular Trivial fix. Signed-off-by: Askar Safin Link: https://patch.msgid.link/20251120195140.571608-1-safinaskar@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index ff69734b9fde..e02700b4e36b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3102,7 +3102,7 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * - * This is a variant of sb_start_write() which is a noop on non-regualr file. + * This is a variant of sb_start_write() which is a noop on non-regular file. * Should be matched with a call to file_end_write(). 
*/ static inline void file_start_write(struct file *file) -- cgit v1.2.3 From f9f85149994dbb9db43202ae8fabf68940c0ac0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Nov 2025 18:06:26 +0100 Subject: fs, iomap: remove IOCB_DIO_CALLER_COMP This was added by commit 099ada2c8726 ("io_uring/rw: add write support for IOCB_DIO_CALLER_COMP") and disabled a little later by commit 838b35bb6a89 ("io_uring/rw: disable IOCB_DIO_CALLER_COMP") because it didn't work. Remove all the related code that sat unused for 2 years. Signed-off-by: Christoph Hellwig Link: https://patch.msgid.link/20251113170633.1453259-2-hch@lst.de Reviewed-by: Jan Kara Reviewed-by: Chaitanya Kulkarni Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 43 +++++++++---------------------------------- 1 file changed, 9 insertions(+), 34 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index c895146c1444..e210d2d8af53 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,23 +367,9 @@ struct readahead_control; #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) -/* - * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the - * iocb completion can be passed back to the owner for execution from a safe - * context rather than needing to be punted through a workqueue. If this - * flag is set, the bio completion handling may set iocb->dio_complete to a - * handler function and iocb->private to context information for that handler. - * The issuer should call the handler with that context information from task - * context to complete the processing of the iocb. Note that while this - * provides a task context for the dio_complete() callback, it should only be - * used on the completion side for non-IO generating completions. 
It's fine to - * call blocking functions from this callback, but they should not wait for - * unrelated IO (like cache flushing, new IO generation, etc). - */ -#define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ -#define IOCB_AIO_RW (1 << 23) -#define IOCB_HAS_METADATA (1 << 24) +#define IOCB_AIO_RW (1 << 22) +#define IOCB_HAS_METADATA (1 << 23) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ @@ -400,7 +386,6 @@ struct readahead_control; { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ - { IOCB_DIO_CALLER_COMP, "CALLER_COMP" }, \ { IOCB_AIO_RW, "AIO_RW" }, \ { IOCB_HAS_METADATA, "AIO_HAS_METADATA" } @@ -412,23 +397,13 @@ struct kiocb { int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ u8 ki_write_stream; - union { - /* - * Only used for async buffered reads, where it denotes the - * page waitqueue associated with completing the read. Valid - * IFF IOCB_WAITQ is set. - */ - struct wait_page_queue *ki_waitq; - /* - * Can be used for O_DIRECT IO, where the completion handling - * is punted back to the issuer of the IO. May only be set - * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer - * must then check for presence of this handler when ki_complete - * is invoked. The data passed in to this handler must be - * assigned to ->private when dio_complete is assigned. - */ - ssize_t (*dio_complete)(void *data); - }; + + /* + * Only used for async buffered reads, where it denotes the page + * waitqueue associated with completing the read. + * Valid IFF IOCB_WAITQ is set. + */ + struct wait_page_queue *ki_waitq; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) -- cgit v1.2.3 From 24d4da5c2565313c2ad3c43449937a9351a64407 Mon Sep 17 00:00:00 2001 From: Ria Thomas Date: Mon, 24 Nov 2025 18:26:37 +0530 Subject: wifi: ieee80211: correct FILS status codes The FILS status codes are set to 108/109, but the IEEE 802.11-2020 spec defines them as 112/113. 
Update the enum so it matches the specification and keeps the kernel consistent with standard values. Fixes: a3caf7440ded ("cfg80211: Add support for FILS shared key authentication offload") Signed-off-by: Ria Thomas Reviewed-by: Jeff Johnson Link: https://patch.msgid.link/20251124125637.3936154-1-ria.thomas@morsemicro.com Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index d55d8ea3a8be..96439de55f07 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1493,8 +1493,8 @@ enum ieee80211_statuscode { WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, /* 802.11ai */ - WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 108, - WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109, + WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 112, + WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 113, WLAN_STATUS_SAE_HASH_TO_ELEMENT = 126, WLAN_STATUS_SAE_PK = 127, WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING = 133, -- cgit v1.2.3 From cba1ba11c1bae87de9c2e13d342bfbd6a3c1cf63 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 25 Nov 2025 13:59:26 +1100 Subject: wifi: cfg80211: include s1g_primary_2mhz when comparing chandefs When comparing chandefs, ensure we include s1g_primary_2mhz. 
Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20251125025927.245280-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 3d3ed1932262..899f267b7cf9 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -974,7 +974,8 @@ cfg80211_chandef_identical(const struct cfg80211_chan_def *chandef1, chandef1->center_freq1 == chandef2->center_freq1 && chandef1->freq1_offset == chandef2->freq1_offset && chandef1->center_freq2 == chandef2->center_freq2 && - chandef1->punctured == chandef2->punctured); + chandef1->punctured == chandef2->punctured && + chandef1->s1g_primary_2mhz == chandef2->s1g_primary_2mhz); } /** -- cgit v1.2.3 From a27628f4363435beac84b55c749c41a005054d30 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Sat, 11 Oct 2025 00:17:36 +0200 Subject: fs: rework I_NEW handling to operate without fences In the inode hash code grab the state while ->i_lock is held. If found to be set, synchronize the sleep once more with the lock held. In the real world the flag is not set most of the time. Apart from being simpler to reason about, it comes with a minor speed up as now clearing the flag does not require the smp_mb() fence. While here rename wait_on_inode() to wait_on_new_inode() to line it up with __wait_on_freeing_inode(). Christian Brauner says: As per the discussion in [1] I folded in the diff sent in [2]. 
Link: https://lore.kernel.org/69238e4d.a70a0220.d98e3.006e.GAE@google.com [1] Link: https://lore.kernel.org/c2kpawomkbvtahjm7y5mposbhckb7wxthi3iqy5yr22ggpucrm@ufvxwy233qxo [2] Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251010221737.1403539-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 21c73df3ce75..a813abdcf218 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1030,15 +1030,7 @@ static inline void inode_fake_hash(struct inode *inode) hlist_add_fake(&inode->i_hash); } -static inline void wait_on_inode(struct inode *inode) -{ - wait_var_event(inode_state_wait_address(inode, __I_NEW), - !(inode_state_read_once(inode) & I_NEW)); - /* - * Pairs with routines clearing I_NEW. - */ - smp_rmb(); -} +void wait_on_new_inode(struct inode *inode); /* * inode->i_rwsem nesting subclasses for the lock validator: @@ -3417,7 +3409,7 @@ extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), - void *data); + void *data, bool *isnew); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); -- cgit v1.2.3 From 4c6b40877b4dc83f61a762a3a35a09dcf744b585 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 29 Oct 2025 14:14:28 +0100 Subject: fs: cosmetic fixes to lru handling 1. inode_bit_waitqueue() was somehow placed between __inode_add_lru() and inode_add_lru(). move it up 2. assert ->i_lock is held in __inode_add_lru instead of just claiming it is needed 3. s/__inode_add_lru/__inode_lru_list_add/ for consistency with itself (inode_lru_list_del()) and similar routines for sb and io list management 4. 
push list presence check into inode_lru_list_del(), just like sb and io list Signed-off-by: Mateusz Guzik Link: https://patch.msgid.link/20251029131428.654761-2-mjguzik@gmail.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index a813abdcf218..33129cda3a99 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3502,7 +3502,7 @@ static inline void remove_inode_hash(struct inode *inode) } extern void inode_sb_list_add(struct inode *inode); -extern void inode_add_lru(struct inode *inode); +extern void inode_lru_list_add(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); -- cgit v1.2.3 From 1c6a92a5a5de7ebf94526dee7068926e6d5b1b01 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 24 Nov 2025 18:28:34 -0800 Subject: wifi: nl80211: vendor-cmd: intel: fix a blank kernel-doc line warning Delete an empty line to prevent a kernel-doc warning: Warning: ../include/uapi/linux/nl80211-vnd-intel.h:86 bad line: Fixes: 3d2a2544eae9 ("nl80211: vendor-cmd: add Intel vendor commands for iwlmei usage") Signed-off-by: Randy Dunlap Link: https://patch.msgid.link/20251125022834.3171742-1-rdunlap@infradead.org Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211-vnd-intel.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nl80211-vnd-intel.h b/include/uapi/linux/nl80211-vnd-intel.h index 4ed7d0b24512..79ccc9401d50 100644 --- a/include/uapi/linux/nl80211-vnd-intel.h +++ b/include/uapi/linux/nl80211-vnd-intel.h @@ -84,7 +84,6 @@ enum iwl_vendor_auth_akm_mode { * * @NUM_IWL_MVM_VENDOR_ATTR: number of vendor attributes * @MAX_IWL_MVM_VENDOR_ATTR: highest vendor attribute number - */ enum iwl_mvm_vendor_attr { __IWL_MVM_VENDOR_ATTR_INVALID = 0x00, -- cgit v1.2.3 From 
194832dcb13b0d02fce0df887235b7e6d1ef0121 Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 18 Nov 2025 11:04:04 +0100 Subject: string: use __attribute__((nonnull())) in strends() The arguments of strends() must not be NULL so annotate the function with the nonnull attribute. Suggested-by: Kees Cook Link: https://lore.kernel.org/r/20251118-strends-follow-up-v1-2-d3f8ef750f59@linaro.org Signed-off-by: Bartosz Golaszewski --- include/linux/string.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/string.h b/include/linux/string.h index 69e9256592f8..0266dbdaa4cd 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -570,7 +570,8 @@ static inline bool strstarts(const char *str, const char *prefix) * Returns: * True if @str ends with @suffix. False in all other cases. */ -static inline bool strends(const char *str, const char *suffix) +static inline bool __attribute__((nonnull(1, 2))) +strends(const char *str, const char *suffix) { unsigned int str_len = strlen(str), suffix_len = strlen(suffix); -- cgit v1.2.3 From 155f8d4ef0b78afbf25b1449bbd654fd1327cc7a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 30 Oct 2025 11:01:37 +0000 Subject: ACPI: GTDT: Get rid of acpi_arch_timer_mem_init() Since 0f67b56d84b4c ("clocksource/drivers/arm_arch_timer_mmio: Switch over to standalone driver"), acpi_arch_timer_mem_init() is unused. Remove it. Signed-off-by: Marc Zyngier Cc: Hanjun Guo Cc: Sudeep Holla Cc: Rafael J. 
Wysocki Cc: Daniel Lezcano Cc: Thomas Gleixner Cc: Mark Rutland Acked-by: Hanjun Guo Signed-off-by: Catalin Marinas --- include/linux/acpi.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 5ff5d99f6ead..22b377c3a319 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -755,7 +755,6 @@ int acpi_reconfig_notifier_unregister(struct notifier_block *nb); int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); int acpi_gtdt_map_ppi(int type); bool acpi_gtdt_c3stop(int type); -int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); #endif #ifndef ACPI_HAVE_ARCH_SET_ROOT_POINTER -- cgit v1.2.3 From 2d45db63260c6ae3cf007361e04a1c41bd265084 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 10 Nov 2025 22:09:16 -0800 Subject: backlight: lp855x: Fix lp855x.h kernel-doc warnings Add a missing struct short description and a missing leading " *" to lp855x.h to avoid kernel-doc warnings: Warning: include/linux/platform_data/lp855x.h:126 missing initial short description on line: * struct lp855x_platform_data Warning: include/linux/platform_data/lp855x.h:131 bad line: Only valid when mode is PWM_BASED. 
Fixes: 7be865ab8634 ("backlight: new backlight driver for LP855x devices") Signed-off-by: Randy Dunlap Reviewed-by: Daniel Thompson (RISCstar) Link: https://patch.msgid.link/20251111060916.1995920-1-rdunlap@infradead.org Signed-off-by: Lee Jones --- include/linux/platform_data/lp855x.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/lp855x.h b/include/linux/platform_data/lp855x.h index ab222dd05bbc..3b4a891acefe 100644 --- a/include/linux/platform_data/lp855x.h +++ b/include/linux/platform_data/lp855x.h @@ -124,12 +124,12 @@ struct lp855x_rom_data { }; /** - * struct lp855x_platform_data + * struct lp855x_platform_data - lp855 platform-specific data * @name : Backlight driver name. If it is not defined, default name is set. * @device_control : value of DEVICE CONTROL register * @initial_brightness : initial value of backlight brightness * @period_ns : platform specific pwm period value. unit is nano. - Only valid when mode is PWM_BASED. + * Only valid when mode is PWM_BASED. * @size_program : total size of lp855x_rom_data * @rom_data : list of new eeprom/eprom registers */ -- cgit v1.2.3 From 34fa09c698d626b09f7824fe2c520a0a21a072b9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Nov 2025 14:50:25 +0100 Subject: Revert "ACPI: processor: Do not expose global variable acpi_idle_driver" Revert commit 559f2eacc8a2 ("ACPI: processor: Do not expose global variable acpi_idle_driver") because it depends on a problematic one. Signed-off-by: Rafael J. 
Wysocki --- include/acpi/processor.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 7146a8e9e9c2..24fdaa3c2899 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -417,6 +417,7 @@ static inline void acpi_processor_throttling_init(void) {} #endif /* CONFIG_ACPI_CPU_FREQ_PSS */ /* in processor_idle.c */ +extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE void acpi_processor_power_init(struct acpi_processor *pr); void acpi_processor_power_exit(struct acpi_processor *pr); -- cgit v1.2.3 From 66e600a26ee7d845d9434c3d60cef4bbf7dd3eb4 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Nov 2025 14:53:33 +0100 Subject: Revert "ACPI: processor: idle: Redefine two functions as void" Revert commit fbd401e95e56 ("ACPI: processor: idle: Redefine two functions as void") because it depends on a problematic one. Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 24fdaa3c2899..6ee4a69412de 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -419,8 +419,8 @@ static inline void acpi_processor_throttling_init(void) {} /* in processor_idle.c */ extern struct cpuidle_driver acpi_idle_driver; #ifdef CONFIG_ACPI_PROCESSOR_IDLE -void acpi_processor_power_init(struct acpi_processor *pr); -void acpi_processor_power_exit(struct acpi_processor *pr); +int acpi_processor_power_init(struct acpi_processor *pr); +int acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); -- cgit v1.2.3 From e6889323c2184c700428dd4b90a1c2c06b8ae51f Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Tue, 25 Nov 2025 15:03:24 +0100 Subject: Revert "ACPI: processor: idle: Rearrange declarations in header file" Revert commit bdf780fbcef5 ("ACPI: processor: idle: Rearrange declarations in header file") because it depends on a problematic one. Signed-off-by: Rafael J. Wysocki --- include/acpi/processor.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 6ee4a69412de..2976a6d0c54f 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -425,8 +425,6 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); void acpi_processor_unregister_idle_driver(void); -int acpi_processor_ffh_lpi_probe(unsigned int cpu); -int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ @@ -449,6 +447,11 @@ static inline void acpi_thermal_cpufreq_exit(struct cpufreq_policy *policy) } #endif /* CONFIG_CPU_FREQ */ +#ifdef CONFIG_ACPI_PROCESSOR_IDLE +extern int acpi_processor_ffh_lpi_probe(unsigned int cpu); +extern int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi); +#endif + void acpi_processor_init_invariance_cppc(void); #endif -- cgit v1.2.3 From 1a8b3501821b608383f7c7aa0f24e2006681e2b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Nov 2025 15:05:01 +0100 Subject: Revert "ACPI: processor: Remove unused empty stubs of some functions" Revert commit 5020d05b3476 ("ACPI: processor: Remove unused empty stubs of some functions") because it depends on a problematic one. Signed-off-by: Rafael J. 
Wysocki --- include/acpi/processor.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 2976a6d0c54f..ff864c1cee3a 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -425,6 +425,26 @@ int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); void acpi_processor_register_idle_driver(void); void acpi_processor_unregister_idle_driver(void); +#else +static inline int acpi_processor_power_init(struct acpi_processor *pr) +{ + return -ENODEV; +} + +static inline int acpi_processor_power_exit(struct acpi_processor *pr) +{ + return -ENODEV; +} + +static inline int acpi_processor_power_state_has_changed(struct acpi_processor *pr) +{ + return -ENODEV; +} + +static inline int acpi_processor_hotplug(struct acpi_processor *pr) +{ + return -ENODEV; +} #endif /* CONFIG_ACPI_PROCESSOR_IDLE */ /* in processor_thermal.c */ -- cgit v1.2.3 From 43ff36c4a5a574ee83b4b0d3f3d74f09a3a8c2d3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 25 Nov 2025 15:06:12 +0100 Subject: Revert "ACPI: processor: idle: Optimize ACPI idle driver registration" Revert commit 7a8c994cbb2d ("ACPI: processor: idle: Optimize ACPI idle driver registration") because it is reported to introduce a cpuidle regression leading to a kernel crash on a platform using the ACPI idle driver. Signed-off-by: Rafael J. 
Wysocki Reported-by: Borislav Petkov Tested-by: Borislav Petkov (AMD) Closes: https://lore.kernel.org/lkml/20251124200019.GIaSS5U9HhsWBotrQZ@fat_crate.local/ --- include/acpi/processor.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/acpi/processor.h b/include/acpi/processor.h index ff864c1cee3a..d0eccbd920e5 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -423,8 +423,6 @@ int acpi_processor_power_init(struct acpi_processor *pr); int acpi_processor_power_exit(struct acpi_processor *pr); int acpi_processor_power_state_has_changed(struct acpi_processor *pr); int acpi_processor_hotplug(struct acpi_processor *pr); -void acpi_processor_register_idle_driver(void); -void acpi_processor_unregister_idle_driver(void); #else static inline int acpi_processor_power_init(struct acpi_processor *pr) { -- cgit v1.2.3 From b2a38f6df9dab0b05858746edcbe2403f8f4e4ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:43 +0000 Subject: net_sched: make room for (struct qdisc_skb_cb)->pkt_segs Add a new u16 field, next to pkt_len : pkt_segs This will cache shinfo->gso_segs to speed up qdisc dequeue(). 
Move slave_dev_queue_mapping at the end of qdisc_skb_cb, and move three bits from tc_skb_cb : - post_ct - post_ct_snat - post_ct_dnat Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-2-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 94966692ccdf..9cd8b5d4b236 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -429,13 +429,16 @@ struct tcf_proto { }; struct qdisc_skb_cb { - struct { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; - }; + unsigned int pkt_len; + u16 pkt_segs; + u16 tc_classid; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; + + u16 slave_dev_queue_mapping; + u8 post_ct:1; + u8 post_ct_snat:1; + u8 post_ct_dnat:1; }; typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv); @@ -1064,11 +1067,8 @@ struct tc_skb_cb { struct qdisc_skb_cb qdisc_cb; u32 drop_reason; - u16 zone; /* Only valid if post_ct = true */ + u16 zone; /* Only valid if qdisc_skb_cb(skb)->post_ct = true */ u16 mru; - u8 post_ct:1; - u8 post_ct_snat:1; - u8 post_ct_dnat:1; }; static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb) -- cgit v1.2.3 From 2773cb0b3120eb5c4b66d949eb99853d5bae1221 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:47 +0000 Subject: net_sched: use qdisc_skb_cb(skb)->pkt_segs in bstats_update() Avoid up to two cache line misses in qdisc dequeue() to fetch skb_shinfo(skb)->gso_segs/gso_size while qdisc spinlock is held. This gives a 5 % improvement in a TX intensive workload. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-6-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 9cd8b5d4b236..cdf7a58ebcf5 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -829,6 +829,15 @@ static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb) return qdisc_skb_cb(skb)->pkt_len; } +static inline unsigned int qdisc_pkt_segs(const struct sk_buff *skb) +{ + u32 pkt_segs = qdisc_skb_cb(skb)->pkt_segs; + + DEBUG_NET_WARN_ON_ONCE(pkt_segs != + (skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1)); + return pkt_segs; +} + /* additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h) */ enum net_xmit_qdisc_t { __NET_XMIT_STOLEN = 0x00010000, @@ -870,9 +879,7 @@ static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, static inline void bstats_update(struct gnet_stats_basic_sync *bstats, const struct sk_buff *skb) { - _bstats_update(bstats, - qdisc_pkt_len(skb), - skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + _bstats_update(bstats, qdisc_pkt_len(skb), qdisc_pkt_segs(skb)); } static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, -- cgit v1.2.3 From ad50d5a3fc20327e133e2db849c6e67fc49650e6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:49 +0000 Subject: net_sched: add Qdisc_read_mostly and Qdisc_write groups It is possible to reorg Qdisc to avoid always dirtying 2 cache lines in fast path by reducing this to a single dirtied cache line. 
In current layout, we change only four/six fields in the first cache line: - q.spinlock - q.qlen - bstats.bytes - bstats.packets - some Qdisc also change q.next/q.prev In the second cache line we change in the fast path: - running - state - qstats.backlog /* --- cacheline 2 boundary (128 bytes) --- */ struct sk_buff_head gso_skb __attribute__((__aligned__(64))); /* 0x80 0x18 */ struct qdisc_skb_head q; /* 0x98 0x18 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xb0 0x10 */ /* --- cacheline 3 boundary (192 bytes) --- */ struct gnet_stats_queue qstats; /* 0xc0 0x14 */ bool running; /* 0xd4 0x1 */ /* XXX 3 bytes hole, try to pack */ unsigned long state; /* 0xd8 0x8 */ struct Qdisc * next_sched; /* 0xe0 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xe8 0x18 */ /* --- cacheline 4 boundary (256 bytes) --- */ Reorganize things to have a first cache line mostly read, then a mostly written one. This gives a ~3% increase of performance under tx stress. Note that there is an additional hole because @qstats now spans over a third cache line. 
/* --- cacheline 2 boundary (128 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_read_mostly[0] __attribute__((__aligned__(64))); /* 0x80 0 */ struct sk_buff_head gso_skb; /* 0x80 0x18 */ struct Qdisc * next_sched; /* 0x98 0x8 */ struct sk_buff_head skb_bad_txq; /* 0xa0 0x18 */ __u8 __cacheline_group_end__Qdisc_read_mostly[0]; /* 0xb8 0 */ /* XXX 8 bytes hole, try to pack */ /* --- cacheline 3 boundary (192 bytes) --- */ __u8 __cacheline_group_begin__Qdisc_write[0] __attribute__((__aligned__(64))); /* 0xc0 0 */ struct qdisc_skb_head q; /* 0xc0 0x18 */ unsigned long state; /* 0xd8 0x8 */ struct gnet_stats_basic_sync bstats __attribute__((__aligned__(16))); /* 0xe0 0x10 */ bool running; /* 0xf0 0x1 */ /* XXX 3 bytes hole, try to pack */ struct gnet_stats_queue qstats; /* 0xf4 0x14 */ /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ __u8 __cacheline_group_end__Qdisc_write[0]; /* 0x108 0 */ /* XXX 56 bytes hole, try to pack */ Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-8-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index cdf7a58ebcf5..79501499dafb 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -103,17 +103,24 @@ struct Qdisc { int pad; refcount_t refcnt; - /* - * For performance sake on SMP, we put highly modified fields at the end - */ - struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; - struct qdisc_skb_head q; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - bool running; /* must be written under qdisc spinlock */ - unsigned long state; - struct Qdisc *next_sched; - struct sk_buff_head skb_bad_txq; + /* Cache line potentially dirtied in dequeue() or __netif_reschedule(). 
*/ + __cacheline_group_begin(Qdisc_read_mostly) ____cacheline_aligned; + struct sk_buff_head gso_skb; + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + __cacheline_group_end(Qdisc_read_mostly); + + /* Fields dirtied in dequeue() fast path. */ + __cacheline_group_begin(Qdisc_write) ____cacheline_aligned; + struct qdisc_skb_head q; + unsigned long state; + struct gnet_stats_basic_sync bstats; + bool running; /* must be written under qdisc spinlock */ + + /* Note : we only change qstats.backlog in fast path. */ + struct gnet_stats_queue qstats; + __cacheline_group_end(Qdisc_write); + atomic_long_t defer_count ____cacheline_aligned_in_smp; struct llist_head defer_list; -- cgit v1.2.3 From 0170d7f47c8bb0311bc802bad52245c045f151fe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:54 +0000 Subject: net_sched: add tcf_kfree_skb_list() helper Using kfree_skb_list_reason() to free list of skbs from qdisc operations seems wrong as each skb might have a different drop reason. Cleanup __dev_xmit_skb() to call tcf_kfree_skb_list() once in preparation of the following patch. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-13-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/sch_generic.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 79501499dafb..b8092d0378a0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -1105,6 +1105,17 @@ static inline void tcf_set_drop_reason(const struct sk_buff *skb, tc_skb_cb(skb)->drop_reason = reason; } +static inline void tcf_kfree_skb_list(struct sk_buff *skb) +{ + while (unlikely(skb)) { + struct sk_buff *next = skb->next; + + prefetch(next); + kfree_skb_reason(skb, tcf_get_drop_reason(skb)); + skb = next; + } +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ -- cgit v1.2.3 From 191ff13e42a7b7824fec5b2ed84fd6481356754d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Nov 2025 08:32:55 +0000 Subject: net_sched: add qdisc_dequeue_drop() helper Some qdisc like cake, codel, fq_codel might drop packets in their dequeue() method. This is currently problematic because dequeue() runs with the qdisc spinlock held. Freeing skbs can be extremely expensive. Add qdisc_dequeue_drop() method and a new TCQ_F_DEQUEUE_DROPS so that these qdiscs can opt-in to defer the skb frees after the socket spinlock is released. TCQ_F_DEQUEUE_DROPS is an attempt to not penalize other qdiscs with an extra cache line miss. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20251121083256.674562-14-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/pkt_sched.h | 5 +++-- include/net/sch_generic.h | 30 +++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 4678db45832a..e703c507d0da 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -114,12 +114,13 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, void __qdisc_run(struct Qdisc *q); -static inline void qdisc_run(struct Qdisc *q) +static inline struct sk_buff *qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); - qdisc_run_end(q); + return qdisc_run_end(q); } + return NULL; } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b8092d0378a0..c3a7268b567e 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -88,6 +88,8 @@ struct Qdisc { #define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ #define TCQ_F_NOLOCK 0x100 /* qdisc does not require locking */ #define TCQ_F_OFFLOADED 0x200 /* qdisc is offloaded to HW */ +#define TCQ_F_DEQUEUE_DROPS 0x400 /* ->dequeue() can drop packets in q->to_free */ + u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -119,6 +121,8 @@ struct Qdisc { /* Note : we only change qstats.backlog in fast path. 
*/ struct gnet_stats_queue qstats; + + struct sk_buff *to_free; __cacheline_group_end(Qdisc_write); @@ -218,8 +222,10 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) return true; } -static inline void qdisc_run_end(struct Qdisc *qdisc) +static inline struct sk_buff *qdisc_run_end(struct Qdisc *qdisc) { + struct sk_buff *to_free = NULL; + if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -232,9 +238,16 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) if (unlikely(test_bit(__QDISC_STATE_MISSED, &qdisc->state))) __netif_schedule(qdisc); - } else { - WRITE_ONCE(qdisc->running, false); + return NULL; + } + + if (qdisc->flags & TCQ_F_DEQUEUE_DROPS) { + to_free = qdisc->to_free; + if (to_free) + qdisc->to_free = NULL; } + WRITE_ONCE(qdisc->running, false); + return to_free; } static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) @@ -1116,6 +1129,17 @@ static inline void tcf_kfree_skb_list(struct sk_buff *skb) } } +static inline void qdisc_dequeue_drop(struct Qdisc *q, struct sk_buff *skb, + enum skb_drop_reason reason) +{ + DEBUG_NET_WARN_ON_ONCE(!(q->flags & TCQ_F_DEQUEUE_DROPS)); + DEBUG_NET_WARN_ON_ONCE(q->flags & TCQ_F_NOLOCK); + + tcf_set_drop_reason(skb, reason); + skb->next = q->to_free; + q->to_free = skb; +} + /* Instead of calling kfree_skb() while root qdisc lock is held, * queue the skb for future freeing at end of __dev_xmit_skb() */ -- cgit v1.2.3 From 96ce2aeb15bd8672ab47abe547e2a1f8ba3886ff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 21 Nov 2025 11:50:58 -0400 Subject: vfio/pci: Add vfio_pci_dma_buf_iommufd_map() This function is used to establish the "private interconnect" between the VFIO DMABUF exporter and the iommufd DMABUF importer. This is intended to be a temporary API until the core DMABUF interface is improved to natively support a private interconnect and revocable negotiation. This function should only be called by iommufd when trying to map a DMABUF. 
For now iommufd will only support VFIO DMABUFs. The following improvements are needed in the DMABUF API to generically support more exporters with iommufd/kvm type importers that cannot use the DMA API: 1) Revoke semantics. VFIO needs to be able to prevent access to the MMIO during FLR, and so it will use dma_buf_move_notify() to prevent access. iommufd does not support fault handling so it cannot implement the full move_notify. Instead, if revoke is negotiated, the exporter promises not to use move_notify() unless the importer can experience failures. iommufd will unmap the dmabuf from the iommu page tables while it is revoked. 2) Private interconnect negotiation. iommufd will only be able to map a "private interconnect" that provides a phys_addr_t and a struct p2pdma_provider * to describe the memory. It cannot use a DMA mapped scatterlist since it is directly calling iommu_map(). 3) NULL device during dma_buf_dynamic_attach(). Since iommufd doesn't use the DMA API it doesn't have a DMAable struct device to pass here.
Link: https://patch.msgid.link/r/1-v2-b2c110338e3f+5c2-iommufd_dmabuf_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Tested-by: Nicolin Chen Tested-by: Shuai Xue Acked-by: Alex Williamson Signed-off-by: Jason Gunthorpe --- include/linux/vfio_pci_core.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index c9466ba323fa..6a3074f2cf1c 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -28,6 +28,7 @@ struct vfio_pci_core_device; struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_phys_vec; +struct dma_buf_attachment; struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, @@ -203,4 +204,7 @@ VFIO_IOREAD_DECLARATION(32) VFIO_IOREAD_DECLARATION(64) #endif +int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, + struct dma_buf_phys_vec *phys); + #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From a4e6512a79d8486dccf3e8b066e5d6bd5ff95446 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:42 +0100 Subject: PM: QoS: Introduce a CPU system wakeup QoS limit Some platforms support multiple low power states for CPUs that can be used when entering system-wide suspend. Currently we are always selecting the deepest possible state for the CPUs, which can break the system wakeup latency constraint that may be required for a use case. Let's take the first step towards addressing this problem, by introducing an interface for user space, that allows us to specify the CPU system wakeup QoS limit. Subsequent changes will start taking into account the new QoS limit. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-2-ulf.hansson@linaro.org Signed-off-by: Rafael J.
Wysocki --- include/linux/pm_qos.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 4a69d4af3ff8..6cea4455f867 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -162,6 +162,15 @@ static inline void cpu_latency_qos_update_request(struct pm_qos_request *req, static inline void cpu_latency_qos_remove_request(struct pm_qos_request *req) {} #endif +#ifdef CONFIG_PM_QOS_CPU_SYSTEM_WAKEUP +s32 cpu_wakeup_latency_qos_limit(void); +#else +static inline s32 cpu_wakeup_latency_qos_limit(void) +{ + return PM_QOS_RESUME_LATENCY_NO_CONSTRAINT; +} +#endif + #ifdef CONFIG_PM enum pm_qos_flags_status __dev_pm_qos_flags(struct device *dev, s32 mask); enum pm_qos_flags_status dev_pm_qos_flags(struct device *dev, s32 mask); -- cgit v1.2.3 From 8e7de6dc420979f4e4443807b71dcc8b72d8c4a9 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:43 +0100 Subject: pmdomain: Respect the CPU system wakeup QoS limit for s2idle A CPU system wakeup QoS limit may have been requested by user space. To avoid breaking this constraint when entering a low power state during s2idle through genpd, let's extend the corresponding genpd governor for CPUs. More precisely, during s2idle let the genpd governor select a suitable domain idle state, by taking into account the QoS limit. Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-3-ulf.hansson@linaro.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/pm_domain.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index f67a2cb7d781..93ba0143ca47 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -153,6 +153,7 @@ enum genpd_sync_state { }; struct dev_power_governor { + bool (*system_power_down_ok)(struct dev_pm_domain *domain); bool (*power_down_ok)(struct dev_pm_domain *domain); bool (*suspend_ok)(struct device *dev); }; -- cgit v1.2.3 From 99b42445f4a4aaff75eca24dfc9e6e376292dd48 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 25 Nov 2025 12:26:45 +0100 Subject: sched: idle: Respect the CPU system wakeup QoS limit for s2idle A CPU system wakeup QoS limit may have been requested by user space. To avoid breaking this constraint when entering a low power state during s2idle, let's start to take into account the QoS limit. Acked-by: Peter Zijlstra (Intel) Reviewed-by: Dhruva Gole Reviewed-by: Kevin Hilman (TI) Tested-by: Kevin Hilman (TI) Signed-off-by: Ulf Hansson Link: https://patch.msgid.link/20251125112650.329269-5-ulf.hansson@linaro.org Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpuidle.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a9ee4fe55dcf..4073690504a7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -248,7 +248,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 latency_limit_ns); extern int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev); + struct cpuidle_device *dev, + u64 latency_limit_ns); extern void cpuidle_use_deepest_state(u64 latency_limit_ns); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, @@ -256,7 +257,8 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, u64 latency_limit_ns) {return -ENODEV; } static inline int cpuidle_enter_s2idle(struct cpuidle_driver *drv, - struct cpuidle_device *dev) + struct cpuidle_device *dev, + u64 latency_limit_ns) {return -ENODEV; } static inline void cpuidle_use_deepest_state(u64 latency_limit_ns) { -- cgit v1.2.3 From 35a5c37cb9f1f947dff18e7cfc75a8cfcfd557ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:01 +0100 Subject: cpumask: Cache num_possible_cpus() Reevaluating num_possible_cpus() over and over does not make sense. That becomes a constant after init as cpu_possible_mask is marked ro_after_init. Cache the value during initialization and provide that for consumption. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Yury Norov Reviewed-by: Mathieu Desnoyers Reviewed-by: Shrikanth Hegde Link: https://patch.msgid.link/20251119172549.578653738@linutronix.de --- include/linux/cpumask.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index feba06eb0a42..66694ee8d86e 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -126,6 +126,7 @@ extern struct cpumask __cpu_dying_mask; #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; +extern unsigned int __num_possible_cpus; extern cpumask_t cpus_booted_once_mask; @@ -1152,13 +1153,13 @@ void init_cpu_possible(const struct cpumask *src); #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) -#define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); +void set_cpu_possible(unsigned int cpu, bool possible); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * @@ -1211,7 +1212,12 @@ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } -#define num_possible_cpus() cpumask_weight(cpu_possible_mask) + +static __always_inline unsigned int num_possible_cpus(void) +{ + return __num_possible_cpus; +} + #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() 
cpumask_weight(cpu_active_mask) -- cgit v1.2.3 From 539115f08cf850b9fdc6526b31da0839ff6c1631 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:03 +0100 Subject: sched/mmcid: Convert mm CID mask to a bitmap This is truly a bitmap and just conveniently uses a cpumask because the maximum size of the bitmap is nr_cpu_ids. But that prevents to do searches for a zero bit in a limited range, which is helpful to provide an efficient mechanism to consolidate the CID space when the number of users decreases. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Acked-by: Yury Norov (NVIDIA) Link: https://patch.msgid.link/20251119172549.642866767@linutronix.de --- include/linux/mm_types.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 67a7bdf772f7..bafb81b33922 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1342,13 +1342,13 @@ static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) } /* Accessor for struct mm_struct's cidmask. 
*/ -static inline cpumask_t *mm_cidmask(struct mm_struct *mm) +static inline unsigned long *mm_cidmask(struct mm_struct *mm) { unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); /* Skip mm_cpus_allowed */ cid_bitmap += cpumask_size(); - return (struct cpumask *)cid_bitmap; + return (unsigned long *)cid_bitmap; } static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) @@ -1363,7 +1363,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; raw_spin_lock_init(&mm->mm_cid.lock); cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - cpumask_clear(mm_cidmask(mm)); + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); } static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) @@ -1384,7 +1384,8 @@ static inline void mm_destroy_cid(struct mm_struct *mm) static inline unsigned int mm_cid_size(void) { - return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ + /* mm_cpus_allowed(), mm_cidmask(). */ + return cpumask_size() + bitmap_size(num_possible_cpus()); } #else /* CONFIG_SCHED_MM_CID */ -- cgit v1.2.3 From 2b1642b881088bbf73fcb1147c474a198ec46729 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:05 +0100 Subject: signal: Move MMCID exit out of sighand lock There is no need anymore to keep this under sighand lock as the current code and the upcoming replacement are not depending on the exit state of a task anymore. That allows to use a mutex in the exit path. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.706439391@linutronix.de --- include/linux/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 64f080d6ed6e..c411ae021bc5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2298,7 +2298,7 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); -void sched_mm_cid_exit_signals(struct task_struct *t); +void sched_mm_cid_exit(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid.cid; @@ -2307,7 +2307,7 @@ static inline int task_mm_cid(struct task_struct *t) static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } -static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } +static inline void sched_mm_cid_exit(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* -- cgit v1.2.3 From bf070520e398679cd582b3c3e44107bf22c143ba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:07 +0100 Subject: sched/mmcid: Move initialization out of line It's getting bigger soon, so just move it out of line to the rest of the code. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.769636491@linutronix.de --- include/linux/mm_types.h | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bafb81b33922..3b7d05e7169c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1351,20 +1351,7 @@ static inline unsigned long *mm_cidmask(struct mm_struct *mm) return (unsigned long *)cid_bitmap; } -static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) -{ - int i; - - for_each_possible_cpu(i) { - struct mm_cid_pcpu *pcpu = per_cpu_ptr(mm->mm_cid.pcpu, i); - - pcpu->cid = MM_CID_UNSET; - } - mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; - raw_spin_lock_init(&mm->mm_cid.lock); - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); - bitmap_zero(mm_cidmask(mm), num_possible_cpus()); -} +void mm_init_cid(struct mm_struct *mm, struct task_struct *p); static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) { -- cgit v1.2.3 From b0c3d51b54f8a4f4c809432d210c0c983d5cd97e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:09 +0100 Subject: sched/mmcid: Provide precomputed maximal value Reading mm::mm_users and mm:::mm_cid::nr_cpus_allowed every time to compute the maximal CID value is just wasteful as that value is only changing on fork(), exit() and eventually when the affinity changes. So it can be easily precomputed at those points and provided in mm::mm_cid for consumption in the hot path. But there is an issue with using mm::mm_users for accounting because that does not necessarily reflect the number of user space tasks as other kernel code can take temporary references on the MM which skew the picture. 
Solve that by adding a users counter to struct mm_mm_cid, which is modified by fork() and exit() and used for precomputing under mm_mm_cid::lock. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.832764634@linutronix.de --- include/linux/rseq_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index d7e8071b626a..0fab369999b6 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -117,14 +117,20 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU + * @max_cids: The exclusive maximum CID value for allocation and convergence * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. + * @users: The number of tasks sharing this MM. Separate from mm::mm_users + * as that is modified by mmget()/mm_put() by other entities which + * do not actually share the MM. * @lock: Spinlock to protect all fields except @pcpu. It also protects * the MM cid cpumask and the MM cidmask bitmap. */ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; + unsigned int max_cids; unsigned int nr_cpus_allowed; + unsigned int users; raw_spinlock_t lock; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ -- cgit v1.2.3 From 51dd92c71a38647803478fb81e1812286a8998b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:11 +0100 Subject: sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex Prepare for the new CID management scheme which puts the CID ownership transition into the fork() and exit() slow path by serializing sched_mm_cid_fork()/exit() with it, so task list and cpu mask walks can be done in interruptible and preemptible code. 
The contention on it is not worse than on other concurrency controls in the fork()/exit() machinery. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172549.895826703@linutronix.de --- include/linux/rseq_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 0fab369999b6..574aba6fe97c 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -125,6 +125,7 @@ struct mm_cid_pcpu { * do not actually share the MM. * @lock: Spinlock to protect all fields except @pcpu. It also protects * the MM cid cpumask and the MM cidmask bitmap. + * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { struct mm_cid_pcpu __percpu *pcpu; @@ -132,6 +133,7 @@ struct mm_mm_cid { unsigned int nr_cpus_allowed; unsigned int users; raw_spinlock_t lock; + struct mutex mutex; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; -- cgit v1.2.3 From 23343b6b09acb4bf97f34ed60e135000ca57ede1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:12 +0100 Subject: sched/mmcid: Introduce per task/CPU ownership infrastructure The MM CID management has two fundamental requirements: 1) It has to guarantee that at no given point in time the same CID is used by concurrent tasks in userspace. 2) The CID space must not exceed the number of possible CPUs in a system. While most allocators (glibc, tcmalloc, jemalloc) do not care about that, there seems to be at least librseq depending on it. The CID space compaction itself is not a functional correctness requirement, it is only a useful optimization mechanism to reduce the memory foot print in unused user space pools. 
The optimal CID space is: min(nr_tasks, nr_cpus_allowed); Where @nr_tasks is the number of actual user space threads associated to the mm and @nr_cpus_allowed is the superset of all task affinities. It is growth only as it would be insane to take a racy snapshot of all task affinities when the affinity of one task changes just to redo it 2 milliseconds later when the next task changes its affinity. That means that as long as the number of tasks is lower than or equal to the number of CPUs allowed, each task owns a CID. If the number of tasks exceeds the number of CPUs allowed it switches to per CPU mode, where the CPUs own the CIDs and the tasks borrow them as long as they are scheduled in. For transition periods CIDs can go beyond the optimal space as long as they don't go beyond the number of possible CPUs. The current upstream implementation adds overhead into task migration to keep the CID with the task. It also has to do the CID space consolidation work from a task work in the exit to user space path. As that work is assigned to a random task related to a MM this can inflict unwanted exit latencies. This can be done differently by implementing a strict CID ownership mechanism. Either the CIDs are owned by the tasks or by the CPUs. The latter provides less locality when tasks are heavily migrating, but there is no justification to optimize for overcommit scenarios and thereby penalize everyone else. Provide the basic infrastructure to implement this: - Change the UNSET marker to BIT(31) from ~0U - Add the ONCPU marker as BIT(30) - Add the TRANSIT marker as BIT(29) That allows checking for ownership trivially and provides a simple check for UNSET as well. The TRANSIT marker is required to prevent CID space exhaustion when switching from per CPU to per task mode.
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251119172549.960252358@linutronix.de --- include/linux/rseq_types.h | 4 +++- include/linux/sched.h | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 574aba6fe97c..87854effe1ad 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -92,7 +92,9 @@ struct rseq_data { }; #ifdef CONFIG_SCHED_MM_CID -#define MM_CID_UNSET (~0U) +#define MM_CID_UNSET BIT(31) +#define MM_CID_ONCPU BIT(30) +#define MM_CID_TRANSIT BIT(29) /** * struct sched_mm_cid - Storage for per task MM CID data diff --git a/include/linux/sched.h b/include/linux/sched.h index c411ae021bc5..9eec409745f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2299,16 +2299,16 @@ void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit(struct task_struct *t); -static inline int task_mm_cid(struct task_struct *t) +static __always_inline int task_mm_cid(struct task_struct *t) { - return t->mm_cid.cid; + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit(struct task_struct *t) { } -static inline int task_mm_cid(struct task_struct *t) +static __always_inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is -- cgit v1.2.3 From 9a723ed7facff6955da8d64cc9de7066038036c1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:14 +0100 Subject: sched/mmcid: Provide new scheduler CID mechanism The 
MM CID management has two fundamental requirements: 1) It has to guarantee that at no given point in time the same CID is used by concurrent tasks in userspace. 2) The CID space must not exceed the number of possible CPUs in a system. While most allocators (glibc, tcmalloc, jemalloc) do not care about that, there seems to be at least some LTTng library depending on it. The CID space compaction itself is not a functional correctness requirement, it is only a useful optimization mechanism to reduce the memory footprint in unused user space pools. The optimal CID space is: min(nr_tasks, nr_cpus_allowed); Where @nr_tasks is the number of actual user space threads associated to the mm and @nr_cpus_allowed is the superset of all task affinities. It is growth only as it would be insane to take a racy snapshot of all task affinities when the affinity of one task changes just to redo it 2 milliseconds later when the next task changes its affinity. That means that as long as the number of tasks is lower than or equal to the number of CPUs allowed, each task owns a CID. If the number of tasks exceeds the number of CPUs allowed it switches to per CPU mode, where the CPUs own the CIDs and the tasks borrow them as long as they are scheduled in. For transition periods CIDs can go beyond the optimal space as long as they don't go beyond the number of possible CPUs. The current upstream implementation adds overhead into task migration to keep the CID with the task. It also has to do the CID space consolidation work from a task work in the exit to user space path. As that work is assigned to a random task related to a MM this can inflict unwanted exit latencies. Implement the context switch parts of a strict ownership mechanism to address this. This removes most of the work from the task which schedules out. Only during transitioning from per CPU to per task ownership it is required to drop the CID when leaving the CPU to prevent CID space exhaustion.
Other than that scheduling out is just a single check and branch. The task which schedules in has to check whether: 1) The ownership mode changed 2) The CID is within the optimal CID space In stable situations this results in zero work. The only short disruption is when ownership mode changes or when the associated CID is not in the optimal CID space. The latter only happens when tasks exit and therefore the optimal CID space shrinks. That mechanism is strictly optimized for the common case where no change happens. The only case where it actually causes a temporary one time spike is on mode changes when and only when a lot of tasks related to a MM schedule exactly at the same time and have eventually to compete on allocating a CID from the bitmap. In the sysbench test case which triggered the spinlock contention in the initial CID code, __schedule() drops significantly in perf top on a 128 Core (256 threads) machine when running sysbench with 255 threads, which fits into the task mode limit of 256 together with the parent thread: Upstream rseq/perf branch +CID rework 0.42% 0.37% 0.32% [k] __schedule Increasing the number of threads to 256, which puts the test process into per CPU mode looks about the same. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.023984859@linutronix.de --- include/linux/rseq.h | 8 ++++---- include/linux/rseq_types.h | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index bf8a6bf315f3..4c0e8bdd2dd9 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -73,13 +73,13 @@ static __always_inline void rseq_sched_switch_event(struct task_struct *t) } /* - * Invoked from __set_task_cpu() when a task migrates to enforce an IDs - * update. 
+ * Invoked from __set_task_cpu() when a task migrates or from + * mm_cid_schedin() when the CID changes to enforce an IDs update. * * This does not raise TIF_NOTIFY_RESUME as that happens in * rseq_sched_switch_event(). */ -static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) +static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) { t->rseq.event.ids_changed = true; } @@ -168,7 +168,7 @@ static inline void rseq_fork(struct task_struct *t, u64 clone_flags) static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } -static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } +static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 87854effe1ad..66b1482e1146 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -119,23 +119,31 @@ struct mm_cid_pcpu { /** * struct mm_mm_cid - Storage for per MM CID data * @pcpu: Per CPU storage for CIDs associated to a CPU + * @percpu: Set, when CIDs are in per CPU mode + * @transit: Set to MM_CID_TRANSIT during a mode change transition phase * @max_cids: The exclusive maximum CID value for allocation and convergence + * @lock: Spinlock to protect all fields except @pcpu. It also protects + * the MM cid cpumask and the MM cidmask bitmap. + * @mutex: Mutex to serialize forks and exits related to this mm * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. * @users: The number of tasks sharing this MM. 
Separate from mm::mm_users * as that is modified by mmget()/mm_put() by other entities which * do not actually share the MM. - * @lock: Spinlock to protect all fields except @pcpu. It also protects - * the MM cid cpumask and the MM cidmask bitmap. - * @mutex: Mutex to serialize forks and exits related to this mm */ struct mm_mm_cid { + /* Hotpath read mostly members */ struct mm_cid_pcpu __percpu *pcpu; + unsigned int percpu; + unsigned int transit; unsigned int max_cids; - unsigned int nr_cpus_allowed; - unsigned int users; + raw_spinlock_t lock; struct mutex mutex; + + /* Low frequency modified */ + unsigned int nr_cpus_allowed; + unsigned int users; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; -- cgit v1.2.3 From fbd0e71dc370af73f6b316e4de9eed273dd90340 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:16 +0100 Subject: sched/mmcid: Provide CID ownership mode fixup functions CIDs are either owned by tasks or by CPUs. The ownership mode depends on the number of tasks related to a MM and the number of CPUs on which these tasks are theoretically allowed to run on. Theoretically because that number is the superset of CPU affinities of all tasks which only grows and never shrinks. Switching to per CPU mode happens when the user count becomes greater than the maximum number of CIDs, which is calculated by: opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); max_cids = min(1.25 * opt_cids, nr_cpu_ids); The +25% allowance is useful for tight CPU masks in scenarios where only a few threads are created and destroyed to avoid frequent mode switches. Though this allowance shrinks, the closer opt_cids becomes to nr_cpu_ids, which is the (unfortunate) hard ABI limit. 
At the point of switching to per CPU mode the new user is not yet visible in the system, so the task which initiated the fork() runs the fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and either transfers each task's owned CID to the CPU the task runs on or drops it into the CID pool if a task is not on a CPU at that point in time. Tasks which schedule in before the task walk reaches them do the handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes it's guaranteed that no task related to that MM owns a CID anymore. Switching back to task mode happens when the user count goes below the threshold which was recorded on the per CPU mode switch: pcpu_thrs = min(opt_cids - (opt_cids / 4), nr_cpu_ids / 2); This threshold is updated when an affinity change increases the number of allowed CPUs for the MM, which might cause a switch back to per task mode. If the switch back was initiated by an exiting task, then that task runs the fixup function. If it was initiated by an affinity change, then it's run either in the deferred update function in context of a workqueue or by a task which forks a new one or by a task which exits. Whatever happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible CPUs and either transfers the CPU owned CIDs to a related task which runs on the CPU or drops them into the pool. Tasks which schedule in on a CPU which the walk did not cover yet do the handover themselves. This transition from CPU to per task ownership happens in two phases: 1) mm:mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task CID and denotes that the CID is only temporarily owned by the task. When it schedules out the task drops the CID back into the pool if this bit is set. 2) The initiating context walks the per CPU space and after completion clears mm:mm_cid.transit. After that point the CIDs are strictly task owned again.
This two phase transition is required to prevent CID space exhaustion during the transition as a direct transfer of ownership would fail if two tasks are scheduled in on the same CPU before the fixup freed per CPU CIDs. When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID related to that MM is owned by a CPU anymore. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.088189028@linutronix.de --- include/linux/rseq_types.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 66b1482e1146..a3a4f3f10862 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -122,14 +122,15 @@ struct mm_cid_pcpu { * @percpu: Set, when CIDs are in per CPU mode * @transit: Set to MM_CID_TRANSIT during a mode change transition phase * @max_cids: The exclusive maximum CID value for allocation and convergence - * @lock: Spinlock to protect all fields except @pcpu. It also protects - * the MM cid cpumask and the MM cidmask bitmap. + * @lock: Spinlock to protect against affinity setting which can't take @mutex * @mutex: Mutex to serialize forks and exits related to this mm * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map * is growth only. * @users: The number of tasks sharing this MM. Separate from mm::mm_users * as that is modified by mmget()/mm_put() by other entities which * do not actually share the MM. + * @pcpu_thrs: Threshold for switching back from per CPU mode + * @update_deferred: A deferred switch back to per task mode is pending. 
*/ struct mm_mm_cid { /* Hotpath read mostly members */ @@ -144,6 +145,8 @@ struct mm_mm_cid { /* Low frequency modified */ unsigned int nr_cpus_allowed; unsigned int users; + unsigned int pcpu_thrs; + unsigned int update_deferred; }____cacheline_aligned_in_smp; #else /* CONFIG_SCHED_MM_CID */ struct mm_mm_cid { }; -- cgit v1.2.3 From c809f081fe400cb1b9898f4791c0d33146315161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:18 +0100 Subject: irqwork: Move data struct to a types header ... to avoid header recursion hell. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.152813625@linutronix.de --- include/linux/irq_work.h | 9 ++------- include/linux/irq_work_types.h | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 7 deletions(-) create mode 100644 include/linux/irq_work_types.h (limited to 'include') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 136f2980cba3..c5afd053ae32 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -2,8 +2,9 @@ #ifndef _LINUX_IRQ_WORK_H #define _LINUX_IRQ_WORK_H -#include +#include #include +#include /* * An entry can be in one of four states: @@ -14,12 +15,6 @@ * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed */ -struct irq_work { - struct __call_single_node node; - void (*func)(struct irq_work *); - struct rcuwait irqwait; -}; - #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ diff --git a/include/linux/irq_work_types.h b/include/linux/irq_work_types.h new file mode 100644 index 000000000000..73abec5bb06e --- /dev/null +++ b/include/linux/irq_work_types.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IRQ_WORK_TYPES_H +#define _LINUX_IRQ_WORK_TYPES_H + +#include +#include + +struct irq_work { + struct __call_single_node node; 
+ void (*func)(struct irq_work *); + struct rcuwait irqwait; +}; + +#endif -- cgit v1.2.3 From 9da6ccbcea3de1fa704202e3346fe6c0226bfc18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:20 +0100 Subject: sched/mmcid: Implement deferred mode change When affinity changes cause an increase of the number of CPUs allowed for tasks which are related to a MM, that might result in a situation where the ownership mode can go back from per CPU mode to per task mode. As affinity changes happen with runqueue lock held there is no way to do the actual mode change and required fixup right there. Add the infrastructure to defer it to a workqueue. The scheduled work can race with a fork() or exit(). Whatever happens first takes care of it. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.216484739@linutronix.de --- include/linux/rseq_types.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index a3a4f3f10862..81fbb8885e8d 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -2,7 +2,9 @@ #ifndef _LINUX_RSEQ_TYPES_H #define _LINUX_RSEQ_TYPES_H +#include #include +#include #ifdef CONFIG_RSEQ struct rseq; @@ -122,6 +124,8 @@ struct mm_cid_pcpu { * @percpu: Set, when CIDs are in per CPU mode * @transit: Set to MM_CID_TRANSIT during a mode change transition phase * @max_cids: The exclusive maximum CID value for allocation and convergence + * @irq_work: irq_work to handle the affinity mode change case + * @work: Regular work to handle the affinity mode change case * @lock: Spinlock to protect against affinity setting which can't take @mutex * @mutex: Mutex to serialize forks and exits related to this mm * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map.
The map @@ -139,6 +143,10 @@ struct mm_mm_cid { unsigned int transit; unsigned int max_cids; + /* Rarely used. Moves @lock and @mutex into the second cacheline */ + struct irq_work irq_work; + struct work_struct work; + raw_spinlock_t lock; struct mutex mutex; -- cgit v1.2.3 From 653fda7ae73d8033dedb65537acac0c2c287dc3f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Nov 2025 18:27:22 +0100 Subject: sched/mmcid: Switch over to the new mechanism Now that all pieces are in place, change the implementations of sched_mm_cid_fork() and sched_mm_cid_exit() to adhere to the new strict ownership scheme and switch context_switch() over to use the new mm_cid_schedin() functionality. The common case is that there is no mode change required, which makes fork() and exit() just update the user count and the constraints. In case that a new user would exceed the CID space limit the fork() context handles the transition to per CPU mode with mm::mm_cid::mutex held. exit() handles the transition back to per task mode when the user count drops below the switch back threshold. fork() might also be forced to handle a deferred switch back to per task mode, when an affinity change increased the number of allowed CPUs enough. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251119172550.280380631@linutronix.de --- include/linux/rseq.h | 19 ------------------- include/linux/rseq_types.h | 8 ++++---- 2 files changed, 4 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/rseq.h b/include/linux/rseq.h index 4c0e8bdd2dd9..2266f4dc77b6 100644 --- a/include/linux/rseq.h +++ b/include/linux/rseq.h @@ -84,24 +84,6 @@ static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) t->rseq.event.ids_changed = true; } -/* - * Invoked from switch_mm_cid() in context switch when the task gets a MM - * CID assigned.
- * - * This does not raise TIF_NOTIFY_RESUME as that happens in - * rseq_sched_switch_event(). - */ -static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) -{ - /* - * Requires a comparison as the switch_mm_cid() code does not - * provide a conditional for it readily. So avoid excessive updates - * when nothing changes. - */ - if (t->rseq.ids.mm_cid != cid) - t->rseq.event.ids_changed = true; -} - /* Enforce a full update after RSEQ registration and when execve() failed */ static inline void rseq_force_update(void) { @@ -169,7 +151,6 @@ static inline void rseq_handle_slowpath(struct pt_regs *regs) { } static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } static inline void rseq_sched_switch_event(struct task_struct *t) { } static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } -static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } static inline void rseq_force_update(void) { } static inline void rseq_virt_userspace_exit(void) { } static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } diff --git a/include/linux/rseq_types.h b/include/linux/rseq_types.h index 81fbb8885e8d..332dc14b81c9 100644 --- a/include/linux/rseq_types.h +++ b/include/linux/rseq_types.h @@ -101,18 +101,18 @@ struct rseq_data { }; /** * struct sched_mm_cid - Storage for per task MM CID data * @active: MM CID is active for the task - * @cid: The CID associated to the task - * @last_cid: The last CID associated to the task + * @cid: The CID associated to the task either permanently or + * borrowed from the CPU */ struct sched_mm_cid { unsigned int active; unsigned int cid; - unsigned int last_cid; }; /** * struct mm_cid_pcpu - Storage for per CPU MM_CID data - * @cid: The CID associated to the CPU + * @cid: The CID associated to the CPU either permanently or + * while a task with a CID is running */ struct mm_cid_pcpu { unsigned int cid; -- cgit v1.2.3 
From ec95cd103c3a1e2567927014e4a710416cde3e52 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Tue, 25 Nov 2025 15:13:27 -0800 Subject: hfs/hfsplus: move on-disk layout declarations into hfs_common.h Currently, HFS declares on-disk layout's metadata structures in fs/hfs/hfs.h and HFS+ declares it in fs/hfsplus/hfsplus_raw.h. However, HFS and HFS+ on-disk layouts have some similarity and overlapping in declarations. As a result, fs/hfs/hfs.h and fs/hfsplus/hfsplus_raw.h contain multiple duplicated declarations. Moreover, both HFS and HFS+ drivers contain completely similar implemented functionality in multiple places. This patch is moving the on-disk layout declarations from fs/hfs/hfs.h and fs/hfsplus/hfsplus_raw.h into include/linux/hfs_common.h with the goal to exclude the duplication in declarations. Also, this patch prepares the basis for creating a hfslib that can aggregate common functionality without necessity to duplicate the same code in HFS and HFS+ drivers. Signed-off-by: Viacheslav Dubeyko cc: John Paul Adrian Glaubitz cc: Yangtao Li cc: linux-fsdevel@vger.kernel.org Signed-off-by: Viacheslav Dubeyko --- include/linux/hfs_common.h | 633 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 633 insertions(+) (limited to 'include') diff --git a/include/linux/hfs_common.h b/include/linux/hfs_common.h index 8838ca2f3d08..dadb5e0aa8a3 100644 --- a/include/linux/hfs_common.h +++ b/include/linux/hfs_common.h @@ -17,4 +17,637 @@ pr_debug("pid %d:%s:%d %s(): " fmt, \ current->pid, __FILE__, __LINE__, __func__, ##__VA_ARGS__) \ +/* + * Format of structures on disk + * Information taken from Apple Technote #1150 (HFS Plus Volume Format) + */ + +/* offsets to various blocks */ +#define HFS_DD_BLK 0 /* Driver Descriptor block */ +#define HFS_PMAP_BLK 1 /* First block of partition map */ +#define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ + +/* magic numbers for various disk blocks */ +#define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor 
map */ +#define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ +#define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ +#define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ +#define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ + +#define HFSPLUS_VOLHEAD_SIG 0x482b +#define HFSPLUS_VOLHEAD_SIGX 0x4858 +#define HFSPLUS_SUPER_MAGIC 0x482b + +#define HFSP_WRAP_MAGIC 0x4244 +#define HFSP_WRAP_ATTRIB_SLOCK 0x8000 +#define HFSP_WRAP_ATTRIB_SPARED 0x0200 + +#define HFSP_WRAPOFF_SIG 0x00 +#define HFSP_WRAPOFF_ATTRIB 0x0A +#define HFSP_WRAPOFF_ABLKSIZE 0x14 +#define HFSP_WRAPOFF_ABLKSTART 0x1C +#define HFSP_WRAPOFF_EMBEDSIG 0x7C +#define HFSP_WRAPOFF_EMBEDEXT 0x7E + +#define HFSP_HARDLINK_TYPE 0x686c6e6b /* 'hlnk' */ +#define HFSP_HFSPLUS_CREATOR 0x6866732b /* 'hfs+' */ + +#define HFSP_SYMLINK_TYPE 0x736c6e6b /* 'slnk' */ +#define HFSP_SYMLINK_CREATOR 0x72686170 /* 'rhap' */ + +#define HFSP_MOUNT_VERSION 0x482b4c78 /* 'H+Lx' */ + +#define HFSP_HIDDENDIR_NAME \ + "\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80\xe2\x90\x80HFS+ Private Data" + +/* various FIXED size parameters */ +#define HFS_SECTOR_SIZE 512 /* size of an HFS sector */ +#define HFS_SECTOR_SIZE_BITS 9 /* log_2(HFS_SECTOR_SIZE) */ +#define HFS_MAX_VALENCE 32767U + +#define HFSPLUS_SECTOR_SIZE HFS_SECTOR_SIZE +#define HFSPLUS_SECTOR_SHIFT HFS_SECTOR_SIZE_BITS +#define HFSPLUS_VOLHEAD_SECTOR 2 +#define HFSPLUS_MIN_VERSION 4 +#define HFSPLUS_CURRENT_VERSION 5 + +#define HFS_NAMELEN 31 /* maximum length of an HFS filename */ +#define HFS_MAX_NAMELEN 128 + +#define HFSPLUS_MAX_STRLEN 255 +#define HFSPLUS_ATTR_MAX_STRLEN 127 + +/* Meanings of the drAtrb field of the MDB, + * Reference: _Inside Macintosh: Files_ p. 
2-61 + */ +#define HFS_SB_ATTRIB_HLOCK (1 << 7) +#define HFS_SB_ATTRIB_UNMNT (1 << 8) +#define HFS_SB_ATTRIB_SPARED (1 << 9) +#define HFS_SB_ATTRIB_INCNSTNT (1 << 11) +#define HFS_SB_ATTRIB_SLOCK (1 << 15) + +/* values for hfs_cat_rec.cdrType */ +#define HFS_CDR_DIR 0x01 /* folder (directory) */ +#define HFS_CDR_FIL 0x02 /* file */ +#define HFS_CDR_THD 0x03 /* folder (directory) thread */ +#define HFS_CDR_FTH 0x04 /* file thread */ + +/* legal values for hfs_ext_key.FkType and hfs_file.fork */ +#define HFS_FK_DATA 0x00 +#define HFS_FK_RSRC 0xFF + +/* bits in hfs_fil_entry.Flags */ +#define HFS_FIL_LOCK 0x01 /* locked */ +#define HFS_FIL_THD 0x02 /* file thread */ +#define HFS_FIL_DOPEN 0x04 /* data fork open */ +#define HFS_FIL_ROPEN 0x08 /* resource fork open */ +#define HFS_FIL_DIR 0x10 /* directory (always clear) */ +#define HFS_FIL_NOCOPY 0x40 /* copy-protected file */ +#define HFS_FIL_USED 0x80 /* open */ + +/* bits in hfs_dir_entry.Flags. dirflags is 16 bits. */ +#define HFS_DIR_LOCK 0x01 /* locked */ +#define HFS_DIR_THD 0x02 /* directory thread */ +#define HFS_DIR_INEXPFOLDER 0x04 /* in a shared area */ +#define HFS_DIR_MOUNTED 0x08 /* mounted */ +#define HFS_DIR_DIR 0x10 /* directory (always set) */ +#define HFS_DIR_EXPFOLDER 0x20 /* share point */ + +/* bits hfs_finfo.fdFlags */ +#define HFS_FLG_INITED 0x0100 +#define HFS_FLG_LOCKED 0x1000 +#define HFS_FLG_INVISIBLE 0x4000 + +/* Some special File ID numbers */ +#define HFS_POR_CNID 1 /* Parent Of the Root */ +#define HFSPLUS_POR_CNID HFS_POR_CNID +#define HFS_ROOT_CNID 2 /* ROOT directory */ +#define HFSPLUS_ROOT_CNID HFS_ROOT_CNID +#define HFS_EXT_CNID 3 /* EXTents B-tree */ +#define HFSPLUS_EXT_CNID HFS_EXT_CNID +#define HFS_CAT_CNID 4 /* CATalog B-tree */ +#define HFSPLUS_CAT_CNID HFS_CAT_CNID +#define HFS_BAD_CNID 5 /* BAD blocks file */ +#define HFSPLUS_BAD_CNID HFS_BAD_CNID +#define HFS_ALLOC_CNID 6 /* ALLOCation file (HFS+) */ +#define HFSPLUS_ALLOC_CNID HFS_ALLOC_CNID +#define HFS_START_CNID 7 /* 
STARTup file (HFS+) */ +#define HFSPLUS_START_CNID HFS_START_CNID +#define HFS_ATTR_CNID 8 /* ATTRibutes file (HFS+) */ +#define HFSPLUS_ATTR_CNID HFS_ATTR_CNID +#define HFS_EXCH_CNID 15 /* ExchangeFiles temp id */ +#define HFSPLUS_EXCH_CNID HFS_EXCH_CNID +#define HFS_FIRSTUSER_CNID 16 /* first available user id */ +#define HFSPLUS_FIRSTUSER_CNID HFS_FIRSTUSER_CNID + +/*======== HFS/HFS+ structures as they appear on the disk ========*/ + +typedef __be32 hfsplus_cnid; +typedef __be16 hfsplus_unichr; + +/* Pascal-style string of up to 31 characters */ +struct hfs_name { + u8 len; + u8 name[HFS_NAMELEN]; +} __packed; + +/* A "string" as used in filenames, etc. */ +struct hfsplus_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_MAX_STRLEN]; +} __packed; + +/* + * A "string" is used in attributes file + * for name of extended attribute + */ +struct hfsplus_attr_unistr { + __be16 length; + hfsplus_unichr unicode[HFSPLUS_ATTR_MAX_STRLEN]; +} __packed; + +struct hfs_extent { + __be16 block; + __be16 count; +}; +typedef struct hfs_extent hfs_extent_rec[3]; + +/* A single contiguous area of a file */ +struct hfsplus_extent { + __be32 start_block; + __be32 block_count; +} __packed; +typedef struct hfsplus_extent hfsplus_extent_rec[8]; + +/* Information for a "Fork" in a file */ +struct hfsplus_fork_raw { + __be64 total_size; + __be32 clump_size; + __be32 total_blocks; + hfsplus_extent_rec extents; +} __packed; + +struct hfs_mdb { + __be16 drSigWord; /* Signature word indicating fs type */ + __be32 drCrDate; /* fs creation date/time */ + __be32 drLsMod; /* fs modification date/time */ + __be16 drAtrb; /* fs attributes */ + __be16 drNmFls; /* number of files in root directory */ + __be16 drVBMSt; /* location (in 512-byte blocks) + of the volume bitmap */ + __be16 drAllocPtr; /* location (in allocation blocks) + to begin next allocation search */ + __be16 drNmAlBlks; /* number of allocation blocks */ + __be32 drAlBlkSiz; /* bytes in an allocation block */ + __be32 
drClpSiz; /* clumpsize, the number of bytes to + allocate when extending a file */ + __be16 drAlBlSt; /* location (in 512-byte blocks) + of the first allocation block */ + __be32 drNxtCNID; /* CNID to assign to the next + file or directory created */ + __be16 drFreeBks; /* number of free allocation blocks */ + u8 drVN[28]; /* the volume label */ + __be32 drVolBkUp; /* fs backup date/time */ + __be16 drVSeqNum; /* backup sequence number */ + __be32 drWrCnt; /* fs write count */ + __be32 drXTClpSiz; /* clumpsize for the extents B-tree */ + __be32 drCTClpSiz; /* clumpsize for the catalog B-tree */ + __be16 drNmRtDirs; /* number of directories in + the root directory */ + __be32 drFilCnt; /* number of files in the fs */ + __be32 drDirCnt; /* number of directories in the fs */ + u8 drFndrInfo[32]; /* data used by the Finder */ + __be16 drEmbedSigWord; /* embedded volume signature */ + __be32 drEmbedExtent; /* starting block number (xdrStABN) + and number of allocation blocks + (xdrNumABlks) occupied by embedded + volume */ + __be32 drXTFlSize; /* bytes in the extents B-tree */ + hfs_extent_rec drXTExtRec; /* extents B-tree's first 3 extents */ + __be32 drCTFlSize; /* bytes in the catalog B-tree */ + hfs_extent_rec drCTExtRec; /* catalog B-tree's first 3 extents */ +} __packed; + +/* HFS+ Volume Header */ +struct hfsplus_vh { + __be16 signature; + __be16 version; + __be32 attributes; + __be32 last_mount_vers; + u32 reserved; + + __be32 create_date; + __be32 modify_date; + __be32 backup_date; + __be32 checked_date; + + __be32 file_count; + __be32 folder_count; + + __be32 blocksize; + __be32 total_blocks; + __be32 free_blocks; + + __be32 next_alloc; + __be32 rsrc_clump_sz; + __be32 data_clump_sz; + hfsplus_cnid next_cnid; + + __be32 write_count; + __be64 encodings_bmp; + + u32 finder_info[8]; + + struct hfsplus_fork_raw alloc_file; + struct hfsplus_fork_raw ext_file; + struct hfsplus_fork_raw cat_file; + struct hfsplus_fork_raw attr_file; + struct hfsplus_fork_raw 
start_file; +} __packed; + +/* HFS+ volume attributes */ +#define HFSPLUS_VOL_UNMNT (1 << 8) +#define HFSPLUS_VOL_SPARE_BLK (1 << 9) +#define HFSPLUS_VOL_NOCACHE (1 << 10) +#define HFSPLUS_VOL_INCNSTNT (1 << 11) +#define HFSPLUS_VOL_NODEID_REUSED (1 << 12) +#define HFSPLUS_VOL_JOURNALED (1 << 13) +#define HFSPLUS_VOL_SOFTLOCK (1 << 15) +#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31) + +struct hfs_point { + __be16 v; + __be16 h; +} __packed; + +typedef struct hfs_point hfsp_point; + +struct hfs_rect { + __be16 top; + __be16 left; + __be16 bottom; + __be16 right; +} __packed; + +typedef struct hfs_rect hfsp_rect; + +struct hfs_finfo { + __be32 fdType; + __be32 fdCreator; + __be16 fdFlags; + struct hfs_point fdLocation; + __be16 fdFldr; +} __packed; + +typedef struct hfs_finfo FInfo; + +struct hfs_fxinfo { + __be16 fdIconID; + u8 fdUnused[8]; + __be16 fdComment; + __be32 fdPutAway; +} __packed; + +typedef struct hfs_fxinfo FXInfo; + +struct hfs_dinfo { + struct hfs_rect frRect; + __be16 frFlags; + struct hfs_point frLocation; + __be16 frView; +} __packed; + +typedef struct hfs_dinfo DInfo; + +struct hfs_dxinfo { + struct hfs_point frScroll; + __be32 frOpenChain; + __be16 frUnused; + __be16 frComment; + __be32 frPutAway; +} __packed; + +typedef struct hfs_dxinfo DXInfo; + +union hfs_finder_info { + struct { + struct hfs_finfo finfo; + struct hfs_fxinfo fxinfo; + } file; + struct { + struct hfs_dinfo dinfo; + struct hfs_dxinfo dxinfo; + } dir; +} __packed; + +/* The key used in the catalog b-tree: */ +struct hfs_cat_key { + u8 key_len; /* number of bytes in the key */ + u8 reserved; /* padding */ + __be32 ParID; /* CNID of the parent dir */ + struct hfs_name CName; /* The filename of the entry */ +} __packed; + +/* HFS+ catalog entry key */ +struct hfsplus_cat_key { + __be16 key_len; + hfsplus_cnid parent; + struct hfsplus_unistr name; +} __packed; + +#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key)) + +/* The key used in the extents b-tree: */ +struct 
hfs_ext_key { + u8 key_len; /* number of bytes in the key */ + u8 FkType; /* HFS_FK_{DATA,RSRC} */ + __be32 FNum; /* The File ID of the file */ + __be16 FABN; /* allocation blocks number*/ +} __packed; + +/* HFS+ extents tree key */ +struct hfsplus_ext_key { + __be16 key_len; + u8 fork_type; + u8 pad; + hfsplus_cnid cnid; + __be32 start_block; +} __packed; + +#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key) + +typedef union hfs_btree_key { + u8 key_len; /* number of bytes in the key */ + struct hfs_cat_key cat; + struct hfs_ext_key ext; +} hfs_btree_key; + +#define HFS_MAX_CAT_KEYLEN (sizeof(struct hfs_cat_key) - sizeof(u8)) +#define HFS_MAX_EXT_KEYLEN (sizeof(struct hfs_ext_key) - sizeof(u8)) + +typedef union hfs_btree_key btree_key; + +/* The catalog record for a file */ +struct hfs_cat_file { + s8 type; /* The type of entry */ + u8 reserved; + u8 Flags; /* Flags such as read-only */ + s8 Typ; /* file version number = 0 */ + struct hfs_finfo UsrWds; /* data used by the Finder */ + __be32 FlNum; /* The CNID */ + __be16 StBlk; /* obsolete */ + __be32 LgLen; /* The logical EOF of the data fork*/ + __be32 PyLen; /* The physical EOF of the data fork */ + __be16 RStBlk; /* obsolete */ + __be32 RLgLen; /* The logical EOF of the rsrc fork */ + __be32 RPyLen; /* The physical EOF of the rsrc fork */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modified date */ + __be32 BkDat; /* The last backup date */ + struct hfs_fxinfo FndrInfo; /* more data for the Finder */ + __be16 ClpSize; /* number of bytes to allocate + when extending files */ + hfs_extent_rec ExtRec; /* first extent record + for the data fork */ + hfs_extent_rec RExtRec; /* first extent record + for the resource fork */ + u32 Resrv; /* reserved by Apple */ +} __packed; + +/* the catalog record for a directory */ +struct hfs_cat_dir { + s8 type; /* The type of entry */ + u8 reserved; + __be16 Flags; /* flags */ + __be16 Val; /* Valence: number of files and + dirs in the directory */ + 
__be32 DirID; /* The CNID */ + __be32 CrDat; /* The creation date */ + __be32 MdDat; /* The modification date */ + __be32 BkDat; /* The last backup date */ + struct hfs_dinfo UsrInfo; /* data used by the Finder */ + struct hfs_dxinfo FndrInfo; /* more data used by Finder */ + u8 Resrv[16]; /* reserved by Apple */ +} __packed; + +/* the catalog record for a thread */ +struct hfs_cat_thread { + s8 type; /* The type of entry */ + u8 reserved[9]; /* reserved by Apple */ + __be32 ParID; /* CNID of parent directory */ + struct hfs_name CName; /* The name of this entry */ +} __packed; + +/* A catalog tree record */ +typedef union hfs_cat_rec { + s8 type; /* The type of entry */ + struct hfs_cat_file file; + struct hfs_cat_dir dir; + struct hfs_cat_thread thread; +} hfs_cat_rec; + +/* POSIX permissions */ +struct hfsplus_perm { + __be32 owner; + __be32 group; + u8 rootflags; + u8 userflags; + __be16 mode; + __be32 dev; +} __packed; + +#define HFSPLUS_FLG_NODUMP 0x01 +#define HFSPLUS_FLG_IMMUTABLE 0x02 +#define HFSPLUS_FLG_APPEND 0x04 + +/* HFS/HFS+ BTree node descriptor */ +struct hfs_bnode_desc { + __be32 next; /* (V) Number of the next node at this level */ + __be32 prev; /* (V) Number of the prev node at this level */ + u8 type; /* (F) The type of node */ + u8 height; /* (F) The level of this node (leaves=1) */ + __be16 num_recs; /* (V) The number of records in this node */ + u16 reserved; +} __packed; + +/* HFS/HFS+ BTree node types */ +#define HFS_NODE_INDEX 0x00 /* An internal (index) node */ +#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */ +#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */ +#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */ + +/* HFS/HFS+ BTree header */ +struct hfs_btree_header_rec { + __be16 depth; /* (V) The number of levels in this B-tree */ + __be32 root; /* (V) The node number of the root node */ + __be32 leaf_count; /* (V) The number of leaf records */ + __be32 leaf_head; /* (V) The number of the 
first leaf node */ + __be32 leaf_tail; /* (V) The number of the last leaf node */ + __be16 node_size; /* (F) The number of bytes in a node (=512) */ + __be16 max_key_len; /* (F) The length of a key in an index node */ + __be32 node_count; /* (V) The total number of nodes */ + __be32 free_nodes; /* (V) The number of unused nodes */ + u16 reserved1; + __be32 clump_size; /* (F) clump size. not usually used. */ + u8 btree_type; /* (F) BTree type */ + u8 key_type; + __be32 attributes; /* (F) attributes */ + u32 reserved3[16]; +} __packed; + +/* BTree attributes */ +#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not + used by hfsplus. */ +#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8. + used by hfsplus. */ +#define HFS_TREE_VARIDXKEYS 0x00000004 /* variable key length instead of + max key length. used in catalog + b-tree but not in extents + b-tree (hfsplus). */ + +/* HFS+ BTree misc info */ +#define HFSPLUS_TREE_HEAD 0 +#define HFSPLUS_NODE_MXSZ 32768 +#define HFSPLUS_ATTR_TREE_NODE_SIZE 8192 +#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3 +#define HFSPLUS_BTREE_HDR_USER_BYTES 128 + +/* btree key type */ +#define HFSPLUS_KEY_CASEFOLDING 0xCF /* case-insensitive */ +#define HFSPLUS_KEY_BINARY 0xBC /* case-sensitive */ + +/* HFS+ folder data (part of an hfsplus_cat_entry) */ +struct hfsplus_cat_folder { + __be16 type; + __be16 flags; + __be32 valence; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct_group_attr(info, __packed, + DInfo user_info; + DXInfo finder_info; + ); + __be32 text_encoding; + __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+.
*/ +} __packed; + +/* HFS+ file data (part of a cat_entry) */ +struct hfsplus_cat_file { + __be16 type; + __be16 flags; + u32 reserved1; + hfsplus_cnid id; + __be32 create_date; + __be32 content_mod_date; + __be32 attribute_mod_date; + __be32 access_date; + __be32 backup_date; + struct hfsplus_perm permissions; + struct_group_attr(info, __packed, + FInfo user_info; + FXInfo finder_info; + ); + __be32 text_encoding; + u32 reserved2; + + struct hfsplus_fork_raw data_fork; + struct hfsplus_fork_raw rsrc_fork; +} __packed; + +/* File and folder flag bits */ +#define HFSPLUS_FILE_LOCKED 0x0001 +#define HFSPLUS_FILE_THREAD_EXISTS 0x0002 +#define HFSPLUS_XATTR_EXISTS 0x0004 +#define HFSPLUS_ACL_EXISTS 0x0008 +#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count + * (HFSX only) */ + +/* HFS+ catalog thread (part of a cat_entry) */ +struct hfsplus_cat_thread { + __be16 type; + s16 reserved; + hfsplus_cnid parentID; + struct hfsplus_unistr nodeName; +} __packed; + +#define HFSPLUS_MIN_THREAD_SZ 10 + +/* A data record in the catalog tree */ +typedef union { + __be16 type; + struct hfsplus_cat_folder folder; + struct hfsplus_cat_file file; + struct hfsplus_cat_thread thread; +} __packed hfsplus_cat_entry; + +/* HFS+ catalog entry type */ +#define HFSPLUS_FOLDER 0x0001 +#define HFSPLUS_FILE 0x0002 +#define HFSPLUS_FOLDER_THREAD 0x0003 +#define HFSPLUS_FILE_THREAD 0x0004 + +#define HFSPLUS_XATTR_FINDER_INFO_NAME "com.apple.FinderInfo" +#define HFSPLUS_XATTR_ACL_NAME "com.apple.system.Security" + +#define HFSPLUS_ATTR_INLINE_DATA 0x10 +#define HFSPLUS_ATTR_FORK_DATA 0x20 +#define HFSPLUS_ATTR_EXTENTS 0x30 + +/* HFS+ attributes tree key */ +struct hfsplus_attr_key { + __be16 key_len; + __be16 pad; + hfsplus_cnid cnid; + __be32 start_block; + struct hfsplus_attr_unistr key_name; +} __packed; + +#define HFSPLUS_ATTR_KEYLEN sizeof(struct hfsplus_attr_key) + +/* HFS+ fork data attribute */ +struct hfsplus_attr_fork_data { + __be32 record_type; + __be32 reserved; + 
struct hfsplus_fork_raw the_fork; +} __packed; + +/* HFS+ extension attribute */ +struct hfsplus_attr_extents { + __be32 record_type; + __be32 reserved; + struct hfsplus_extent extents; +} __packed; + +#define HFSPLUS_MAX_INLINE_DATA_SIZE 3802 + +/* HFS+ attribute inline data */ +struct hfsplus_attr_inline_data { + __be32 record_type; + __be32 reserved1; + u8 reserved2[6]; + __be16 length; + u8 raw_bytes[HFSPLUS_MAX_INLINE_DATA_SIZE]; +} __packed; + +/* A data record in the attributes tree */ +typedef union { + __be32 record_type; + struct hfsplus_attr_fork_data fork_data; + struct hfsplus_attr_extents extents; + struct hfsplus_attr_inline_data inline_data; +} __packed hfsplus_attr_entry; + +/* HFS+ generic BTree key */ +typedef union { + __be16 key_len; + struct hfsplus_cat_key cat; + struct hfsplus_ext_key ext; + struct hfsplus_attr_key attr; +} __packed hfsplus_btree_key; + #endif /* _HFS_COMMON_H_ */ -- cgit v1.2.3 From 8f6ddc0587606c4be7ffcbdb20a4a99647e0c362 Mon Sep 17 00:00:00 2001 From: Leon Hwang Date: Tue, 25 Nov 2025 22:58:50 +0800 Subject: bpf: Introduce internal bpf_map_check_op_flags helper function It is to unify map flags checking for lookup_elem, update_elem, lookup_batch and update_batch APIs. 
Acked-by: Andrii Nakryiko Signed-off-by: Leon Hwang Link: https://lore.kernel.org/r/20251125145857.98134-2-leon.hwang@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9b788c7b4aa..6498be4c44f8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -3829,4 +3829,15 @@ bpf_prog_update_insn_ptrs(struct bpf_prog *prog, u32 *offsets, void *image) } #endif +static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 allowed_flags) +{ + if (flags & ~allowed_flags) + return -EINVAL; + + if ((flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)) + return -EINVAL; + + return 0; +} + #endif /* _LINUX_BPF_H */ -- cgit v1.2.3 From 68e83f3472667aac18d577587102f4bf77d0bd06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Thu, 20 Nov 2025 17:44:27 +0000 Subject: tools: ynl-gen: add regeneration comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a comment on regeneration to the generated files. The comment is placed after the YNL-GEN line[1], as to not interfere with ynl-regen.sh's detection logic. [1] and after the optional YNL-ARG line. Link: https://lore.kernel.org/r/aR5m174O7pklKrMR@zx2c4.com/ Suggested-by: Jason A. 
Donenfeld Signed-off-by: Asbjørn Sloth Tønnesen Acked-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20251120174429.390574-3-ast@fiberby.net Signed-off-by: Jakub Kicinski --- include/uapi/linux/android/binder_netlink.h | 1 + include/uapi/linux/dpll.h | 1 + include/uapi/linux/ethtool_netlink_generated.h | 1 + include/uapi/linux/fou.h | 1 + include/uapi/linux/handshake.h | 1 + include/uapi/linux/if_team.h | 1 + include/uapi/linux/lockd_netlink.h | 1 + include/uapi/linux/mptcp_pm.h | 1 + include/uapi/linux/net_shaper.h | 1 + include/uapi/linux/netdev.h | 1 + include/uapi/linux/nfsd_netlink.h | 1 + include/uapi/linux/ovpn.h | 1 + include/uapi/linux/psp.h | 1 + 13 files changed, 13 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/android/binder_netlink.h b/include/uapi/linux/android/binder_netlink.h index b218f96d6668..bf69833c9a19 100644 --- a/include/uapi/linux/android/binder_netlink.h +++ b/include/uapi/linux/android/binder_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/binder.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_ANDROID_BINDER_NETLINK_H #define _UAPI_LINUX_ANDROID_BINDER_NETLINK_H diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 69d35570ac4f..b7ff9c44f9aa 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/dpll.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_DPLL_H #define _UAPI_LINUX_DPLL_H diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index b71b175df46d..556a0c834df5 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* 
Documentation/netlink/specs/ethtool.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H #define _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H diff --git a/include/uapi/linux/fou.h b/include/uapi/linux/fou.h index b5cd3e7b3775..bb6bef74d2d1 100644 --- a/include/uapi/linux/fou.h +++ b/include/uapi/linux/fou.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_FOU_H #define _UAPI_LINUX_FOU_H diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 662e7de46c54..d7e40f594888 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_HANDSHAKE_H #define _UAPI_LINUX_HANDSHAKE_H diff --git a/include/uapi/linux/if_team.h b/include/uapi/linux/if_team.h index a5c06243a435..f4cd839ae725 100644 --- a/include/uapi/linux/if_team.h +++ b/include/uapi/linux/if_team.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/team.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_IF_TEAM_H #define _UAPI_LINUX_IF_TEAM_H diff --git a/include/uapi/linux/lockd_netlink.h b/include/uapi/linux/lockd_netlink.h index 21c65aec3bc6..2d766a0fa6ea 100644 --- a/include/uapi/linux/lockd_netlink.h +++ b/include/uapi/linux/lockd_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/lockd.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_LOCKD_NETLINK_H #define _UAPI_LINUX_LOCKD_NETLINK_H diff --git 
a/include/uapi/linux/mptcp_pm.h b/include/uapi/linux/mptcp_pm.h index bf44a5cf5b5a..c97d060ee90b 100644 --- a/include/uapi/linux/mptcp_pm.h +++ b/include/uapi/linux/mptcp_pm.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_MPTCP_PM_H #define _UAPI_LINUX_MPTCP_PM_H diff --git a/include/uapi/linux/net_shaper.h b/include/uapi/linux/net_shaper.h index d8834b59f7d7..3dd22c2930d9 100644 --- a/include/uapi/linux/net_shaper.h +++ b/include/uapi/linux/net_shaper.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NET_SHAPER_H #define _UAPI_LINUX_NET_SHAPER_H diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 048c8de1a130..e0b579a1df4f 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NETDEV_H #define _UAPI_LINUX_NETDEV_H diff --git a/include/uapi/linux/nfsd_netlink.h b/include/uapi/linux/nfsd_netlink.h index 887cbd12b695..e157e2009ea8 100644 --- a/include/uapi/linux/nfsd_netlink.h +++ b/include/uapi/linux/nfsd_netlink.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/nfsd.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_NFSD_NETLINK_H #define _UAPI_LINUX_NFSD_NETLINK_H diff --git a/include/uapi/linux/ovpn.h b/include/uapi/linux/ovpn.h index 680d1522dc87..959b41def61f 100644 --- a/include/uapi/linux/ovpn.h +++ b/include/uapi/linux/ovpn.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated 
from: */ /* Documentation/netlink/specs/ovpn.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_OVPN_H #define _UAPI_LINUX_OVPN_H diff --git a/include/uapi/linux/psp.h b/include/uapi/linux/psp.h index d8449c043ba1..a3a336488dc3 100644 --- a/include/uapi/linux/psp.h +++ b/include/uapi/linux/psp.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/psp.yaml */ /* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _UAPI_LINUX_PSP_H #define _UAPI_LINUX_PSP_H -- cgit v1.2.3 From 3a6e8fd0bf4042c572dc52e634878b9aca02970d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Nov 2025 17:50:10 +0000 Subject: tcp: rename icsk_timeout() to tcp_timeout_expires() In preparation of sk->tcp_timeout_timer introduction, rename icsk_timeout() helper and change its argument to plain 'const struct sock *sk'. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251124175013.1473655-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index fd40af2221b9..765c2149d678 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -184,10 +184,9 @@ static inline void inet_csk_delack_init(struct sock *sk) memset(&inet_csk(sk)->icsk_ack, 0, sizeof(inet_csk(sk)->icsk_ack)); } -static inline unsigned long -icsk_timeout(const struct inet_connection_sock *icsk) +static inline unsigned long tcp_timeout_expires(const struct sock *sk) { - return READ_ONCE(icsk->icsk_retransmit_timer.expires); + return READ_ONCE(inet_csk(sk)->icsk_retransmit_timer.expires); } static inline unsigned long -- cgit v1.2.3 From 27e8257a86516682e2ec5d7543a8909c37ae8b00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: 
Mon, 24 Nov 2025 17:50:11 +0000 Subject: net: move sk_dst_pending_confirm and sk_pacing_status to sock_read_tx group These two fields are mostly read in TCP tx path, move them into a more appropriate group for better cache locality. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251124175013.1473655-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 38d48cfe0741..a89aa97151f5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -481,8 +481,6 @@ struct sock { struct rb_root tcp_rtx_queue; }; struct sk_buff_head sk_write_queue; - u32 sk_dst_pending_confirm; - u32 sk_pacing_status; /* see enum sk_pacing */ struct page_frag sk_frag; struct timer_list sk_timer; @@ -493,6 +491,8 @@ struct sock { __cacheline_group_end(sock_write_tx); __cacheline_group_begin(sock_read_tx); + u32 sk_dst_pending_confirm; + u32 sk_pacing_status; /* see enum sk_pacing */ unsigned long sk_max_pacing_rate; long sk_sndtimeo; u32 sk_priority; -- cgit v1.2.3 From 08dfe370239e53494453cee1e2ded2cdaa1efd12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Nov 2025 17:50:12 +0000 Subject: tcp: introduce icsk->icsk_keepalive_timer sk->sk_timer has been used for TCP keepalives. Keepalive timers are not in fast path, we want to use sk->sk_timer storage for retransmit timers, for better cache locality. Create icsk->icsk_keepalive_timer and change keepalive code to no longer use sk->sk_timer. Added space is reclaimed in the following patch. This includes changes to MPTCP, which was also using sk_timer. Alias icsk->mptcp_tout_timer and icsk->icsk_keepalive_timer for inet_sk_diag_fill() sake. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251124175013.1473655-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 765c2149d678..e0d90b996348 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -57,6 +57,9 @@ struct inet_connection_sock_af_ops { * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table * @icsk_retransmit_timer: Resend (no ack) + * @icsk_delack_timer: Delayed ACK timer + * @icsk_keepalive_timer: Keepalive timer + * @mptcp_tout_timer: mptcp timer * @icsk_rto: Retransmit timeout * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook @@ -81,8 +84,12 @@ struct inet_connection_sock { struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; - struct timer_list icsk_retransmit_timer; - struct timer_list icsk_delack_timer; + struct timer_list icsk_retransmit_timer; + struct timer_list icsk_delack_timer; + union { + struct timer_list icsk_keepalive_timer; + struct timer_list mptcp_tout_timer; + }; __u32 icsk_rto; __u32 icsk_rto_min; u32 icsk_rto_max; -- cgit v1.2.3 From 9a5e5334adc039fa652aa071ea95b18db0bc1f43 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 24 Nov 2025 17:50:13 +0000 Subject: tcp: remove icsk->icsk_retransmit_timer Now sk->sk_timer is no longer used by TCP keepalive, we can use its storage for TCP and MPTCP retransmit timers for better cache locality. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20251124175013.1473655-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 8 +++----- include/net/sock.h | 9 +++++++-- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index e0d90b996348..ecb362025c4e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -56,7 +56,6 @@ struct inet_connection_sock_af_ops { * @icsk_accept_queue: FIFO of established children * @icsk_bind_hash: Bind node * @icsk_bind2_hash: Bind node in the bhash2 table - * @icsk_retransmit_timer: Resend (no ack) * @icsk_delack_timer: Delayed ACK timer * @icsk_keepalive_timer: Keepalive timer * @mptcp_tout_timer: mptcp timer @@ -84,7 +83,6 @@ struct inet_connection_sock { struct request_sock_queue icsk_accept_queue; struct inet_bind_bucket *icsk_bind_hash; struct inet_bind2_bucket *icsk_bind2_hash; - struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; union { struct timer_list icsk_keepalive_timer; @@ -193,7 +191,7 @@ static inline void inet_csk_delack_init(struct sock *sk) static inline unsigned long tcp_timeout_expires(const struct sock *sk) { - return READ_ONCE(inet_csk(sk)->icsk_retransmit_timer.expires); + return READ_ONCE(sk->tcp_retransmit_timer.expires); } static inline unsigned long @@ -209,7 +207,7 @@ static inline void inet_csk_clear_xmit_timer(struct sock *sk, const int what) if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { smp_store_release(&icsk->icsk_pending, 0); #ifdef INET_CSK_CLEAR_TIMERS - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->tcp_retransmit_timer); #endif } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, 0); @@ -241,7 +239,7 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, if 
(what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || what == ICSK_TIME_LOSS_PROBE || what == ICSK_TIME_REO_TIMEOUT) { smp_store_release(&icsk->icsk_pending, what); - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, when); + sk_reset_timer(sk, &sk->tcp_retransmit_timer, when); } else if (what == ICSK_TIME_DACK) { smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_TIMER); diff --git a/include/net/sock.h b/include/net/sock.h index a89aa97151f5..02253c6a578b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -305,6 +305,8 @@ struct sk_filter; * @sk_txrehash: enable TX hash rethink * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer + * @tcp_retransmit_timer: tcp retransmit timer + * @mptcp_retransmit_timer: mptcp retransmit timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags @@ -482,8 +484,11 @@ struct sock { }; struct sk_buff_head sk_write_queue; struct page_frag sk_frag; - struct timer_list sk_timer; - + union { + struct timer_list sk_timer; + struct timer_list tcp_retransmit_timer; + struct timer_list mptcp_retransmit_timer; + }; unsigned long sk_pacing_rate; /* bytes per second */ atomic_t sk_zckey; atomic_t sk_tskey; -- cgit v1.2.3 From 585a4f22c4f9d85e32d42be65e67c232e82e5b3a Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:03 +0100 Subject: can: bittiming: apply NL_SET_ERR_MSG() to can_calc_bittiming() When CONFIG_CAN_CALC_BITTIMING is disabled, the can_calc_bittiming() functions can not be used and the user needs to provide all the bittiming parameters. Currently, can_calc_bittiming() prints an error message to the kernel log. Instead use NL_SET_ERR_MSG() to make it return the error message through the netlink interface so that the user can directly see it. 
Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-2-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index d30816dd93c7..3926c78b2222 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -141,7 +141,7 @@ static inline int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, const struct can_bittiming_const *btc, struct netlink_ext_ack *extack) { - netdev_err(dev, "bit-timing calculation not available\n"); + NL_SET_ERR_MSG(extack, "bit-timing calculation not available\n"); return -EINVAL; } -- cgit v1.2.3 From d037d05c2e32792a6fa572b0aa3c92a8ac78589d Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:04 +0100 Subject: can: dev: can_dev_dropped_skb: drop CAN FD skbs if FD is off Currently, the CAN FD skb validation logic is based on the MTU: the interface is deemed FD capable if and only if its MTU is greater or equal to CANFD_MTU. This logic is showing its limit with the introduction of CAN XL. For example, consider the two scenarios below: 1. An interface configured with CAN FD on and CAN XL on 2. An interface configured with CAN FD off and CAN XL on In those two scenarios, the interfaces would have the same MTU: CANXL_MTU making it impossible to differentiate which one has CAN FD turned on and which one has it off. Because of the limitation, the only non-UAPI-breaking workaround is to do the check at the device level using the can_priv->ctrlmode flags. Unfortunately, the virtual interfaces (vcan, vxcan), which do not have a can_priv, are left behind. Add a check on the CAN_CTRLMODE_FD flag in can_dev_dropped_skb() and drop FD frames whenever the feature is turned off. 
Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-3-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index bd7410b5d8a6..a7a39a6101d9 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -103,12 +103,20 @@ static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *s if (priv->ctrlmode & CAN_CTRLMODE_LISTENONLY) { netdev_info_once(dev, "interface in listen only mode, dropping skb\n"); - kfree_skb(skb); - dev->stats.tx_dropped++; - return true; + goto invalid_skb; + } + + if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) { + netdev_info_once(dev, "CAN FD is disabled, dropping skb\n"); + goto invalid_skb; } return can_dropped_invalid_skb(dev, skb); + +invalid_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return true; } void can_setup(struct net_device *dev); -- cgit v1.2.3 From 60f511f443e552ef5b5cd79ec2b881f4323e19c9 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:05 +0100 Subject: can: netlink: add CAN_CTRLMODE_RESTRICTED ISO 11898-1:2024 adds a new restricted operation mode. This mode is added as a mandatory feature for nodes which support CAN XL and is retrofitted as optional for legacy nodes (i.e. the ones which only support Classical CAN and CAN FD). The restricted operation mode is nearly the same as the listen only mode: the node can not send data frames or remote frames and can not send dominant bits if an error occurs. The only exception is that the node shall still send the acknowledgment bit. 
A second niche exception is that the node may still send a data frame containing a time reference message if the node is a primary time provider, but because the time provider feature is not yet implemented in the kernel, this second exception is not relevant to us at the moment. Add the CAN_CTRLMODE_RESTRICTED control mode flag and update the can_dev_dropped_skb() helper function accordingly. Finally, bail out if both CAN_CTRLMODE_LISTENONLY and CAN_CTRLMODE_RESTRICTED are provided. Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-4-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 50 +++++++++++++++++++++------------------- include/uapi/linux/can/netlink.h | 1 + 2 files changed, 27 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index a7a39a6101d9..ab11c0e9111b 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -95,30 +95,6 @@ static inline bool can_is_canxl_dev_mtu(unsigned int mtu) return (mtu >= CANXL_MIN_MTU && mtu <= CANXL_MAX_MTU); } -/* drop skb if it does not contain a valid CAN frame for sending */ -static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb) -{ - struct can_priv *priv = netdev_priv(dev); - - if (priv->ctrlmode & CAN_CTRLMODE_LISTENONLY) { - netdev_info_once(dev, - "interface in listen only mode, dropping skb\n"); - goto invalid_skb; - } - - if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) { - netdev_info_once(dev, "CAN FD is disabled, dropping skb\n"); - goto invalid_skb; - } - - return can_dropped_invalid_skb(dev, skb); - -invalid_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return true; -} - void can_setup(struct net_device *dev); struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, @@ -154,6 +130,32 @@ void can_bus_off(struct net_device *dev); const char 
*can_get_state_str(const enum can_state state); const char *can_get_ctrlmode_str(u32 ctrlmode); +/* drop skb if it does not contain a valid CAN frame for sending */ +static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb) +{ + struct can_priv *priv = netdev_priv(dev); + u32 silent_mode = priv->ctrlmode & (CAN_CTRLMODE_LISTENONLY | + CAN_CTRLMODE_RESTRICTED); + + if (silent_mode) { + netdev_info_once(dev, "interface in %s mode, dropping skb\n", + can_get_ctrlmode_str(silent_mode)); + goto invalid_skb; + } + + if (!(priv->ctrlmode & CAN_CTRLMODE_FD) && can_is_canfd_skb(skb)) { + netdev_info_once(dev, "CAN FD is disabled, dropping skb\n"); + goto invalid_skb; + } + + return can_dropped_invalid_skb(dev, skb); + +invalid_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return true; +} + void can_state_get_by_berr_counter(const struct net_device *dev, const struct can_berr_counter *bec, enum can_state *tx_state, diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index ef62f56eaaef..fafd1cce4798 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -103,6 +103,7 @@ struct can_ctrlmode { #define CAN_CTRLMODE_CC_LEN8_DLC 0x100 /* Classic CAN DLC option */ #define CAN_CTRLMODE_TDC_AUTO 0x200 /* FD transceiver automatically calculates TDCV */ #define CAN_CTRLMODE_TDC_MANUAL 0x400 /* FD TDCV is manually set up by user */ +#define CAN_CTRLMODE_RESTRICTED 0x800 /* Restricted operation mode */ /* * CAN device statistics -- cgit v1.2.3 From e63281614747c73f25b708c75bc696c4e76f5588 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:06 +0100 Subject: can: netlink: add initial CAN XL support CAN XL uses bittiming parameters different from Classical CAN and CAN FD. Thus, all the data bittiming parameters, including TDC, need to be duplicated for CAN XL. Add the CAN XL netlink interface for all the features which are common with CAN FD. 
Any new CAN XL specific features are added later on. The first time CAN XL is activated, the MTU is set by default to CANXL_MAX_MTU. The user may then configure a custom MTU within the CANXL_MIN_MTU to CANXL_MAX_MTU range, in which case, the custom MTU value will be kept as long as CAN XL remains active. Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-5-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 6 ++++-- include/linux/can/dev.h | 7 ++++++- include/uapi/linux/can/netlink.h | 7 +++++++ 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 3926c78b2222..b6cd2476ffd7 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -16,10 +16,12 @@ #define CAN_CTRLMODE_FD_TDC_MASK \ (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_TDC_MANUAL) +#define CAN_CTRLMODE_XL_TDC_MASK \ + (CAN_CTRLMODE_XL_TDC_AUTO | CAN_CTRLMODE_XL_TDC_MANUAL) #define CAN_CTRLMODE_TDC_AUTO_MASK \ - (CAN_CTRLMODE_TDC_AUTO) + (CAN_CTRLMODE_TDC_AUTO | CAN_CTRLMODE_XL_TDC_AUTO) #define CAN_CTRLMODE_TDC_MANUAL_MASK \ - (CAN_CTRLMODE_TDC_MANUAL) + (CAN_CTRLMODE_TDC_MANUAL | CAN_CTRLMODE_XL_TDC_MANUAL) /* * struct can_tdc - CAN FD Transmission Delay Compensation parameters diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index ab11c0e9111b..f15879bd818d 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -47,7 +47,7 @@ struct can_priv { const struct can_bittiming_const *bittiming_const; struct can_bittiming bittiming; - struct data_bittiming_params fd; + struct data_bittiming_params fd, xl; unsigned int bitrate_const_cnt; const u32 *bitrate_const; u32 bitrate_max; @@ -85,6 +85,11 @@ static inline bool can_fd_tdc_is_enabled(const struct can_priv *priv) return !!(priv->ctrlmode & CAN_CTRLMODE_FD_TDC_MASK); } +static inline bool 
can_xl_tdc_is_enabled(const struct can_priv *priv) +{ + return !!(priv->ctrlmode & CAN_CTRLMODE_XL_TDC_MASK); +} + static inline u32 can_get_static_ctrlmode(struct can_priv *priv) { return priv->ctrlmode & ~priv->ctrlmode_supported; diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index fafd1cce4798..c2c96c5978a8 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -104,6 +104,9 @@ struct can_ctrlmode { #define CAN_CTRLMODE_TDC_AUTO 0x200 /* FD transceiver automatically calculates TDCV */ #define CAN_CTRLMODE_TDC_MANUAL 0x400 /* FD TDCV is manually set up by user */ #define CAN_CTRLMODE_RESTRICTED 0x800 /* Restricted operation mode */ +#define CAN_CTRLMODE_XL 0x1000 /* CAN XL mode */ +#define CAN_CTRLMODE_XL_TDC_AUTO 0x2000 /* XL transceiver automatically calculates TDCV */ +#define CAN_CTRLMODE_XL_TDC_MANUAL 0x4000 /* XL TDCV is manually set up by user */ /* * CAN device statistics @@ -139,6 +142,10 @@ enum { IFLA_CAN_BITRATE_MAX, IFLA_CAN_TDC, /* FD */ IFLA_CAN_CTRLMODE_EXT, + IFLA_CAN_XL_DATA_BITTIMING, + IFLA_CAN_XL_DATA_BITTIMING_CONST, + IFLA_CAN_XL_DATA_BITRATE_CONST, + IFLA_CAN_XL_TDC, /* add new constants above here */ __IFLA_CAN_MAX, -- cgit v1.2.3 From 233134af208689c2d5d40896f5740473a74e3cb2 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:07 +0100 Subject: can: netlink: add CAN_CTRLMODE_XL_TMS flag The Transceiver Mode Switching (TMS) indicates whether the CAN XL controller shall use the PWM or NRZ encoding during the data phase. The term "transceiver mode switching" is used in both ISO 11898-1 and CiA 612-2 (although only the latter one uses the abbreviation TMS). We adopt the same naming convention here for consistency. Add the CAN_CTRLMODE_XL_TMS flag to the list of the CAN control modes. Add can_validate_xl_flags() to check the coherency of the TMS flag. That function will be reused in upcoming changes to validate the other CAN XL flags. 
Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-6-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index c2c96c5978a8..ebafb091d80f 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -107,6 +107,7 @@ struct can_ctrlmode { #define CAN_CTRLMODE_XL 0x1000 /* CAN XL mode */ #define CAN_CTRLMODE_XL_TDC_AUTO 0x2000 /* XL transceiver automatically calculates TDCV */ #define CAN_CTRLMODE_XL_TDC_MANUAL 0x4000 /* XL TDCV is manually set up by user */ +#define CAN_CTRLMODE_XL_TMS 0x8000 /* Transceiver Mode Switching */ /* * CAN device statistics -- cgit v1.2.3 From 6df01533e535d21cac779ff35cc25c43304035c3 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Wed, 26 Nov 2025 11:16:08 +0100 Subject: can: dev: can_dev_dropped_skb: drop CC/FD frames in CANXL-only mode The error-signalling (ES) is a mandatory functionality for CAN CC and CAN FD to report CAN frame format violations by sending an error-frame signal on the bus. A so-called 'mixed-mode' is intended to have (XL-tolerant) CAN FD nodes and CAN XL nodes on one CAN segment, where the FD-controllers can talk CC/FD and the XL-controllers can talk CC/FD/XL. This mixed-mode utilizes the error-signalling for sending CC/FD/XL frames. The CANXL-only mode disables the error-signalling in the CAN XL controller. This mode does not allow CC/FD frames to be sent but additionally offers a CAN XL transceiver mode switching (TMS). 
Configured with CAN_CTRLMODE_FD and CAN_CTRLMODE_XL this leads to: FD=0 XL=0 CC-only mode (ES=1) FD=1 XL=0 FD/CC mixed-mode (ES=1) FD=1 XL=1 XL/FD/CC mixed-mode (ES=1) FD=0 XL=1 XL-only mode (ES=0, TMS optional) The helper function can_dev_in_xl_only_mode() determines the required value to disable error signalling in the CAN XL controller. Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-7-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index f15879bd818d..52c8be5c160e 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -135,6 +135,19 @@ void can_bus_off(struct net_device *dev); const char *can_get_state_str(const enum can_state state); const char *can_get_ctrlmode_str(u32 ctrlmode); +static inline bool can_dev_in_xl_only_mode(struct can_priv *priv) +{ + const u32 mixed_mode = CAN_CTRLMODE_FD | CAN_CTRLMODE_XL; + + /* When CAN XL is enabled but FD is disabled we are running in + * the so-called 'CANXL-only mode' where the error signalling is + * disabled. This helper function determines the required value + * to disable error signalling in the CAN XL controller. + * The so-called CC/FD/XL 'mixed mode' requires error signalling. 
+ */ + return ((priv->ctrlmode & mixed_mode) == CAN_CTRLMODE_XL); +} + /* drop skb if it does not contain a valid CAN frame for sending */ static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *skb) { @@ -153,6 +166,12 @@ static inline bool can_dev_dropped_skb(struct net_device *dev, struct sk_buff *s goto invalid_skb; } + if (can_dev_in_xl_only_mode(priv) && !can_is_canxl_skb(skb)) { + netdev_info_once(dev, + "Error signaling is disabled, dropping skb\n"); + goto invalid_skb; + } + return can_dropped_invalid_skb(dev, skb); invalid_skb: -- cgit v1.2.3 From f6ccc2b293ba27e9171c63e456d9cba664fa2337 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:09 +0100 Subject: can: bittiming: add PWM parameters In CAN XL, higher data bit rates require the CAN transceiver to switch its operation mode to use Pulse-Width Modulation (PWM) transmission mode instead of the classic dominant/recessive transmission mode. The PWM parameters are: - PWMS: pulse width modulation short phase - PWML: pulse width modulation long phase - PWMO: pulse width modulation offset CiA 612-2 specifies PWMS and PWML to be at least 1 (arguably, PWML shall be at least 2 to respect the PWMS < PWML rule). PWMO's minimum is expected to always be zero. It is added more for consistency than anything else. Add struct can_pwm_const so that the different devices can provide their minimum and maximum values. When TMS is on, the runtime PWMS, PWML and PWMO are needed (either calculated or provided by the user): add struct can_pwm to store these. TDC and PWM can not be used at the same time (TDC can only be used when TMS is off and PWM only when TMS is on). struct can_pwm is thus put together with struct can_tdc inside a union to save some space. The netlink logic will be added in an upcoming change. 
Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-8-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index b6cd2476ffd7..967d76689c4f 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2020 Pengutronix, Marc Kleine-Budde - * Copyright (c) 2021 Vincent Mailhol + * Copyright (c) 2021-2025 Vincent Mailhol */ #ifndef _CAN_BITTIMING_H @@ -120,11 +120,48 @@ struct can_tdc_const { u32 tdcf_max; }; +/* + * struct can_pwm - CAN Pulse-Width Modulation (PWM) parameters + * + * @pwms: pulse width modulation short phase + * @pwml: pulse width modulation long phase + * @pwmo: pulse width modulation offset + */ +struct can_pwm { + u32 pwms; + u32 pwml; + u32 pwmo; +}; + +/* + * struct can_pwm - CAN hardware-dependent constants for Pulse-Width + * Modulation (PWM) + * + * @pwms_min: PWM short phase minimum value. Must be at least 1. + * @pwms_max: PWM short phase maximum value + * @pwml_min: PWM long phase minimum value. Must be at least 1. 
+ * @pwml_max: PWM long phase maximum value + * @pwmo_min: PWM offset phase minimum value + * @pwmo_max: PWM offset phase maximum value + */ +struct can_pwm_const { + u32 pwms_min; + u32 pwms_max; + u32 pwml_min; + u32 pwml_max; + u32 pwmo_min; + u32 pwmo_max; +}; + struct data_bittiming_params { const struct can_bittiming_const *data_bittiming_const; struct can_bittiming data_bittiming; const struct can_tdc_const *tdc_const; - struct can_tdc tdc; + const struct can_pwm_const *pwm_const; + union { + struct can_tdc tdc; + struct can_pwm pwm; + }; const u32 *data_bitrate_const; unsigned int data_bitrate_const_cnt; int (*do_set_data_bittiming)(struct net_device *dev); -- cgit v1.2.3 From 8e2a2885a2a6217190065d1aae98fe88a670cc28 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:10 +0100 Subject: can: bittiming: add PWM validation Add can_validate_pwm() to validate the values pwms, pwml and pwmo. Error messages are added to each of the checks to inform the user on what went wrong. Refer to those error messages to understand the validation logic. The boundary values CAN_PWM_DECODE_NS (the transceiver minimum decoding margin) and CAN_PWM_NS_MAX (the maximum PWM symbol duration) are hardcoded for the moment. Note that a transceiver capable of bitrates higher than 20 Mbps may be able to handle a CAN_PWM_DECODE_NS below 5 ns. If such transceivers become commercially available, this code could be revisited to make this parameter configurable. For now, leave it static.
Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-9-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 967d76689c4f..2504fafc72e4 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -87,6 +87,11 @@ struct can_tdc { u32 tdcf; }; +/* The transceiver decoding margin corresponds to t_Decode in ISO 11898-2 */ +#define CAN_PWM_DECODE_NS 5 +/* Maximum PWM symbol duration. Corresponds to t_SymbolNom_MAX - t_Decode */ +#define CAN_PWM_NS_MAX (205 - CAN_PWM_DECODE_NS) + /* * struct can_tdc_const - CAN hardware-dependent constant for * Transmission Delay Compensation @@ -203,6 +208,10 @@ int can_get_bittiming(const struct net_device *dev, struct can_bittiming *bt, const unsigned int bitrate_const_cnt, struct netlink_ext_ack *extack); +int can_validate_pwm_bittiming(const struct net_device *dev, + const struct can_pwm *pwm, + struct netlink_ext_ack *extack); + /* * can_get_relative_tdco() - TDCO relative to the sample point * @@ -245,4 +254,17 @@ static inline unsigned int can_bit_time(const struct can_bittiming *bt) return CAN_SYNC_SEG + bt->prop_seg + bt->phase_seg1 + bt->phase_seg2; } +/* Duration of one bit in minimum time quantum */ +static inline unsigned int can_bit_time_tqmin(const struct can_bittiming *bt) +{ + return can_bit_time(bt) * bt->brp; +} + +/* Convert a duration from minimum a minimum time quantum to nano seconds */ +static inline u32 can_tqmin_to_ns(u32 tqmin, u32 clock_freq) +{ + return DIV_U64_ROUND_CLOSEST(mul_u32_u32(tqmin, NSEC_PER_SEC), + clock_freq); +} + #endif /* !_CAN_BITTIMING_H */ -- cgit v1.2.3 From 9892339cf0348730e82383d4de9d9387b9d63925 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:11 +0100 Subject: can: 
calc_bittiming: add PWM calculation Perform the PWM calculation according to CiA recommendations. Note that for databitrates greater than 5 MBPS, tqmin is less than CAN_PWM_NS_MAX (which is defined to 200 nano seconds), consequently, the result of the division: DIV_ROUND_UP(xl_ns, CAN_PWM_NS_MAX) is one and thus the for loop automatically stops on the first iteration giving a single PWM symbol per bit as expected. Because of that, there is no actual need for a separate conditional branch for when the databitrate is greater than 5 MBPS. Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-10-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/linux/can/bittiming.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/can/bittiming.h b/include/linux/can/bittiming.h index 2504fafc72e4..726d909e87ce 100644 --- a/include/linux/can/bittiming.h +++ b/include/linux/can/bittiming.h @@ -180,6 +180,8 @@ int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, void can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, const struct can_bittiming *dbt, u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported); + +int can_calc_pwm(struct net_device *dev, struct netlink_ext_ack *extack); #else /* !CONFIG_CAN_CALC_BITTIMING */ static inline int can_calc_bittiming(const struct net_device *dev, struct can_bittiming *bt, @@ -195,6 +197,14 @@ can_calc_tdco(struct can_tdc *tdc, const struct can_tdc_const *tdc_const, u32 tdc_mask, u32 *ctrlmode, u32 ctrlmode_supported) { } + +static inline int +can_calc_pwm(struct net_device *dev, struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, + "bit-timing calculation not available: manually provide PWML and PWMS\n"); + return -EINVAL; +} #endif /* CONFIG_CAN_CALC_BITTIMING */ void can_sjw_set_default(struct can_bittiming *bt); -- cgit v1.2.3 From 
46552323fa6779beb1ea558254dfd56021174c93 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 26 Nov 2025 11:16:12 +0100 Subject: can: netlink: add PWM netlink interface When the TMS is switched on, the node uses PWM (Pulse Width Modulation) during the data phase instead of the classic NRZ (Non Return to Zero) encoding. PWM is configured by three parameters: - PWMS: Pulse Width Modulation Short phase - PWML: Pulse Width Modulation Long phase - PWMO: Pulse Width Modulation Offset time For each of these parameters, define three IFLA symbols: - IFLA_CAN_PWM_PWM*_MIN: the minimum allowed value. - IFLA_CAN_PWM_PWM*_MAX: the maximum allowed value. - IFLA_CAN_PWM_PWM*: the runtime value. This results in a total of nine IFLA symbols which are all nested in a parent IFLA_CAN_XL_PWM symbol. IFLA_CAN_PWM_PWM*_MIN and IFLA_CAN_PWM_PWM*_MAX define the range of allowed values and will match the value statically configured by the device in struct can_pwm_const. IFLA_CAN_PWM_PWM* match the runtime values stored in struct can_pwm. Those parameters may only be configured when the tms mode is on. If the PWMS, PWML and PWMO parameters are provided, check that all the needed parameters are present using can_validate_pwm(), then check their value using can_validate_pwm_bittiming(). PWMO defaults to zero if omitted. Otherwise, if CAN_CTRLMODE_XL_TMS is true but none of the PWM parameters are provided, calculate them using can_calc_pwm(). 
Signed-off-by: Vincent Mailhol Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251126-canxl-v8-11-e7e3eb74f889@pengutronix.de Signed-off-by: Marc Kleine-Budde --- include/uapi/linux/can/netlink.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/can/netlink.h b/include/uapi/linux/can/netlink.h index ebafb091d80f..c30d16746159 100644 --- a/include/uapi/linux/can/netlink.h +++ b/include/uapi/linux/can/netlink.h @@ -5,6 +5,7 @@ * Definitions for the CAN netlink interface * * Copyright (c) 2009 Wolfgang Grandegger + * Copyright (c) 2021-2025 Vincent Mailhol * * This program is free software; you can redistribute it and/or modify * it under the terms of the version 2 of the GNU General Public License @@ -147,6 +148,7 @@ enum { IFLA_CAN_XL_DATA_BITTIMING_CONST, IFLA_CAN_XL_DATA_BITRATE_CONST, IFLA_CAN_XL_TDC, + IFLA_CAN_XL_PWM, /* add new constants above here */ __IFLA_CAN_MAX, @@ -188,6 +190,29 @@ enum { IFLA_CAN_CTRLMODE_MAX = __IFLA_CAN_CTRLMODE - 1 }; +/* + * CAN FD/XL Pulse-Width Modulation (PWM) + * + * Please refer to struct can_pwm_const and can_pwm in + * include/linux/can/bittiming.h for further details. 
+ */ +enum { + IFLA_CAN_PWM_UNSPEC, + IFLA_CAN_PWM_PWMS_MIN, /* u32 */ + IFLA_CAN_PWM_PWMS_MAX, /* u32 */ + IFLA_CAN_PWM_PWML_MIN, /* u32 */ + IFLA_CAN_PWM_PWML_MAX, /* u32 */ + IFLA_CAN_PWM_PWMO_MIN, /* u32 */ + IFLA_CAN_PWM_PWMO_MAX, /* u32 */ + IFLA_CAN_PWM_PWMS, /* u32 */ + IFLA_CAN_PWM_PWML, /* u32 */ + IFLA_CAN_PWM_PWMO, /* u32 */ + + /* add new constants above here */ + __IFLA_CAN_PWM, + IFLA_CAN_PWM_MAX = __IFLA_CAN_PWM - 1 +}; + /* u16 termination range: 1..65535 Ohms */ #define CAN_TERMINATION_DISABLED 0 -- cgit v1.2.3 From 4e1da516debbe6a573ffa0392e2809d180d0575c Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Thu, 23 Oct 2025 14:28:18 +0100 Subject: comedi: Add reference counting for Comedi command handling For interrupts from badly behaved hardware (as emulated by Syzbot), it is possible for the Comedi core functions that manage the progress of asynchronous data acquisition to be called from driver ISRs while no asynchronous command has been set up, which can cause problems such as invalid pointer dereferencing or dividing by zero. To help protect against that, introduce new functions to maintain a reference counter for asynchronous commands that are being set up. `comedi_get_is_subdevice_running(s)` will check if a command has been set up on a subdevice and is still marked as running, and if so will increment the reference counter and return `true`, otherwise it will return `false` without modifying the reference counter. `comedi_put_is_subdevice_running(s)` will decrement the reference counter and set a completion event when decremented to 0. Change the `do_cmd_ioctl()` function (responsible for setting up the asynchronous command) to reinitialize the completion event and set the reference counter to 1 before it marks the subdevice as running. 
Change the `do_become_nonbusy()` function (responsible for destroying a completed command) to call `comedi_put_is_subdevice_running(s)` and wait for the completion event after marking the subdevice as not running. Because the subdevice normally gets marked as not running before the call to `do_become_nonbusy()` (and may also be called when the Comedi device is being detached from the low-level driver), add a new flag `COMEDI_SRF_BUSY` to the set of subdevice run-flags that indicates that an asynchronous command was set up and will need to be destroyed. This flag is set by `do_cmd_ioctl()` and cleared and checked by `do_become_nonbusy()`. Subsequent patches will change the Comedi core functions that are called from low-level drivers for asynchronous command handling to make use of the `comedi_get_is_subdevice_running()` and `comedi_put_is_subdevice_running()` functions, and will modify the ISRs of some of these low-level drivers if they dereference the subdevice's `async` pointer directly. Signed-off-by: Ian Abbott Link: https://patch.msgid.link/20251023133001.8439-2-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedidev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/comedi/comedidev.h b/include/linux/comedi/comedidev.h index 4cb0400ad616..35fdc41845ce 100644 --- a/include/linux/comedi/comedidev.h +++ b/include/linux/comedi/comedidev.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #define COMEDI_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c)) @@ -272,6 +273,8 @@ struct comedi_buf_map { * @events: Bit-vector of events that have occurred. * @cmd: Details of comedi command in progress. * @wait_head: Task wait queue for file reader or writer. + * @run_complete: "run complete" completion event. + * @run_active: "run active" reference counter. * @cb_mask: Bit-vector of events that should wake waiting tasks. * @inttrig: Software trigger function for command, or NULL.
* @@ -357,6 +360,8 @@ struct comedi_async { unsigned int events; struct comedi_cmd cmd; wait_queue_head_t wait_head; + struct completion run_complete; + refcount_t run_active; unsigned int cb_mask; int (*inttrig)(struct comedi_device *dev, struct comedi_subdevice *s, unsigned int x); @@ -584,6 +589,8 @@ struct comedi_device *comedi_dev_get_from_minor(unsigned int minor); int comedi_dev_put(struct comedi_device *dev); bool comedi_is_subdevice_running(struct comedi_subdevice *s); +bool comedi_get_is_subdevice_running(struct comedi_subdevice *s); +void comedi_put_is_subdevice_running(struct comedi_subdevice *s); void *comedi_alloc_spriv(struct comedi_subdevice *s, size_t size); void comedi_set_spriv_auto_free(struct comedi_subdevice *s); -- cgit v1.2.3 From d1b3b9c70e11cb4f40b4e41a4dc1503b9a3c0109 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 27 Oct 2025 15:25:02 +0000 Subject: comedi: kcomedilib: Add loop checking variants of open and close Add `comedi_open_from(path, from)` and `comedi_close_from(dev, from)` as variants of the existing `comedi_open(path)` and `comedi_close(dev)`. The additional `from` parameter is a minor device number that tells the function that the COMEDI device is being opened or closed from another COMEDI device if the value is in the range [0, `COMEDI_NUM_BOARD_MINORS`-1]. In that case the function will refuse to open the device if it would lead to a chain of devices opening each other. (It will also impose a limit on the number of simultaneous opens from one device to another because we need to count those.) The new functions are intended to be used by the "comedi_bond" driver, which is the only driver that uses the existing `comedi_open()` and `comedi_close()` functions. The new functions will be used to avoid some possible deadlock situations.
Replace the existing, exported `comedi_open()` and `comedi_close()` functions with inline wrapper functions that call the newly exported `comedi_open_from()` and `comedi_close_from()` functions. Signed-off-by: Ian Abbott Link: https://patch.msgid.link/20251027153748.4569-2-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- include/linux/comedi/comedilib.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/comedi/comedilib.h b/include/linux/comedi/comedilib.h index 0223c9cd9215..1f2b22b383cc 100644 --- a/include/linux/comedi/comedilib.h +++ b/include/linux/comedi/comedilib.h @@ -10,8 +10,38 @@ #ifndef _LINUX_COMEDILIB_H #define _LINUX_COMEDILIB_H -struct comedi_device *comedi_open(const char *path); -int comedi_close(struct comedi_device *dev); +struct comedi_device *comedi_open_from(const char *path, int from); + +/** + * comedi_open() - Open a COMEDI device from the kernel + * @filename: Fake pathname of the form "/dev/comediN". + * + * Converts @filename to a COMEDI device number and "opens" it if it exists + * and is attached to a low-level COMEDI driver. + * + * Return: A pointer to the COMEDI device on success. + * Return %NULL on failure. + */ +static inline struct comedi_device *comedi_open(const char *path) +{ + return comedi_open_from(path, -1); +} + +int comedi_close_from(struct comedi_device *dev, int from); + +/** + * comedi_close() - Close a COMEDI device from the kernel + * @dev: COMEDI device. + * + * Closes a COMEDI device previously opened by comedi_open(). 
+ * + * Returns: 0 + */ +static inline int comedi_close(struct comedi_device *dev) +{ + return comedi_close_from(dev, -1); +} + int comedi_dio_get_config(struct comedi_device *dev, unsigned int subdev, unsigned int chan, unsigned int *io); int comedi_dio_config(struct comedi_device *dev, unsigned int subdev, -- cgit v1.2.3 From f0fdaa4ad55b7c6e46a5ccb9102bc9a96cad360f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 27 Oct 2025 21:04:09 -0700 Subject: virt: acrn: split acrn_mmio_dev_res out of acrn_mmiodev Add struct acrn_mmio_dev_res before struct acrn_mmio_dev. The former is used in the latter and breaking them up provides better kernel-doc documentation for the struct members. Suggested-by: Fei Li Signed-off-by: Randy Dunlap Acked-by: Fei Li Link: https://patch.msgid.link/20251028040409.868254-1-rdunlap@infradead.org Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/acrn.h | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/acrn.h b/include/uapi/linux/acrn.h index 7b714c1902eb..79e7855a8c42 100644 --- a/include/uapi/linux/acrn.h +++ b/include/uapi/linux/acrn.h @@ -418,26 +418,32 @@ struct acrn_pcidev { }; /** - * struct acrn_mmiodev - Info for assigning or de-assigning a MMIO device - * @name: Name of the MMIO device. - * @res[].user_vm_pa: Physical address of User VM of the MMIO region - * for the MMIO device. - * @res[].service_vm_pa: Physical address of Service VM of the MMIO - * region for the MMIO device. - * @res[].size: Size of the MMIO region for the MMIO device. - * @res[].mem_type: Memory type of the MMIO region for the MMIO - * device. + * struct acrn_mmio_dev_res - MMIO device resource description + * @user_vm_pa: Physical address of User VM of the MMIO region + * for the MMIO device. + * @service_vm_pa: Physical address of Service VM of the MMIO + * region for the MMIO device. + * @size: Size of the MMIO region for the MMIO device. 
+ * @mem_type: Memory type of the MMIO region for the MMIO + * device. + */ +struct acrn_mmio_dev_res { + __u64 user_vm_pa; + __u64 service_vm_pa; + __u64 size; + __u64 mem_type; +}; + +/** + * struct acrn_mmiodev - Info for assigning or de-assigning an MMIO device + * @name: Name of the MMIO device. + * @res: Array of MMIO device descriptions * * This structure will be passed to hypervisor directly. */ struct acrn_mmiodev { __u8 name[8]; - struct { - __u64 user_vm_pa; - __u64 service_vm_pa; - __u64 size; - __u64 mem_type; - } res[ACRN_MMIODEV_RES_NUM]; + struct acrn_mmio_dev_res res[ACRN_MMIODEV_RES_NUM]; }; /** -- cgit v1.2.3 From f85d90dd8d0efbc75e79698e147c6e682df22e1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Oct 2025 09:12:10 +0100 Subject: sysfs: attribute_group: allow registration of const attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to constify instances of struct attribute it has to be possible to add them to struct attribute_group. The current type of the attrs member however is not compatible with that. Introduce a union that allows registration of both const and non-const attributes to enable a piecewise transition. As both union member types are compatible no logic needs to be adapted. Technically it is now possible to register a const struct attribute and receive it as a mutable pointer in the callbacks. This is a soundness issue. But this same soundness issue already exists today in sysfs_create_file(). Also the struct definition and callback implementation are always closely linked and are meant to be moved to const in lockstep.
Similar to commit 906c508afdca ("sysfs: attribute_group: allow registration of const bin_attribute") Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-1-ea7d745acff4@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 9a25a2911652..e34d6af96abb 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -105,7 +105,10 @@ struct attribute_group { size_t (*bin_size)(struct kobject *, const struct bin_attribute *, int); - struct attribute **attrs; + union { + struct attribute **attrs; + const struct attribute *const *attrs_const; + }; const struct bin_attribute *const *bin_attrs; }; -- cgit v1.2.3 From 964c93b1eef37e3bbe0edb37346c076217d71fe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Oct 2025 09:12:11 +0100 Subject: sysfs: transparently handle const pointers in ATTRIBUTE_GROUPS() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To ease the constification process of 'struct attribute', transparently handle the const pointers in ATTRIBUTE_GROUPS(). A cast is used instead of assigning to .attrs_new as it keeps the macro smaller. As both members are aliased to each other the result is identical. 
Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-2-ea7d745acff4@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index e34d6af96abb..92f82cee5f11 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -287,7 +287,12 @@ static const struct attribute_group *_name##_groups[] = { \ #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ - .attrs = _name##_attrs, \ + .attrs = _Generic(_name##_attrs, \ + struct attribute **: \ + _name##_attrs, \ + const struct attribute *const *: \ + (void *)_name##_attrs \ + ), \ }; \ __ATTRIBUTE_GROUPS(_name) -- cgit v1.2.3 From 02ac5335a55111d87a7a618355261b4407ed0f7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Oct 2025 09:12:12 +0100 Subject: sysfs: introduce __SYSFS_FUNCTION_ALTERNATIVE() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For the constification phase of 'struct attribute' various callback struct members will need to exist in both const and non-const variants. Keeping both members in a union avoids memory and CPU overhead but will be detected and trapped by Control Flow Integrity (CFI). By deciding between a struct and a union depending whether CFI is enabled, most configurations can avoid this overhead. Code using these callbacks will still need to be updated to handle both members explicitly. In the union case the compiler will recognize that testing for one union member is enough and optimize away the code for the other one. 
Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-3-ea7d745acff4@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 92f82cee5f11..9cef5bf24ba7 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -58,6 +58,12 @@ do { \ #define sysfs_attr_init(attr) do {} while (0) #endif +#ifdef CONFIG_CFI +#define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) struct { MEMBERS } +#else +#define __SYSFS_FUNCTION_ALTERNATIVE(MEMBERS...) union { MEMBERS } +#endif + /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name -- cgit v1.2.3 From 7dd9fdb4939b972c1d0523e94fb3f70789653f0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Oct 2025 09:12:13 +0100 Subject: sysfs: attribute_group: enable const variants of is_visible() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When constifying instances of struct attribute, for consistency the corresponding .is_visible() callback should be adapted, too. Introduce a temporary transition mechanism until all callbacks are converted. 
Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-4-ea7d745acff4@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 9cef5bf24ba7..592886ed6ca9 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -104,8 +104,12 @@ do { \ */ struct attribute_group { const char *name; - umode_t (*is_visible)(struct kobject *, - struct attribute *, int); + __SYSFS_FUNCTION_ALTERNATIVE( + umode_t (*is_visible)(struct kobject *, + struct attribute *, int); + umode_t (*is_visible_const)(struct kobject *, + const struct attribute *, int); + ); umode_t (*is_bin_visible)(struct kobject *, const struct bin_attribute *, int); size_t (*bin_size)(struct kobject *, -- cgit v1.2.3 From 71464949b1f5f8b8599d057fea525a2a520f84d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 29 Oct 2025 09:12:16 +0100 Subject: sysfs: simplify attribute definition macros MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define the macros in terms of each other. This makes them easier to understand and also will make it easier to implement the transition machinery for 'const struct attribute'. __ATTR_RO_MODE() can't be implemented in terms of __ATTR() as not all attributes have a .store callback. The same issue theoretically exists for __ATTR_WO(), but practically that does not occur today. Reorder __ATTR_RO() below __ATTR_RO_MODE() to keep the order of the macro definition consistent with respect to each other. 
Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251029-sysfs-const-attr-prep-v5-7-ea7d745acff4@weissschuh.net Signed-off-by: Greg Kroah-Hartman --- include/linux/sysfs.h | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 592886ed6ca9..c33a96b7391a 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -251,28 +251,20 @@ struct attribute_group { .store = _store, \ } -#define __ATTR_RO(_name) { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = _name##_show, \ -} - #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } -#define __ATTR_RW_MODE(_name, _mode) { \ - .attr = { .name = __stringify(_name), \ - .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ - .show = _name##_show, \ - .store = _name##_store, \ -} +#define __ATTR_RO(_name) \ + __ATTR_RO_MODE(_name, 0444) -#define __ATTR_WO(_name) { \ - .attr = { .name = __stringify(_name), .mode = 0200 }, \ - .store = _name##_store, \ -} +#define __ATTR_RW_MODE(_name, _mode) \ + __ATTR(_name, _mode, _name##_show, _name##_store) + +#define __ATTR_WO(_name) \ + __ATTR(_name, 0200, NULL, _name##_store) #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) -- cgit v1.2.3 From d3d25f430cadc59d42965f54f54a8c0050931860 Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Thu, 6 Nov 2025 10:58:38 +0530 Subject: mod_devicetable: Bump auxiliary_device_id name size We have an upcoming driver named "intel_ehl_pse_io". This creates an auxiliary child device for its GPIO sub-functionality, which matches against "intel_ehl_pse_io.gpio-elkhartlake" and overshoots the current maximum limit of 32 bytes for auxiliary device id string. Bump the size to 40 bytes to satisfy such cases.
Suggested-by: Andy Shevchenko Signed-off-by: Raag Jadav Link: https://patch.msgid.link/20251106052838.433673-1-raag.jadav@intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/mod_devicetable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 6077972e8b45..24eb5a88a5c5 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -867,7 +867,7 @@ struct mhi_device_id { kernel_ulong_t driver_data; }; -#define AUXILIARY_NAME_SIZE 32 +#define AUXILIARY_NAME_SIZE 40 #define AUXILIARY_MODULE_PREFIX "auxiliary:" struct auxiliary_device_id { -- cgit v1.2.3 From d6f4941f1b4f3e701e422dfbfee024264294f91f Mon Sep 17 00:00:00 2001 From: Benedek Kupper Date: Tue, 7 Oct 2025 22:35:44 +0200 Subject: drivers: hid: renegotiate resolution multipliers with device after reset The scroll resolution multipliers are set in the context of hidinput_connect(), which is only called at probe time: when the host changes the value on the device with a SET_REPORT(FEATURE), and the device accepts it, these multipliers are stored on the host side, and used to calculate the final scroll event values sent to userspace. After a USB suspend, the resume operation on many hubs and chipsets involves a USB reset signal as well. A reset on the device side clears all previous state information, including the value of the multiplier report. This reset is not handled by the multiplier handling logic, so what ends up happening is the host is still expecting high-resolution scroll events, but the device is reset to default resolution, making the effective, user-perceived scroll speed incredibly slow. The solution is to renegotiate the multiplier selection after each reset.
This is not the only bug related to the high-resolution scrolling implementation in the kernel (the other one is https://bugzilla.kernel.org/show_bug.cgi?id=220144), but for this one there is no device-side workaround, leading to a poor user experience with our product: https://github.com/UltimateHackingKeyboard/firmware/issues/1155 https://github.com/UltimateHackingKeyboard/firmware/issues/1261 https://github.com/UltimateHackingKeyboard/firmware/pull/1355 This patch was tested by an affected user and has been reported to fix the issue (see discussion in 1355). Signed-off-by: Benedek Kupper Signed-off-by: Jiri Kosina --- include/linux/hid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/hid.h b/include/linux/hid.h index a4ddb94e3ee5..dce862cafbbd 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -984,6 +984,7 @@ extern void hidinput_hid_event(struct hid_device *, struct hid_field *, struct h extern void hidinput_report_event(struct hid_device *hid, struct hid_report *report); extern int hidinput_connect(struct hid_device *hid, unsigned int force); extern void hidinput_disconnect(struct hid_device *); +void hidinput_reset_resume(struct hid_device *hid); struct hid_field *hid_find_field(struct hid_device *hdev, unsigned int report_type, unsigned int application, unsigned int usage); -- cgit v1.2.3 From 81c45c62dc3eefd83af8eb8df10e45705e8e3a47 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 3 Nov 2025 09:27:55 -0800 Subject: iommu/arm-smmu-v3-iommufd: Allow attaching nested domain for GBPA cases A vDEVICE has been a hard requirement for attaching a nested domain to the device. This makes sense when installing a guest STE, since a vSID must be present and given to the kernel during the vDEVICE allocation.
But, when CR0.SMMUEN is disabled, VM doesn't really need a vSID to program the vSMMU behavior as GBPA will take effect, in which case the vSTE in the nested domain could have carried the bypass or abort configuration in GBPA register. Thus, having such a hard requirement doesn't work well for GBPA. Skip vmaster allocation in arm_smmu_attach_prepare_vmaster() for an abort or bypass vSTE. Note that device on this attachment won't report vevents. Update the uAPI doc accordingly. Link: https://patch.msgid.link/r/20251103172755.2026145-1-nicolinc@nvidia.com Tested-by: Shameer Kolothum Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Reviewed-by: Pranjal Shrivastava Tested-by: Shuai Xue Signed-off-by: Jason Gunthorpe --- include/uapi/linux/iommufd.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index c218c89e0e2e..2c41920b641d 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -450,6 +450,16 @@ struct iommu_hwpt_vtd_s1 { * nested domain will translate the same as the nesting parent. The S1 will * install a Context Descriptor Table pointing at userspace memory translated * by the nesting parent. + * + * It's suggested to allocate a vDEVICE object carrying vSID and then re-attach + * the nested domain, as soon as the vSID is available in the VMM level: + * + * - when Cfg=translate, a vDEVICE must be allocated prior to attaching to the + * allocated nested domain, as CD/ATS invalidations and vevents need a vSID. + * - when Cfg=bypass/abort, a vDEVICE is not enforced during the nested domain + * attachment, to support a GBPA case where VM sets CR0.SMMUEN=0. However, if + * VM sets CR0.SMMUEN=1 while missing a vDEVICE object, kernel would fail to + * report events to the VM. E.g. F_TRANSLATION when guest STE.Cfg=abort. 
*/ struct iommu_hwpt_arm_smmuv3 { __aligned_le64 ste[2]; -- cgit v1.2.3 From f83ac7544fbf7ba3f77c122e16ab5319f75bbdfd Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Tue, 25 Nov 2025 17:34:25 +0800 Subject: function_graph: Enable funcgraph-args and funcgraph-retaddr to work simultaneously Currently, the funcgraph-args and funcgraph-retaddr features are mutually exclusive. This patch resolves this limitation by allowing funcgraph-retaddr to have an args array. To verify the change, use perf to trace vfs_write with both options enabled: Before: # perf ftrace -G vfs_write --graph-opts args,retaddr ...... down_read() { /* <-n_tty_write+0xa3/0x540 */ __cond_resched(); /* <-down_read+0x12/0x160 */ preempt_count_add(); /* <-down_read+0x3b/0x160 */ preempt_count_sub(); /* <-down_read+0x8b/0x160 */ } After: # perf ftrace -G vfs_write --graph-opts args,retaddr ...... down_read(sem=0xffff8880100bea78) { /* <-n_tty_write+0xa3/0x540 */ __cond_resched(); /* <-down_read+0x12/0x160 */ preempt_count_add(val=1); /* <-down_read+0x3b/0x160 */ preempt_count_sub(val=1); /* <-down_read+0x8b/0x160 */ } Cc: Steven Rostedt (Google) Cc: Sven Schnelle Cc: Masami Hiramatsu Cc: Xiaoqin Zhang Link: https://patch.msgid.link/20251125093425.2563849-1-dolinux.peng@gmail.com Signed-off-by: pengdonglin Signed-off-by: Steven Rostedt (Google) --- include/linux/ftrace.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7ded7df6e9b5..6ca9c6229d93 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1126,17 +1126,14 @@ static inline void ftrace_init(void) { } */ struct ftrace_graph_ent { unsigned long func; /* Current function */ - int depth; + unsigned long depth; } __packed; /* * Structure that defines an entry function trace with retaddr. - * It's already packed but the attribute "packed" is needed - * to remove extra padding at the end. 
*/ struct fgraph_retaddr_ent { - unsigned long func; /* Current function */ - int depth; + struct ftrace_graph_ent ent; unsigned long retaddr; /* Return address */ } __packed; -- cgit v1.2.3 From 4677e78800bbde62a9edce0eb3b40c775ec55e0d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 25 Nov 2025 16:17:59 -0500 Subject: socket: Unify getsockname and getpeername implementation They are already implemented by the same get_name hook in the protocol level. Bring the unification one level up to reduce code duplication in preparation to supporting these as io_uring operations. Reviewed-by: Kuniyuki Iwashima Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/linux/socket.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 3b262487ec06..937fe331ff1e 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -454,9 +454,7 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len); -extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len); + int __user *usockaddr_len, int peer); extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown_sock(struct socket *sock, int how); -- cgit v1.2.3 From d73c1677087391379441c0bb444c7fb4238fc6e7 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 25 Nov 2025 16:18:00 -0500 Subject: socket: Split out a getsockname helper for io_uring Similar to getsockopt, split out a helper to check security and issue the operation from the main handler that can be used by io_uring. 
Signed-off-by: Gabriel Krisman Bertazi Reviewed-by: Kuniyuki Iwashima Signed-off-by: Jens Axboe --- include/linux/socket.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/socket.h b/include/linux/socket.h index 937fe331ff1e..8d580074ddea 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -453,6 +453,8 @@ extern int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen); extern int __sys_listen(int fd, int backlog); extern int __sys_listen_socket(struct socket *sock, int backlog); +extern int do_getsockname(struct socket *sock, int peer, + struct sockaddr __user *usockaddr, int __user *usockaddr_len); extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len, int peer); extern int __sys_socketpair(int family, int type, int protocol, -- cgit v1.2.3 From 5d24321e4c159088604512d7a5c5cf634d23e01a Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Tue, 25 Nov 2025 16:18:01 -0500 Subject: io_uring: Introduce getsockname io_uring cmd Introduce a socket-specific io_uring_cmd to support getsockname/getpeername via io_uring. I made this an io_uring_cmd instead of a new operation to avoid polluting the command namespace with what is exclusively a socket operation. In addition, since we don't need to conform to existing interfaces, this merges the getsockname/getpeername in a single operation, since the implementation is pretty much the same. This has been frequently requested, for instance at [1] and more recently in the project Discord channel. The main use-case is to support fixed socket file descriptors. 
[1] https://github.com/axboe/liburing/issues/1356 Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index deb772222b6d..b5b23c0d5283 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -1009,6 +1009,7 @@ enum io_uring_socket_op { SOCKET_URING_OP_GETSOCKOPT, SOCKET_URING_OP_SETSOCKOPT, SOCKET_URING_OP_TX_TIMESTAMP, + SOCKET_URING_OP_GETSOCKNAME, }; /* -- cgit v1.2.3 From dac092195b6a35bc7c9f11e2884cfecb1b25e20c Mon Sep 17 00:00:00 2001 From: Yang Erkun Date: Wed, 12 Nov 2025 16:45:36 +0800 Subject: ext4: rename EXT4_GET_BLOCKS_PRE_IO This flag has been generalized to split an unwritten extent when we do dio or dioread_nolock writeback, or to avoid merging new extents which were created by an extent split. Update some related comments too. Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Reviewed-by: Baokun Li Signed-off-by: Yang Erkun Message-ID: <20251112084538.1658232-2-yangerkun@huawei.com> Signed-off-by: Theodore Ts'o --- include/trace/events/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index a05bdd48e16e..fd76d14c2776 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -39,7 +39,7 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ - { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ + { EXT4_GET_BLOCKS_SPLIT_NOMERGE, "SPLIT_NOMERGE" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ -- cgit v1.2.3 From 85f5491d9c6e9662653c8e6e7b70637b98537ecc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 3 Nov 2025 21:34:01 +0100 
Subject: libceph: drop started parameter of __ceph_open_session() With the previous commit revamping the timeout handling, started isn't used anymore. It could be taken into account by adjusting the initial value of the timeout, but there is little point as both callers capture the timestamp shortly before calling __ceph_open_session() -- the only thing of note that happens in the interim is taking client->mount_mutex and that isn't expected to take multiple seconds. Signed-off-by: Ilya Dryomov Reviewed-by: Viacheslav Dubeyko --- include/linux/ceph/libceph.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 733e7f93db66..63e0e2aa1ce9 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -306,8 +306,7 @@ struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); extern void ceph_reset_client_addr(struct ceph_client *client); -extern int __ceph_open_session(struct ceph_client *client, - unsigned long started); +extern int __ceph_open_session(struct ceph_client *client); extern int ceph_open_session(struct ceph_client *client); int ceph_wait_for_latest_osdmap(struct ceph_client *client, unsigned long timeout); -- cgit v1.2.3 From 6aac2aa2dfae38b60f22c3dfe4103ceefbe2d761 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 24 Nov 2025 18:11:45 +0000 Subject: phy: rename hwtstamp callback to hwtstamp_set PHY devices have a hwtstamp callback which actually performs a set operation. Rename it to better reflect the action. 
Reviewed-by: Russell King (Oracle) Reviewed-by: Kory Maincent Reviewed-by: Andrew Lunn Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20251124181151.277256-2-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/mii_timestamper.h | 8 ++++---- include/linux/phy.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index 995db62570f9..08863c0e9ea3 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -27,7 +27,7 @@ struct phy_device; * as soon as a timestamp becomes available. One of the PTP_CLASS_ * values is passed in 'type'. * - * @hwtstamp: Handles SIOCSHWTSTAMP ioctl for hardware time stamping. + * @hwtstamp_set: Handles SIOCSHWTSTAMP ioctl for hardware time stamping. * * @link_state: Allows the device to respond to changes in the link * state. The caller invokes this function while holding @@ -51,9 +51,9 @@ struct mii_timestamper { void (*txtstamp)(struct mii_timestamper *mii_ts, struct sk_buff *skb, int type); - int (*hwtstamp)(struct mii_timestamper *mii_ts, - struct kernel_hwtstamp_config *kernel_config, - struct netlink_ext_ack *extack); + int (*hwtstamp_set)(struct mii_timestamper *mii_ts, + struct kernel_hwtstamp_config *kernel_config, + struct netlink_ext_ack *extack); void (*link_state)(struct mii_timestamper *mii_ts, struct phy_device *phydev); diff --git a/include/linux/phy.h b/include/linux/phy.h index 65b0c3ca6a2b..059a104223c4 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1915,7 +1915,7 @@ static inline bool phy_polling_mode(struct phy_device *phydev) */ static inline bool phy_has_hwtstamp(struct phy_device *phydev) { - return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp; + return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp_set; } /** @@ -1950,7 +1950,7 @@ static inline int phy_hwtstamp(struct phy_device *phydev, struct kernel_hwtstamp_config 
*cfg, struct netlink_ext_ack *extack) { - return phydev->mii_ts->hwtstamp(phydev->mii_ts, cfg, extack); + return phydev->mii_ts->hwtstamp_set(phydev->mii_ts, cfg, extack); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, -- cgit v1.2.3 From f467777efbfb8034d813b601b961b25f777b3d37 Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Mon, 24 Nov 2025 18:11:46 +0000 Subject: phy: add hwtstamp_get callback to phy drivers PHY devices lacked a hwtstamp_get callback even though most of them track configuration info. Introduce a new callback in mii_timestamper. Reviewed-by: Russell King (Oracle) Reviewed-by: Kory Maincent Signed-off-by: Vadim Fedorenko Link: https://patch.msgid.link/20251124181151.277256-3-vadim.fedorenko@linux.dev Signed-off-by: Jakub Kicinski --- include/linux/mii_timestamper.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mii_timestamper.h b/include/linux/mii_timestamper.h index 08863c0e9ea3..3102c425c8e0 100644 --- a/include/linux/mii_timestamper.h +++ b/include/linux/mii_timestamper.h @@ -29,6 +29,8 @@ struct phy_device; * * @hwtstamp_set: Handles SIOCSHWTSTAMP ioctl for hardware time stamping. * + * @hwtstamp_get: Handles SIOCGHWTSTAMP ioctl for hardware time stamping. + * * @link_state: Allows the device to respond to changes in the link * state. The caller invokes this function while holding * the phy_device mutex. 
@@ -55,6 +57,9 @@ struct mii_timestamper { struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); + int (*hwtstamp_get)(struct mii_timestamper *mii_ts, + struct kernel_hwtstamp_config *kernel_config); + void (*link_state)(struct mii_timestamper *mii_ts, struct phy_device *phydev); -- cgit v1.2.3 From 4a93adcbd201aad5ba607810cfe1b19d44e5d171 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 12 Nov 2025 11:28:46 +0100 Subject: of: Add wrappers to match root node with OF device ID tables Several drivers duplicate same code for getting reference to the root node, matching it against 'struct of_device_id' table and getting out the match data from the table entry. There is a of_machine_compatible_match() wrapper but it takes array of strings, which is not suitable for many drivers since they want the driver data associated with each compatible. Add two wrappers, similar to existing of_device_get_match_data(): 1. of_machine_device_match() doing only matching against 'struct of_device_id' and returning bool. 2. of_machine_get_match_data() doing the matching and returning associated driver data for found compatible. 
Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Krzysztof Kozlowski Reviewed-by: Lukasz Luba Tested-by: AngeloGioacchino Del Regno Link: https://patch.msgid.link/20251112-b4-of-match-matchine-data-v2-1-d46b72003fd6@linaro.org Signed-off-by: Rob Herring (Arm) --- include/linux/of.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 121a288ca92d..01bb3affcd49 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -407,6 +407,8 @@ extern int of_alias_get_id(const struct device_node *np, const char *stem); extern int of_alias_get_highest_id(const char *stem); bool of_machine_compatible_match(const char *const *compats); +bool of_machine_device_match(const struct of_device_id *matches); +const void *of_machine_get_match_data(const struct of_device_id *matches); /** * of_machine_is_compatible - Test root of device tree for a given compatible value @@ -855,6 +857,17 @@ static inline bool of_machine_compatible_match(const char *const *compats) return false; } +static inline bool of_machine_device_match(const struct of_device_id *matches) +{ + return false; +} + +static inline const void * +of_machine_get_match_data(const struct of_device_id *matches) +{ + return NULL; +} + static inline bool of_console_check(const struct device_node *dn, const char *name, int index) { return false; -- cgit v1.2.3 From 1cd1c472343b06d6d32038636ce51bfa2251e3cf Mon Sep 17 00:00:00 2001 From: Jon Kohler Date: Tue, 25 Nov 2025 15:27:53 -0700 Subject: virtio-net: avoid unnecessary checksum calculation on guest RX Commit a2fb4bc4e2a6 ("net: implement virtio helpers to handle UDP GSO tunneling.") inadvertently altered checksum offload behavior for guests not using UDP GSO tunneling. Before, tun_put_user called tun_vnet_hdr_from_skb, which passed has_data_valid = true to virtio_net_hdr_from_skb. 
After, tun_put_user began calling tun_vnet_hdr_tnl_from_skb instead, which passes has_data_valid = false into both call sites. This caused virtio hdr flags to not include VIRTIO_NET_HDR_F_DATA_VALID for SKBs where skb->ip_summed == CHECKSUM_UNNECESSARY. As a result, guests are forced to recalculate checksums unnecessarily. Restore the previous behavior by ensuring has_data_valid = true is passed in the !tnl_gso_type case, but only from tun side, as virtio_net_hdr_tnl_from_skb() is used also by the virtio_net driver, which in turn must not use VIRTIO_NET_HDR_F_DATA_VALID on tx. cc: stable@vger.kernel.org Fixes: a2fb4bc4e2a6 ("net: implement virtio helpers to handle UDP GSO tunneling.") Signed-off-by: Jon Kohler Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Link: https://patch.msgid.link/20251125222754.1737443-1-jon@nutanix.com Signed-off-by: Jakub Kicinski --- include/linux/virtio_net.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index b673c31569f3..75dabb763c65 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -384,7 +384,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, struct virtio_net_hdr_v1_hash_tunnel *vhdr, bool tnl_hdr_negotiated, bool little_endian, - int vlan_hlen) + int vlan_hlen, + bool has_data_valid) { struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)vhdr; unsigned int inner_nh, outer_th; @@ -394,8 +395,8 @@ virtio_net_hdr_tnl_from_skb(const struct sk_buff *skb, tnl_gso_type = skb_shinfo(skb)->gso_type & (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM); if (!tnl_gso_type) - return virtio_net_hdr_from_skb(skb, hdr, little_endian, false, - vlan_hlen); + return virtio_net_hdr_from_skb(skb, hdr, little_endian, + has_data_valid, vlan_hlen); /* Tunnel support not negotiated but skb ask for it. 
*/ if (!tnl_hdr_negotiated) -- cgit v1.2.3 From 361173f95ae4b726ebbbf0bd594274f5576c4abc Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 13 Nov 2025 04:34:31 -0500 Subject: virtio: fix typo in virtio_device_ready() comment "coherenct" -> "coherent" Fixes: 8b4ec69d7e09 ("virtio: harden vring IRQ") Message-Id: Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 16001e9f9b39..1ea5baa62141 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -362,7 +362,7 @@ void virtio_device_ready(struct virtio_device *dev) * specific set_status() method. * * A well behaved device will only notify a virtqueue after - * DRIVER_OK, this means the device should "see" the coherenct + * DRIVER_OK, this means the device should "see" the coherent * memory write that set vq->broken as false which is done by * the driver when it sees DRIVER_OK, then the following * driver's vring_interrupt() will see vq->broken as false so -- cgit v1.2.3 From 7831791e77a1cd29528d4dc336ce14466aef5ba6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 13 Nov 2025 04:34:34 -0500 Subject: virtio: fix whitespace in virtio_config_ops The finalize_features documentation uses a tab between words. Use space instead. Fixes: d16c0cd27331 ("docs: driver-api: virtio: virtio on Linux") Message-Id: <39d7685c82848dc6a876d175e33a1407f6ab3fc1.1763026134.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 1ea5baa62141..dbc7eff1f101 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -86,7 +86,7 @@ struct virtqueue_info { * vdev: the virtio_device * This sends the driver feature bits to the device: it can change * the dev->feature bits if it wants. - * Note that despite the name this can be called any number of + * Note that despite the name this can be called any number of * times. * Returns 0 on success or error status * @bus_name: return the bus name associated with the device (optional) -- cgit v1.2.3 From 63598fba55ab9d384818fed48dc04006cecf7be4 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 13 Nov 2025 04:34:36 -0500 Subject: virtio: fix grammar in virtio_queue_info docs Fix grammar in the description of @ctx Fixes: c502eb85c34e ("virtio: introduce virtio_queue_info struct and find_vqs_info() config op") Message-Id: Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index dbc7eff1f101..78cf4119f567 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -24,7 +24,7 @@ typedef void vq_callback_t(struct virtqueue *); * a virtqueue unused by the driver. * @callback: A callback to invoke on a used buffer notification. * NULL for a virtqueue that does not need a callback. - * @ctx: A flag to indicate to maintain an extra context per virtqueue. + * @ctx: whether to maintain an extra context per virtqueue. */ struct virtqueue_info { const char *name; -- cgit v1.2.3 From c15f42e09178d2849744ccf064200f5e7f71e688 Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Thu, 13 Nov 2025 04:34:38 -0500 Subject: virtio: fix grammar in virtio_map_ops docs Fix grammar issues in the virtio_map_ops docs: - missing article before "transport" - "implements" -> "implement" to match subject Fixes: bee8c7c24b73 ("virtio: introduce map ops in virtio core") Message-Id: <3f7bcae5a984f14b72e67e82572b110acb06fa7e.1763026134.git.mst@redhat.com> Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 78cf4119f567..6660132258d4 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -141,8 +141,8 @@ struct virtio_config_ops { /** * struct virtio_map_ops - operations for mapping buffer for a virtio device - * Note: For transport that has its own mapping logic it must - * implements all of the operations + * Note: For a transport that has its own mapping logic it must + * implement all of the operations * @map_page: map a buffer to the device * map: metadata for performing mapping * page: the page that will be mapped by the device -- cgit v1.2.3 From 5e88a5a97d113619b674ebfdd1d2065f2edd10eb Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 13 Nov 2025 04:34:41 -0500 Subject: virtio: standardize Returns documentation style Remove colons after "Returns" in virtio_map_ops function documentation - both to avoid triggering an htmldoc warning and for consistency with virtio_config_ops. This affects map_page, alloc, need_sync, and max_mapping_size. Fixes: bee8c7c24b73 ("virtio: introduce map ops in virtio core") Message-Id: Acked-by: Jason Wang Signed-off-by: Michael S. 
Tsirkin --- include/linux/virtio_config.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 6660132258d4..e231147ff92d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -150,7 +150,7 @@ struct virtio_config_ops { * size: the buffer size * dir: mapping direction * attrs: mapping attributes - * Returns: the mapped address + * Returns the mapped address * @unmap_page: unmap a buffer from the device * map: device specific mapping map * map_handle: the mapped address @@ -172,7 +172,7 @@ struct virtio_config_ops { * size: the size of the buffer * map_handle: the mapping address to sync * gfp: allocation flag (GFP_XXX) - * Returns: virtual address of the allocated buffer + * Returns virtual address of the allocated buffer * @free: free a coherent buffer mapping * map: metadata for performing mapping * size: the size of the buffer @@ -182,13 +182,13 @@ struct virtio_config_ops { * @need_sync: if the buffer needs synchronization * map: metadata for performing mapping * map_handle: the mapped address - * Returns: whether the buffer needs synchronization + * Returns whether the buffer needs synchronization * @mapping_error: if the mapping address is error * map: metadata for performing mapping * map_handle: the mapped address * @max_mapping_size: get the maximum buffer size that can be mapped * map: metadata for performing mapping - * Returns: the maximum buffer size that can be mapped + * Returns the maximum buffer size that can be mapped */ struct virtio_map_ops { dma_addr_t (*map_page)(union virtio_map map, struct page *page, -- cgit v1.2.3 From 43236d8bbafff94b423afecc4a692dd90602d426 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 13 Nov 2025 04:34:43 -0500 Subject: virtio: fix virtqueue_set_affinity() docs Rewrite the comment for better grammar and clarity. 
Fixes: 75a0a52be3c2 ("virtio: introduce an API to set affinity for a virtqueue") Message-Id: Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index e231147ff92d..1a019a1f168d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -384,7 +384,7 @@ const char *virtio_bus_name(struct virtio_device *vdev) * @vq: the virtqueue * @cpu_mask: the cpu mask * - * Pay attention the function are best-effort: the affinity hint may not be set + * Note that this function is best-effort: the affinity hint may not be set * due to config support, irq type and sharing. * */ -- cgit v1.2.3 From deb55fc994e3dc38f139c0147c15fc2a9db27086 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 13 Nov 2025 04:34:49 -0500 Subject: virtio: fix map ops comment @free will free the map handle not sync it. Fix the doc to match. Fixes: bee8c7c24b73 ("virtio: introduce map ops in virtio core") Message-Id: Acked-by: Jason Wang Signed-off-by: Michael S. Tsirkin --- include/linux/virtio_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 1a019a1f168d..a1af2676bbe6 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -177,7 +177,7 @@ struct virtio_config_ops { * map: metadata for performing mapping * size: the size of the buffer * vaddr: virtual address of the buffer - * map_handle: the mapping address to sync + * map_handle: the mapping address that needs to be freed * attrs: unmapping attributes * @need_sync: if the buffer needs synchronization * map: metadata for performing mapping -- cgit v1.2.3 From 9513f25056b22100ddffe24898c587873b0d022c Mon Sep 17 00:00:00 2001 From: "Michael S. 
Tsirkin" Date: Tue, 21 Oct 2025 10:56:57 -0400 Subject: virtio: clean up features qword/dword terms virtio pci uses word to mean "16 bits". mmio uses it to mean "32 bits". To avoid confusion, let's avoid the term in core virtio altogether. Just say U64 to mean "64 bit". Fixes: e7d4c1c5a546 ("virtio: introduce extended features") Cc: Paolo Abeni Acked-by: Jason Wang Message-ID: Signed-off-by: Michael S. Tsirkin --- include/linux/virtio.h | 2 +- include/linux/virtio_config.h | 2 +- include/linux/virtio_features.h | 29 +++++++++++++++-------------- include/linux/virtio_pci_modern.h | 8 ++++---- 4 files changed, 21 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 96c66126c074..132a474e5914 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -177,7 +177,7 @@ struct virtio_device { union virtio_map vmap; #ifdef CONFIG_VIRTIO_DEBUG struct dentry *debugfs_dir; - u64 debugfs_filter_features[VIRTIO_FEATURES_DWORDS]; + u64 debugfs_filter_features[VIRTIO_FEATURES_U64S]; #endif }; diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index a1af2676bbe6..69f84ea85d71 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -80,7 +80,7 @@ struct virtqueue_info { * Returns the first 64 feature bits. * @get_extended_features: * vdev: the virtio_device - * Returns the first VIRTIO_FEATURES_MAX feature bits (all we currently + * Returns the first VIRTIO_FEATURES_BITS feature bits (all we currently * need). * @finalize_features: confirm what device features we'll be using. 
* vdev: the virtio_device diff --git a/include/linux/virtio_features.h b/include/linux/virtio_features.h index f748f2f87de8..ea2ad8717882 100644 --- a/include/linux/virtio_features.h +++ b/include/linux/virtio_features.h @@ -4,15 +4,16 @@ #include -#define VIRTIO_FEATURES_DWORDS 2 -#define VIRTIO_FEATURES_MAX (VIRTIO_FEATURES_DWORDS * 64) -#define VIRTIO_FEATURES_WORDS (VIRTIO_FEATURES_DWORDS * 2) +#define VIRTIO_FEATURES_U64S 2 +#define VIRTIO_FEATURES_BITS (VIRTIO_FEATURES_U64S * 64) + #define VIRTIO_BIT(b) BIT_ULL((b) & 0x3f) -#define VIRTIO_DWORD(b) ((b) >> 6) +#define VIRTIO_U64(b) ((b) >> 6) + #define VIRTIO_DECLARE_FEATURES(name) \ union { \ u64 name; \ - u64 name##_array[VIRTIO_FEATURES_DWORDS];\ + u64 name##_array[VIRTIO_FEATURES_U64S];\ } static inline bool virtio_features_chk_bit(unsigned int bit) @@ -22,9 +23,9 @@ static inline bool virtio_features_chk_bit(unsigned int bit) * Don't care returning the correct value: the build * will fail before any bad features access */ - BUILD_BUG_ON(bit >= VIRTIO_FEATURES_MAX); + BUILD_BUG_ON(bit >= VIRTIO_FEATURES_BITS); } else { - if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_MAX)) + if (WARN_ON_ONCE(bit >= VIRTIO_FEATURES_BITS)) return false; } return true; @@ -34,26 +35,26 @@ static inline bool virtio_features_test_bit(const u64 *features, unsigned int bit) { return virtio_features_chk_bit(bit) && - !!(features[VIRTIO_DWORD(bit)] & VIRTIO_BIT(bit)); + !!(features[VIRTIO_U64(bit)] & VIRTIO_BIT(bit)); } static inline void virtio_features_set_bit(u64 *features, unsigned int bit) { if (virtio_features_chk_bit(bit)) - features[VIRTIO_DWORD(bit)] |= VIRTIO_BIT(bit); + features[VIRTIO_U64(bit)] |= VIRTIO_BIT(bit); } static inline void virtio_features_clear_bit(u64 *features, unsigned int bit) { if (virtio_features_chk_bit(bit)) - features[VIRTIO_DWORD(bit)] &= ~VIRTIO_BIT(bit); + features[VIRTIO_U64(bit)] &= ~VIRTIO_BIT(bit); } static inline void virtio_features_zero(u64 *features) { - memset(features, 0, sizeof(features[0]) * 
VIRTIO_FEATURES_DWORDS); + memset(features, 0, sizeof(features[0]) * VIRTIO_FEATURES_U64S); } static inline void virtio_features_from_u64(u64 *features, u64 from) @@ -66,7 +67,7 @@ static inline bool virtio_features_equal(const u64 *f1, const u64 *f2) { int i; - for (i = 0; i < VIRTIO_FEATURES_DWORDS; ++i) + for (i = 0; i < VIRTIO_FEATURES_U64S; ++i) if (f1[i] != f2[i]) return false; return true; @@ -74,14 +75,14 @@ static inline bool virtio_features_equal(const u64 *f1, const u64 *f2) static inline void virtio_features_copy(u64 *to, const u64 *from) { - memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_DWORDS); + memcpy(to, from, sizeof(to[0]) * VIRTIO_FEATURES_U64S); } static inline void virtio_features_andnot(u64 *to, const u64 *f1, const u64 *f2) { int i; - for (i = 0; i < VIRTIO_FEATURES_DWORDS; i++) + for (i = 0; i < VIRTIO_FEATURES_U64S; i++) to[i] = f1[i] & ~f2[i]; } diff --git a/include/linux/virtio_pci_modern.h b/include/linux/virtio_pci_modern.h index 48bc12d1045b..9a3f2fc53bd6 100644 --- a/include/linux/virtio_pci_modern.h +++ b/include/linux/virtio_pci_modern.h @@ -107,7 +107,7 @@ void vp_modern_set_extended_features(struct virtio_pci_modern_device *mdev, static inline u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) { - u64 features_array[VIRTIO_FEATURES_DWORDS]; + u64 features_array[VIRTIO_FEATURES_U64S]; vp_modern_get_extended_features(mdev, features_array); return features_array[0]; @@ -116,11 +116,11 @@ vp_modern_get_features(struct virtio_pci_modern_device *mdev) static inline u64 vp_modern_get_driver_features(struct virtio_pci_modern_device *mdev) { - u64 features_array[VIRTIO_FEATURES_DWORDS]; + u64 features_array[VIRTIO_FEATURES_U64S]; int i; vp_modern_get_driver_extended_features(mdev, features_array); - for (i = 1; i < VIRTIO_FEATURES_DWORDS; ++i) + for (i = 1; i < VIRTIO_FEATURES_U64S; ++i) WARN_ON_ONCE(features_array[i]); return features_array[0]; } @@ -128,7 +128,7 @@ vp_modern_get_driver_features(struct 
virtio_pci_modern_device *mdev) static inline void vp_modern_set_features(struct virtio_pci_modern_device *mdev, u64 features) { - u64 features_array[VIRTIO_FEATURES_DWORDS]; + u64 features_array[VIRTIO_FEATURES_U64S]; virtio_features_from_u64(features_array, features); vp_modern_set_extended_features(mdev, features_array); -- cgit v1.2.3 From e6c43c95009035a63091cd49736886f883127510 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 21 Nov 2025 08:39:55 -0800 Subject: net: phy: Add MDIO_PMA_CTRL1_SPEED for 2.5G and 5G to reflect PMA values The 2.5G and 5G values are not consistent between the PCS CTRL1 and PMA CTRL1 values. In order to avoid confusion between the two I am updating the values to include "PMA" in the name similar to values used in similar places. To avoid breaking UAPI I have retained the original macros and just defined them as the new PMA based defines. Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/176374319569.959489.6610469879021800710.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Paolo Abeni --- include/uapi/linux/mdio.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 6975f182b22c..9ee6eeae64b8 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -116,10 +116,18 @@ #define MDIO_CTRL1_SPEED10G (MDIO_CTRL1_SPEEDSELEXT | 0x00) /* 10PASS-TS/2BASE-TL */ #define MDIO_CTRL1_SPEED10P2B (MDIO_CTRL1_SPEEDSELEXT | 0x04) +/* Note: the MDIO_CTRL1_SPEED_XXX values for everything past 10PASS-TS/2BASE-TL + * do not match between the PCS and PMA values. Any additions past this point + * should be PMA or PCS specific. The following 2 defines are workarounds for + * values added before this was caught. They should be considered deprecated. 
+ */ +#define MDIO_CTRL1_SPEED2_5G MDIO_PMA_CTRL1_SPEED2_5G +#define MDIO_CTRL1_SPEED5G MDIO_PMA_CTRL1_SPEED5G /* 2.5 Gb/s */ -#define MDIO_CTRL1_SPEED2_5G (MDIO_CTRL1_SPEEDSELEXT | 0x18) +#define MDIO_PMA_CTRL1_SPEED2_5G (MDIO_CTRL1_SPEEDSELEXT | 0x18) /* 5 Gb/s */ -#define MDIO_CTRL1_SPEED5G (MDIO_CTRL1_SPEEDSELEXT | 0x1c) +#define MDIO_PMA_CTRL1_SPEED5G (MDIO_CTRL1_SPEEDSELEXT | 0x1c) + /* Status register 1. */ #define MDIO_STAT1_LPOWERABLE 0x0002 /* Low-power ability */ -- cgit v1.2.3 From 7622d55276932bfeb947b7b6cbf7ea0aa41feeb8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 21 Nov 2025 08:40:02 -0800 Subject: net: pcs: xpcs: Add support for 25G, 50G, and 100G interfaces With this change we are adding support for 25G, 50G, and 100G interface types to the XPCS driver. This had supposedly been enabled with the addition of XLGMII but I don't see any capability for configuration there so I suspect it may need to be refactored in the future. With this change we can enable the XPCS driver with the selected interface and it should be able to detect link, speed, and report the link status to the phylink interface. 
Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/176374320248.959489.11649590675011158859.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Paolo Abeni --- include/uapi/linux/mdio.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 9ee6eeae64b8..f23cab33e586 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -123,6 +123,12 @@ */ #define MDIO_CTRL1_SPEED2_5G MDIO_PMA_CTRL1_SPEED2_5G #define MDIO_CTRL1_SPEED5G MDIO_PMA_CTRL1_SPEED5G +/* 100 Gb/s */ +#define MDIO_PCS_CTRL1_SPEED100G (MDIO_CTRL1_SPEEDSELEXT | 0x10) +/* 25 Gb/s */ +#define MDIO_PCS_CTRL1_SPEED25G (MDIO_CTRL1_SPEEDSELEXT | 0x14) +/* 50 Gb/s */ +#define MDIO_PCS_CTRL1_SPEED50G (MDIO_CTRL1_SPEEDSELEXT | 0x18) /* 2.5 Gb/s */ #define MDIO_PMA_CTRL1_SPEED2_5G (MDIO_CTRL1_SPEEDSELEXT | 0x18) /* 5 Gb/s */ -- cgit v1.2.3 From 39e138173ae7641e952b456d2de7ad2ac03e8d88 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 21 Nov 2025 08:40:09 -0800 Subject: net: pcs: xpcs: Fix PMA identifier handling in XPCS The XPCS driver was mangling the PMA identifier as the original code appears to have been focused on just capturing the OUI. Rather than store a mangled ID it is better to work with the actual PMA ID and instead just mask out the values that don't apply rather than shifting them and reordering them as you still don't get the original OUI for the NIC without having to bitswap the values as per the definition of the layout in IEEE 802.3-2022 22.2.4.3.1. By laying it out as it was in the hardware it is also less likely for us to have an unintentional collision as the enum values will occupy the revision number area while the OUI occupies the upper 22 bits. 
Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/176374320920.959489.17267159479370601070.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Paolo Abeni --- include/linux/pcs/pcs-xpcs.h | 2 +- include/uapi/linux/mdio.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index e40f554ff717..4cf6bd611e5a 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -38,7 +38,7 @@ enum dw_xpcs_pma_id { DW_XPCS_PMA_GEN4_6G_ID, DW_XPCS_PMA_GEN5_10G_ID, DW_XPCS_PMA_GEN5_12G_ID, - WX_TXGBE_XPCS_PMA_10G_ID = 0x0018fc80, + WX_TXGBE_XPCS_PMA_10G_ID = 0xfc806000, }; struct dw_xpcs_info { diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index f23cab33e586..8d769f100de6 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -147,6 +147,11 @@ #define MDIO_AN_STAT1_PAGE 0x0040 /* Page received */ #define MDIO_AN_STAT1_XNP 0x0080 /* Extended next page status */ +/* Device Identifier 2 */ +#define MDIO_DEVID2_OUI 0xfc00 /* OUI Portion of PHY ID */ +#define MDIO_DEVID2_MODEL_NUM 0x03f0 /* Manufacturer's Model Number */ +#define MDIO_DEVID2_REV_NUM 0x000f /* Revision Number */ + /* Speed register. */ #define MDIO_SPEED_10G 0x0001 /* 10G capable */ #define MDIO_PMA_SPEED_2B 0x0002 /* 2BASE-TL capable */ -- cgit v1.2.3 From 3f29dd34f75a09ee7f8333305618edb44617d835 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 21 Nov 2025 08:40:16 -0800 Subject: net: pcs: xpcs: Add support for FBNIC 25G, 50G, 100G PMD The fbnic driver is planning to make use of the XPCS driver to enable support for PCS and better integration with phylink. To do this though we will need to enable several workarounds since the PMD interface for fbnic is likely to be unique since it is a mix of two different vendor products with a unique wrapper around the IP. 
I have generated a PHY identifier based on IEEE 802.3-2022 22.2.4.3.1 using an OUI belonging to Meta Platforms and used with our NICs. Using this we will provide it as the PMD ID via the SW based MDIO interface so that the fbnic device can be identified and necessary workarounds enabled in the XPCS driver. As an initial workaround this change adds an exception so that soft_reset is not set when the driver is initially bound to the PCS. In addition I have added logic to integrate the PMD Rx signal detect state into the link state for the PCS. With this we can avoid the link coming up too soon on the FBNIC PMD and as a result of it being in the training state so we can avoid link flaps. Signed-off-by: Alexander Duyck Link: https://patch.msgid.link/176374321695.959489.6648161125012056619.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Paolo Abeni --- include/linux/pcs/pcs-xpcs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pcs/pcs-xpcs.h b/include/linux/pcs/pcs-xpcs.h index 4cf6bd611e5a..36073f7b6bb4 100644 --- a/include/linux/pcs/pcs-xpcs.h +++ b/include/linux/pcs/pcs-xpcs.h @@ -39,6 +39,8 @@ enum dw_xpcs_pma_id { DW_XPCS_PMA_GEN5_10G_ID, DW_XPCS_PMA_GEN5_12G_ID, WX_TXGBE_XPCS_PMA_10G_ID = 0xfc806000, + /* Meta Platforms OUI 88:25:08, model 0, revision 0 */ + MP_FBNIC_XPCS_PMA_100G_ID = 0x46904000, }; struct dw_xpcs_info { -- cgit v1.2.3 From f2f36500a63b73a8be90127322ad740253cf89c0 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 Oct 2025 13:15:37 +0200 Subject: configfs: Constify ct_group_ops in struct config_item_type Make 'ct_group_ops' const in struct config_item_type. This allows constification of many structures which hold some function pointers. 
Signed-off-by: Christophe JAILLET Reviewed-by: Breno Leitao Link: https://lore.kernel.org/r/6b720cf407e8a6d30f35beb72e031b2553d1ab7e.1761390472.git.christophe.jaillet@wanadoo.fr Signed-off-by: Andreas Hindborg --- include/linux/configfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 698520b1bfdb..31a7d7124460 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -65,7 +65,7 @@ extern void config_item_put(struct config_item *); struct config_item_type { struct module *ct_owner; struct configfs_item_operations *ct_item_ops; - struct configfs_group_operations *ct_group_ops; + const struct configfs_group_operations *ct_group_ops; struct configfs_attribute **ct_attrs; struct configfs_bin_attribute **ct_bin_attrs; }; -- cgit v1.2.3 From f7f78098690d60a03b47942ac7d73ea17b42239e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 25 Oct 2025 13:15:38 +0200 Subject: configfs: Constify ct_item_ops in struct config_item_type Make 'ct_item_ops' const in struct config_item_type. This allows constification of many structures which hold some function pointers. 
Signed-off-by: Christophe JAILLET Reviewed-by: Breno Leitao Link: https://lore.kernel.org/r/f43cb57418a7f59e883be8eedc7d6abe802a2094.1761390472.git.christophe.jaillet@wanadoo.fr Signed-off-by: Andreas Hindborg --- include/linux/configfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/configfs.h b/include/linux/configfs.h index 31a7d7124460..ef65c75beeaa 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -64,7 +64,7 @@ extern void config_item_put(struct config_item *); struct config_item_type { struct module *ct_owner; - struct configfs_item_operations *ct_item_ops; + const struct configfs_item_operations *ct_item_ops; const struct configfs_group_operations *ct_group_ops; struct configfs_attribute **ct_attrs; struct configfs_bin_attribute **ct_bin_attrs; -- cgit v1.2.3 From d3f52f53a56278ce5ffeafa3cc6cfb3ecef770fe Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 5 Nov 2025 12:32:14 -0800 Subject: srcu: Create an SRCU-fast-updown API This commit creates an SRCU-fast-updown API, including DEFINE_SRCU_FAST_UPDOWN(), DEFINE_STATIC_SRCU_FAST_UPDOWN(), __init_srcu_struct_fast_updown(), init_srcu_struct_fast_updown(), srcu_read_lock_fast_updown(), srcu_read_unlock_fast_updown(), __srcu_read_lock_fast_updown(), and __srcu_read_unlock_fast_updown(). These are initially identical to their SRCU-fast counterparts, but both SRCU-fast and SRCU-fast-updown will be optimized in different directions by later commits. SRCU-fast will lack any sort of srcu_down_read() and srcu_up_read() APIs, which will enable extremely efficient NMI safety. For its part, SRCU-fast-updown will not be NMI safe, which will enable reasonably efficient implementations of srcu_down_read_fast() and srcu_up_read_fast(). This API fork happens to meet two different future use cases. * SRCU-fast will become the reimplementation basis for RCU-TASK-TRACE for consolidation. 
Since RCU-TASK-TRACE must be NMI safe, SRCU-fast must be as well. * SRCU-fast-updown will be needed for uretprobes code in order to get rid of the read-side memory barriers while still allowing entering the reader at task level while exiting it in a timer handler. This commit also adds rcutorture tests for the new APIs. This (annoyingly) needs to be in the same commit for bisectability. With this commit, the 0x8 value tests SRCU-fast-updown. However, most SRCU-fast testing will be via the RCU Tasks Trace wrappers. [ paulmck: Apply s/0x8/0x4/ missing change per Boqun Feng feedback. ] [ paulmck: Apply Akira Yokosawa feedback. ] Signed-off-by: Paul E. McKenney Cc: Andrii Nakryiko Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: Signed-off-by: Frederic Weisbecker --- include/linux/srcu.h | 77 ++++++++++++++++++++++++++++++++++++++++++++---- include/linux/srcutiny.h | 16 ++++++++++ include/linux/srcutree.h | 55 ++++++++++++++++++++++++++++++++-- 3 files changed, 141 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 1dd6812aabe7..344ad51c8f6c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -28,6 +28,8 @@ struct srcu_struct; int __init_srcu_struct(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); #ifndef CONFIG_TINY_SRCU int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lock_class_key *key); +int __init_srcu_struct_fast_updown(struct srcu_struct *ssp, const char *name, + struct lock_class_key *key); #endif // #ifndef CONFIG_TINY_SRCU #define init_srcu_struct(ssp) \ @@ -44,12 +46,20 @@ int __init_srcu_struct_fast(struct srcu_struct *ssp, const char *name, struct lo __init_srcu_struct_fast((ssp), #ssp, &__srcu_key); \ }) +#define init_srcu_struct_fast_updown(ssp) \ +({ \ + static struct lock_class_key __srcu_key; \ + \ + __init_srcu_struct_fast_updown((ssp), #ssp, &__srcu_key); \ +}) + #define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = 
#srcu_name }, #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ int init_srcu_struct(struct srcu_struct *ssp); #ifndef CONFIG_TINY_SRCU int init_srcu_struct_fast(struct srcu_struct *ssp); +int init_srcu_struct_fast_updown(struct srcu_struct *ssp); #endif // #ifndef CONFIG_TINY_SRCU #define __SRCU_DEP_MAP_INIT(srcu_name) @@ -305,6 +315,46 @@ static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct * return retval; } +/** + * srcu_read_lock_fast_updown - register a new reader for an SRCU-fast-updown structure. + * @ssp: srcu_struct in which to register the new reader. + * + * Enter an SRCU read-side critical section, but for a light-weight + * smp_mb()-free reader. See srcu_read_lock() for more information. + * This function is compatible with srcu_down_read_fast(), but is not + * NMI-safe. + * + * For srcu_read_lock_fast_updown() to be used on an srcu_struct + * structure, that structure must have been defined using either + * DEFINE_SRCU_FAST_UPDOWN() or DEFINE_STATIC_SRCU_FAST_UPDOWN() on the one + * hand or initialized with init_srcu_struct_fast_updown() on the other. + * Such an srcu_struct structure cannot be passed to any non-fast-updown + * variant of srcu_read_{,un}lock() or srcu_{down,up}_read(). In kernels + * built with CONFIG_PROVE_RCU=y, __srcu_check_read_flavor() will complain + * bitterly if you ignore this restriction. + * + * Grace-period auto-expediting is disabled for SRCU-fast-updown + * srcu_struct structures because SRCU-fast-updown expedited grace periods + * invoke synchronize_rcu_expedited(), IPIs and all. If you need expedited + * SRCU-fast-updown grace periods, use synchronize_srcu_expedited(). + * + * The srcu_read_lock_fast_updown() function can be invoked only from + * those contexts where RCU is watching, that is, from contexts where + * it would be legal to invoke rcu_read_lock(). Otherwise, lockdep will + * complain.
+ */ +static inline struct srcu_ctr __percpu *srcu_read_lock_fast_updown(struct srcu_struct *ssp) +__acquires(ssp) +{ + struct srcu_ctr __percpu *retval; + + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_lock_fast_updown()."); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + retval = __srcu_read_lock_fast_updown(ssp); + rcu_try_lock_acquire(&ssp->dep_map); + return retval; +} + /* * Used by tracing, cannot be traced and cannot call lockdep. * See srcu_read_lock_fast() for more information. @@ -335,8 +385,8 @@ static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct * { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_down_read_fast()."); - srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); - return __srcu_read_lock_fast(ssp); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + return __srcu_read_lock_fast_updown(ssp); } /** @@ -432,6 +482,23 @@ static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ct RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_read_unlock_fast()."); } +/** + * srcu_read_unlock_fast_updown - unregister an old reader from an SRCU-fast-updown structure. + * @ssp: srcu_struct in which to unregister the old reader. + * @scp: return value from corresponding srcu_read_lock_fast_updown(). + * + * Exit an SRCU-fast-updown read-side critical section. + */ +static inline void +srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) __releases(ssp) +{ + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + srcu_lock_release(&ssp->dep_map); + __srcu_read_unlock_fast_updown(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), + "RCU must be watching srcu_read_unlock_fast_updown()."); +} + /* * Used by tracing, cannot be traced and cannot call lockdep. * See srcu_read_unlock_fast() for more information.
@@ -455,9 +522,9 @@ static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __ __releases(ssp) { WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi()); - srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST); - __srcu_read_unlock_fast(ssp, scp); - RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast()."); + srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST_UPDOWN); + __srcu_read_unlock_fast_updown(ssp, scp); + RCU_LOCKDEP_WARN(!rcu_is_watching(), "RCU must be watching srcu_up_read_fast_updown()."); } /** diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 1ecc3393fb26..e0698024667a 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -50,13 +50,18 @@ void srcu_drive_gp(struct work_struct *wp); #define DEFINE_SRCU_FAST(name) DEFINE_SRCU(name) #define DEFINE_STATIC_SRCU_FAST(name) \ static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) +#define DEFINE_SRCU_FAST_UPDOWN(name) DEFINE_SRCU(name) +#define DEFINE_STATIC_SRCU_FAST_UPDOWN(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name, name, name) // Dummy structure for srcu_notifier_head. 
struct srcu_usage { }; #define __SRCU_USAGE_INIT(name) { } #define __init_srcu_struct_fast __init_srcu_struct +#define __init_srcu_struct_fast_updown __init_srcu_struct #ifndef CONFIG_DEBUG_LOCK_ALLOC #define init_srcu_struct_fast init_srcu_struct +#define init_srcu_struct_fast_updown init_srcu_struct #endif // #ifndef CONFIG_DEBUG_LOCK_ALLOC void synchronize_srcu(struct srcu_struct *ssp); @@ -100,6 +105,17 @@ static inline void __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); } +static inline struct srcu_ctr __percpu *__srcu_read_lock_fast_updown(struct srcu_struct *ssp) +{ + return __srcu_ctr_to_ptr(ssp, __srcu_read_lock(ssp)); +} + +static inline +void __srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + __srcu_read_unlock(ssp, __srcu_ptr_to_ctr(ssp, scp)); +} + static inline void synchronize_srcu_expedited(struct srcu_struct *ssp) { synchronize_srcu(ssp); diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h index 6080a9094618..d6f978b50472 100644 --- a/include/linux/srcutree.h +++ b/include/linux/srcutree.h @@ -199,8 +199,15 @@ struct srcu_struct { * * See include/linux/percpu-defs.h for the rules on per-CPU variables. * - * DEFINE_SRCU_FAST() creates an srcu_struct and associated structures - * whose readers must be of the SRCU-fast variety. + * DEFINE_SRCU_FAST() and DEFINE_STATIC_SRCU_FAST create an srcu_struct + * and associated structures whose readers must be of the SRCU-fast variety. + * DEFINE_SRCU_FAST_UPDOWN() and DEFINE_STATIC_SRCU_FAST_UPDOWN() create + * an srcu_struct and associated structures whose readers must be of the + * SRCU-fast-updown variety. 
The key point (aside from error checking) with + * both varieties is that the grace periods must use synchronize_rcu() + * instead of smp_mb(), and given that the first (for example) + * srcu_read_lock_fast() might race with the first synchronize_srcu(), + * this difference must be specified at initialization time. */ #ifdef MODULE # define __DEFINE_SRCU(name, fast, is_static) \ @@ -221,6 +228,10 @@ struct srcu_struct { #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, 0, static) #define DEFINE_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, /* not static */) #define DEFINE_STATIC_SRCU_FAST(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST, static) +#define DEFINE_SRCU_FAST_UPDOWN(name) __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST_UPDOWN, \ + /* not static */) +#define DEFINE_STATIC_SRCU_FAST_UPDOWN(name) \ + __DEFINE_SRCU(name, SRCU_READ_FLAVOR_FAST_UPDOWN, static) int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void synchronize_srcu_expedited(struct srcu_struct *ssp); @@ -305,6 +316,46 @@ __srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader. } +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Returns a pointer that must be passed to the matching + * srcu_read_unlock_fast_updown(). This type of reader is compatible + * with srcu_down_read_fast() and srcu_up_read_fast(). + * + * See the __srcu_read_lock_fast() comment for more details. + */ +static inline +struct srcu_ctr __percpu notrace *__srcu_read_lock_fast_updown(struct srcu_struct *ssp) +{ + struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp); + + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_locks.counter); // Y, and implicit RCU reader. + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_locks)); // Y, and implicit RCU reader. + barrier(); /* Avoid leaking the critical section.
*/ + return scp; +} + +/* + * Removes the count for the old reader from the appropriate + * per-CPU element of the srcu_struct. Note that this may well be a + * different CPU than that which was incremented by the corresponding + * srcu_read_lock_fast(), but it must be within the same task. + * + * Please see the __srcu_read_lock_fast() function's header comment for + * information on implicit RCU readers and NMI safety. + */ +static inline void notrace +__srcu_read_unlock_fast_updown(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp) +{ + barrier(); /* Avoid leaking the critical section. */ + if (!IS_ENABLED(CONFIG_NEED_SRCU_NMI_SAFE)) + this_cpu_inc(scp->srcu_unlocks.counter); // Z, and implicit RCU reader. + else + atomic_long_inc(raw_cpu_ptr(&scp->srcu_unlocks)); // Z, and implicit RCU reader. +} + void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor); // Record SRCU-reader usage type only for CONFIG_PROVE_RCU=y kernels. -- cgit v1.2.3 From 6ca07a9b63ff4ac24931a21086542cd7092ad74f Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 1 Oct 2025 15:46:36 +0200 Subject: sysctl: Replace void pointer with const pointer to ctl_table * Replace void* data in the converter functions with a const struct ctl_table* table as it was only getting forwarding values from ctl_table->extra{1,2}. * Remove the void* data in the do_proc_* functions as they already had a pointer to the ctl_table. * Remove min/max structures do_proc_do{uint,int}vec_minmax_conv_param; the min/max values get passed directly in ctl_table. * Keep min/max initialization in extra{1,2} in proc_dou8vec_minmax. * The do_proc_douintvec was adjusted outside sysctl.c as it is exported to fs/pipe.c. 
Signed-off-by: Joel Granados --- include/linux/sysctl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 28c4a997fd21..436191e569da 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -235,9 +235,8 @@ bool sysctl_is_alias(char *param); int do_proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, - unsigned int *valp, - int write, void *data), - void *data); + unsigned int *valp, int write, + const struct ctl_table *table)); extern int unaligned_enabled; extern int no_unaligned_warning; -- cgit v1.2.3 From c5b4c183f7aeb46cd27ddea9dab776655b8d7034 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Wed, 8 Oct 2025 16:12:37 +0200 Subject: sysctl: Allow custom converters from outside sysctl The new non-static proc_dointvec_conv forwards a custom converter function to do_proc_dointvec from outside the sysctl scope. Rename the do_proc_dointvec call points so any future changes to proc_dointvec_conv are propagated in sysctl.c This is a preparation commit that allows the integer jiffie converter functions to move out of kernel/sysctl.c. 
Signed-off-by: Joel Granados --- include/linux/sysctl.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 436191e569da..a48273757c99 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -68,6 +68,10 @@ int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, + int dir, const struct ctl_table *table)); int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dointvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, -- cgit v1.2.3 From e2e5dac304fdf991fb974510db4565db04ef1335 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 14 Oct 2025 12:42:01 +0200 Subject: sysctl: Move INT converter macros to sysctl header Move direction macros (SYSCTL_{USER_TO_KERN,KERN_TO_USER}) and the integer converter macros (SYSCTL_{USER_TO_KERN,KERN_TO_USER}_INT_CONV, SYSCTL_INT_CONV_CUSTOM) into include/linux/sysctl.h. This is a preparation commit to enable jiffies converter creation outside kernel/sysctl.c. 
Signed-off-by: Joel Granados --- include/linux/sysctl.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a48273757c99..a0ca9496119a 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -59,6 +59,81 @@ extern const int sysctl_vals[]; #define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) #define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) +/** + * + * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1) + * in the file_operations struct at proc/proc_sysctl.c. Its value means + * one of two things for sysctl: + * 1. SYSCTL_USER_TO_KERN(dir) Writing to an internal kernel variable from user + * space (dir > 0) + * 2. SYSCTL_KERN_TO_USER(dir) Writing to a user space buffer from a kernel + * variable (dir == 0). + */ +#define SYSCTL_USER_TO_KERN(dir) (!!(dir)) +#define SYSCTL_KERN_TO_USER(dir) (!dir) + +#define SYSCTL_USER_TO_KERN_INT_CONV(name, u_ptr_op) \ +int sysctl_user_to_kern_int_conv##name(const bool *negp, \ + const unsigned long *u_ptr,\ + int *k_ptr) \ +{ \ + unsigned long u = u_ptr_op(*u_ptr); \ + if (*negp) { \ + if (u > (unsigned long) INT_MAX + 1) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, -u); \ + } else { \ + if (u > (unsigned long) INT_MAX) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, u); \ + } \ + return 0; \ +} + +#define SYSCTL_KERN_TO_USER_INT_CONV(name, k_ptr_op) \ +int sysctl_kern_to_user_int_conv##name(bool *negp, \ + unsigned long *u_ptr, \ + const int *k_ptr) \ +{ \ + int val = READ_ONCE(*k_ptr); \ + if (val < 0) { \ + *negp = true; \ + *u_ptr = -k_ptr_op((unsigned long)val); \ + } else { \ + *negp = false; \ + *u_ptr = k_ptr_op((unsigned long)val); \ + } \ + return 0; \ +} + +/** + * To range check on a converted value, use a temp k_ptr + * When checking range, value should be within (tbl->extra1, tbl->extra2) + */ +#define SYSCTL_INT_CONV_CUSTOM(name, user_to_kern, kern_to_user, \ + 
k_ptr_range_check) \ +int do_proc_int_conv##name(bool *negp, unsigned long *u_ptr, int *k_ptr,\ + int dir, const struct ctl_table *tbl) \ +{ \ + if (SYSCTL_KERN_TO_USER(dir)) \ + return kern_to_user(negp, u_ptr, k_ptr); \ + \ + if (k_ptr_range_check) { \ + int tmp_k, ret; \ + if (!tbl) \ + return -EINVAL; \ + ret = user_to_kern(negp, u_ptr, &tmp_k); \ + if (ret) \ + return ret; \ + if ((tbl->extra1 && *(int *)tbl->extra1 > tmp_k) || \ + (tbl->extra2 && *(int *)tbl->extra2 < tmp_k)) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, tmp_k); \ + } else \ + return user_to_kern(negp, u_ptr, k_ptr); \ + return 0; \ +} + extern const unsigned long sysctl_long_vals[]; typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer, -- cgit v1.2.3 From 24a08eefddb33c7a259975e932c434b85f70d684 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 16 Oct 2025 10:38:45 +0200 Subject: sysctl: Move UINT converter macros to sysctl header Move SYSCTL_USER_TO_KERN_UINT_CONV and SYSCTL_UINT_CONV_CUSTOM macros to include/linux/sysctl.h. No need to embed sysctl_kern_to_user_uint_conv in a macro as it will not need a custom kernel pointer operation. This is a preparation commit to enable jiffies converter creation outside kernel/sysctl.c. 
Signed-off-by: Joel Granados --- include/linux/sysctl.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a0ca9496119a..fa78136617ad 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -134,6 +134,45 @@ int do_proc_int_conv##name(bool *negp, unsigned long *u_ptr, int *k_ptr,\ return 0; \ } +#define SYSCTL_USER_TO_KERN_UINT_CONV(name, u_ptr_op) \ +int sysctl_user_to_kern_uint_conv##name(const unsigned long *u_ptr,\ + unsigned int *k_ptr) \ +{ \ + unsigned long u = u_ptr_op(*u_ptr); \ + if (u > UINT_MAX) \ + return -EINVAL; \ + WRITE_ONCE(*k_ptr, u); \ + return 0; \ +} + +#define SYSCTL_UINT_CONV_CUSTOM(name, user_to_kern, kern_to_user, \ + k_ptr_range_check) \ +int do_proc_uint_conv##name(unsigned long *u_ptr, unsigned int *k_ptr, \ + int dir, const struct ctl_table *tbl) \ +{ \ + if (SYSCTL_KERN_TO_USER(dir)) \ + return kern_to_user(u_ptr, k_ptr); \ + \ + if (k_ptr_range_check) { \ + unsigned int tmp_k; \ + int ret; \ + if (!tbl) \ + return -EINVAL; \ + ret = user_to_kern(u_ptr, &tmp_k); \ + if (ret) \ + return ret; \ + if ((tbl->extra1 && \ + *(unsigned int *)tbl->extra1 > tmp_k) || \ + (tbl->extra2 && \ + *(unsigned int *)tbl->extra2 < tmp_k)) \ + return -ERANGE; \ + WRITE_ONCE(*k_ptr, tmp_k); \ + } else \ + return user_to_kern(u_ptr, k_ptr); \ + return 0; \ +} + + extern const unsigned long sysctl_long_vals[]; typedef int proc_handler(const struct ctl_table *ctl, int write, void *buffer, @@ -166,6 +205,7 @@ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void * int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_kern_to_user_uint_conv(unsigned long *u_ptr, const unsigned int *k_ptr); /* * Register a set of sysctl names by calling register_sysctl -- cgit 
v1.2.3 From 54932988c4230925d2bf0023509ac2fee59a089a Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 14 Oct 2025 13:04:16 +0200 Subject: sysctl: Move jiffies converters to kernel/time/jiffies.c Move integer jiffies converters (proc_dointvec{_,_ms_,_userhz_}jiffies and proc_dointvec_ms_jiffies_minmax) to kernel/time/jiffies.c. Error stubs for when CONFIG_PROC_SYSCTL is not defined are not reproduced because all the jiffies converters go through proc_dointvec_conv which is already stubbed. This is part of the greater effort to move sysctl logic out of kernel/sysctl.c thereby reducing merge conflicts in kernel/sysctl.c. Signed-off-by: Joel Granados --- include/linux/jiffies.h | 10 ++++++++++ include/linux/sysctl.h | 7 ------- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 0d1927da8055..72d589a8a0d6 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -611,4 +611,14 @@ extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 +struct ctl_table; +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos); + #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index fa78136617ad..db4020f6933b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -192,13 +192,6 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer size_t *lenp, loff_t *ppos); int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec_jiffies(const
struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos); -int proc_dointvec_userhz_jiffies(const struct ctl_table *, int, void *, size_t *, - loff_t *); -int proc_dointvec_ms_jiffies(const struct ctl_table *, int, void *, size_t *, - loff_t *); int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *, size_t *, loff_t *); -- cgit v1.2.3 From 4639faaa607f3bed85f2cdde686a88453c99ef06 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 14 Oct 2025 13:35:42 +0200 Subject: sysctl: Move proc_doulongvec_ms_jiffies_minmax to kernel/time/jiffies.c Move proc_doulongvec_ms_jiffies_minmax to kernel/time/jiffies.c. Create a non static wrapper function proc_doulongvec_minmax_conv that forwards the custom convmul and convdiv argument values to the internal do_proc_doulongvec_minmax. Remove unused linux/times.h include from kernel/sysctl.c. 
Signed-off-by: Joel Granados --- include/linux/jiffies.h | 2 ++ include/linux/sysctl.h | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 72d589a8a0d6..fdef2c155c27 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -620,5 +620,7 @@ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos); +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index db4020f6933b..30f6a184d3f4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -193,8 +193,9 @@ int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int, void *, - size_t *, loff_t *); +int proc_doulongvec_minmax_conv(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv); int proc_do_large_bitmap(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From 30baaeb685bce0b7dfd3c5a55f22b1076c21f7b2 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 14 Oct 2025 14:21:03 +0200 Subject: sysctl: Create pipe-max-size converter using sysctl UINT macros Create a converter for the pipe-max-size proc_handler using the SYSCTL_UINT_CONV_CUSTOM. 
Move SYSCTL_CONV_IDENTITY macro to the sysctl header to make it available for pipe size validation. Keep returning -EINVAL when (val == 0) by using a range checking converter and setting the minimal valid value (extra1) to SYSCTL_ONE. Keep round_pipe_size by passing it as the operation for SYSCTL_USER_TO_KERN_UINT_CONV. Signed-off-by: Joel Granados --- include/linux/sysctl.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 30f6a184d3f4..4c88514a7d1a 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -59,6 +59,7 @@ extern const int sysctl_vals[]; #define SYSCTL_LONG_ONE ((void *)&sysctl_long_vals[1]) #define SYSCTL_LONG_MAX ((void *)&sysctl_long_vals[2]) +#define SYSCTL_CONV_IDENTITY(val) (val) /** * * "dir" originates from read_iter (dir = 0) or write_iter (dir = 1) -- cgit v1.2.3 From 564195c1a33c8fc631cd3d306e350b0e3d3e9555 Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 16 Oct 2025 11:04:23 +0200 Subject: sysctl: Wrap do_proc_douintvec with the public function proc_douintvec_conv Make do_proc_douintvec static and export proc_douintvec_conv wrapper function for external use. This is in keeping with the design in sysctl.c. Update fs/pipe.c to use the new public API.
Signed-off-by: Joel Granados --- include/linux/sysctl.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4c88514a7d1a..288fe0055cd5 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -183,14 +183,20 @@ int proc_dostring(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_dointvec(const struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos); int proc_dointvec_conv(const struct ctl_table *table, int dir, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *u_ptr, int *k_ptr, int dir, const struct ctl_table *table)); int proc_douintvec(const struct ctl_table *, int, void *, size_t *, loff_t *); -int proc_dointvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int proc_douintvec_conv(const struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos, + int (*conv)(unsigned long *lvalp, unsigned int *valp, + int write, const struct ctl_table *table)); + int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int proc_doulongvec_minmax(const struct ctl_table *, int, void *, size_t *, loff_t *); @@ -346,11 +352,6 @@ extern struct ctl_table_header *register_sysctl_mount_point(const char *path); void do_sysctl_args(void); bool sysctl_is_alias(char *param); -int do_proc_douintvec(const struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos, - int (*conv)(unsigned long *lvalp, - unsigned int *valp, int write, - const struct ctl_table *table)); extern int unaligned_enabled; extern 
int no_unaligned_warning; -- cgit v1.2.3 From 5fee9edf791a50182382fae23f30690c93e16cec Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 27 Nov 2025 16:34:20 +0000 Subject: ASoC: SDCA: Align mute controls to ALSA expectations Currently mute controls will be called "FU xx Mute Switch" (note the switch is added programmatically outside the coverage of this patch) and the accompanying volume control would be called "FU xx Channel Volume". These names are taken from the SDCA specification, however, this does not mesh well with the ALSA naming system. ALSA generally expects enables rather than mutes and expects that mutes and volumes have matching names. Update the names and invert the mute controls to make them more standard "FU XX Channel Switch", this does slightly deviate from the SDCA specification but it makes the rest of the Linux ecosystem a lot happier. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251127163426.2500633-2-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index c97861508a15..2564fad33fd4 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -611,7 +611,7 @@ enum sdca_entity0_controls { #define SDCA_CTL_NDAI_PACKETTYPE_NAME "NDAI Packet Type" #define SDCA_CTL_MIXER_NAME "Mixer" #define SDCA_CTL_SELECTOR_NAME "Selector" -#define SDCA_CTL_MUTE_NAME "Mute" +#define SDCA_CTL_MUTE_NAME "Channel" #define SDCA_CTL_CHANNEL_VOLUME_NAME "Channel Volume" #define SDCA_CTL_AGC_NAME "AGC" #define SDCA_CTL_BASS_BOOST_NAME "Bass Boost" -- cgit v1.2.3 From 48fa77af2f4a55ab961520f2a0e50560dc0baca8 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 27 Nov 2025 16:34:21 +0000 Subject: ASoC: SDCA: Add terminal type into input/output widget name There have been some complaints around the UCM files for SDCA devices that the 
control system is quite hard to follow. This is definitely true: without the specification handy, the naming can be a little cryptic. However, as most of the information is parsed from DisCo there are some limits to what the driver can safely do to improve this. Still, one area that can be improved is the non-streaming input/output terminals. These have a field (enum sdca_terminal_type) that describes the usage of that terminal. These types can be appended to the entity name to give the users a better clue as to the purpose. For example, "OT 43" would now become "OT 43 Headphone". This would follow through into the jack controls which would change from "OT 43 Jack" to "OT 43 Headphone Jack", making the purpose much more obvious to the user. This provides slightly more readable controls without relying on implicit knowledge that individual parts might not conform to. Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251127163426.2500633-3-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/sdca_function.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/sdca_function.h b/include/sound/sdca_function.h index 2564fad33fd4..6e9391b3816c 100644 --- a/include/sound/sdca_function.h +++ b/include/sound/sdca_function.h @@ -1456,6 +1456,8 @@ int sdca_parse_function(struct device *dev, struct sdw_slave *sdw, struct sdca_function_desc *desc, struct sdca_function_data *function); +const char *sdca_find_terminal_name(enum sdca_terminal_type type); + struct sdca_control *sdca_selector_find_control(struct device *dev, struct sdca_entity *entity, const int sel); -- cgit v1.2.3 From 2ae4659533d8e2b5e06e8f570e2b4b7b88ae0716 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 27 Nov 2025 16:34:23 +0000 Subject: ASoC: sdw_utils: Move codec_name to dai info As SDCA devices will support each DAI link on a different child device, move the codec name from codec_info to each dai_info.
This allows the appropriate function device to be bound to each DAI link. Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251127163426.2500633-5-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 76c64c5245d4..714e207d4c01 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -45,6 +45,7 @@ struct asoc_sdw_codec_info; struct asoc_sdw_dai_info { const bool direction[2]; /* playback & capture support */ + const char *codec_name; const char *dai_name; const char *component_name; const int dai_type; @@ -67,7 +68,6 @@ struct asoc_sdw_dai_info { struct asoc_sdw_codec_info { const int part_id; const int version_id; - const char *codec_name; const char *name_prefix; int amp_num; const u8 acpi_id[ACPI_ID_LEN]; @@ -131,7 +131,7 @@ int asoc_sdw_hw_free(struct snd_pcm_substream *substream); void asoc_sdw_shutdown(struct snd_pcm_substream *substream); const char *asoc_sdw_get_codec_name(struct device *dev, - const struct asoc_sdw_codec_info *codec_info, + const struct asoc_sdw_dai_info *dai_info, const struct snd_soc_acpi_link_adr *adr_link, int adr_index); -- cgit v1.2.3 From c66297d09e1a5813eb743bae8cda4e115b8a5c56 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 27 Nov 2025 16:34:24 +0000 Subject: ASoC: intel: sof_sdw: Add ability to have auxiliary devices Currently the sof_sdw machine driver assumes that all devices involved in the sound card are connected through a DAI link. However, for SDCA devices we still want the HID (Human Interface Device, used for jack buttons) to be part of the sound card, but it contains no DAI links. Add support into the machine driver to specify a list of auxiliary devices to be merged into the card.
Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251127163426.2500633-6-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 714e207d4c01..48719fde308c 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -13,6 +13,7 @@ #include #define SOC_SDW_MAX_DAI_NUM 8 +#define SOC_SDW_MAX_AUX_NUM 2 #define SOC_SDW_MAX_NO_PROPS 2 #define SOC_SDW_JACK_JDSRC(quirk) ((quirk) & GENMASK(3, 0)) @@ -65,6 +66,10 @@ struct asoc_sdw_dai_info { bool quirk_exclude; }; +struct asoc_sdw_aux_info { + const char *codec_name; +}; + struct asoc_sdw_codec_info { const int part_id; const int version_id; @@ -75,6 +80,8 @@ struct asoc_sdw_codec_info { const struct snd_soc_ops *ops; struct asoc_sdw_dai_info dais[SOC_SDW_MAX_DAI_NUM]; const int dai_num; + struct asoc_sdw_aux_info auxs[SOC_SDW_MAX_AUX_NUM]; + const int aux_num; int (*codec_card_late_probe)(struct snd_soc_card *card); @@ -165,13 +172,15 @@ int asoc_sdw_init_simple_dai_link(struct device *dev, struct snd_soc_dai_link *d int no_pcm, int (*init)(struct snd_soc_pcm_runtime *rtd), const struct snd_soc_ops *ops); -int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, int *num_devs, int *num_ends); +int asoc_sdw_count_sdw_endpoints(struct snd_soc_card *card, + int *num_devs, int *num_ends, int *num_aux); struct asoc_sdw_dailink *asoc_sdw_find_dailink(struct asoc_sdw_dailink *dailinks, const struct snd_soc_acpi_endpoint *new); int asoc_sdw_get_dai_type(u32 type); int asoc_sdw_parse_sdw_endpoints(struct snd_soc_card *card, + struct snd_soc_aux_dev *soc_aux, struct asoc_sdw_dailink *soc_dais, struct asoc_sdw_endpoint *soc_ends, int *num_devs); -- cgit v1.2.3 From 3f6b562f2107ab2467908fa1543e1a6ea8442bd1 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Thu, 27 Nov 2025 
16:34:25 +0000 Subject: ASoC: sdw_utils: Add cs42l45 support functions Add the helper functions into the machine driver for the cs42l45; this will register a jack for jack detection and add things into the components string if they are needed. Reviewed-by: Bard Liao Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20251127163426.2500633-7-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/soc_sdw_utils.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/sound/soc_sdw_utils.h b/include/sound/soc_sdw_utils.h index 48719fde308c..227347c8f0b3 100644 --- a/include/sound/soc_sdw_utils.h +++ b/include/sound/soc_sdw_utils.h @@ -257,6 +257,8 @@ int asoc_sdw_cs42l42_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_da int asoc_sdw_cs42l43_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs42l43_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs42l43_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_cs42l45_hs_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); +int asoc_sdw_cs42l45_dmic_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_cs_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); int asoc_sdw_maxim_spk_rtd_init(struct snd_soc_pcm_runtime *rtd, struct snd_soc_dai *dai); /* TI */ -- cgit v1.2.3 From 8c8e3df3d2f51e9a3f6f1a1112adf250f7652d42 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 27 Oct 2025 23:33:00 +0100 Subject: keys: Fix grammar and formatting in 'struct key_type' comments s/it/if/ and s/revokation/revocation/, capitalize "clear", and add a period after the sentence. Fix the comment formatting.
Signed-off-by: Thorsten Blum Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- include/linux/key-type.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/key-type.h b/include/linux/key-type.h index 5caf3ce82373..bb97bd3e5af4 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -107,11 +107,14 @@ struct key_type { */ int (*match_preparse)(struct key_match_data *match_data); - /* Free preparsed match data (optional). This should be supplied it - * ->match_preparse() is supplied. */ + /* + * Free preparsed match data (optional). This should be supplied if + * ->match_preparse() is supplied. + */ void (*match_free)(struct key_match_data *match_data); - /* clear some of the data from a key on revokation (optional) + /* + * Clear some of the data from a key on revocation (optional). * - the key's semaphore will be write-locked by the caller */ void (*revoke)(struct key *key); -- cgit v1.2.3 From c2d2dad24503d7e2eb7cba354fcc73f95fa78d7a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Nov 2025 14:06:45 +0000 Subject: rbtree: inline rb_first() Patch series "rbtree: inline rb_first() and rb_last()". Inline these two small helpers, heavily used in TCP and FQ packet scheduler, and in many other places. This reduces kernel text size, and brings a 1.5% improvement on a network TCP stress test. This patch (of 2): This is a very small function, inlining it saves cpu cycles by reducing register pressure and removing call/ret overhead. It also reduces vmlinux text size by 744 bytes on a typical x86_64 build.
Before: size vmlinux text data bss dec hex filename 34812525 22177365 5685248 62675138 3bc58c2 vmlinux After: size vmlinux text data bss dec hex filename 34811781 22177365 5685248 62674394 3bc55da vmlinux [ojeda@kernel.org: fix rust build] Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20251114140646.3817319-1-edumazet@google.com Link: https://lkml.kernel.org/r/20251114140646.3817319-2-edumazet@google.com Signed-off-by: Eric Dumazet Signed-off-by: Miguel Ojeda Reviewed-by: Kuan-Wei Chiu Cc: Jakub Kacinski Cc: Neal Cardwell Cc: Paolo Abeni Cc: Alice Ryhl Cc: Stehen Rothwell Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 8d2ba3749866..484554900f7d 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -43,7 +43,21 @@ extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); -extern struct rb_node *rb_first(const struct rb_root *); + +/* + * This function returns the first node (in sort order) of the tree. + */ +static inline struct rb_node *rb_first(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_left) + n = n->rb_left; + return n; +} extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ -- cgit v1.2.3 From 94984bfed58ca129f7e259ce09973ed0b3f540a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 14 Nov 2025 14:06:46 +0000 Subject: rbtree: inline rb_last() This is a very small function, inlining it saves cpu cycles in TCP by reducing register pressure and removing call/ret overhead. 
It also reduces vmlinux text size by 122 bytes on a typical x86_64 build. Before: size vmlinux text data bss dec hex filename 34811781 22177365 5685248 62674394 3bc55da vmlinux After: size vmlinux text data bss dec hex filename 34811659 22177365 5685248 62674272 3bc5560 vmlinux [ojeda@kernel.org: fix rust build] Link: https://lkml.kernel.org/r/20251120085518.1463498-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20251114140646.3817319-3-edumazet@google.com Signed-off-by: Eric Dumazet Signed-off-by: Miguel Ojeda Reviewed-by: Kuan-Wei Chiu Cc: Jakub Kacinski Cc: Neal Cardwell Cc: Paolo Abeni Cc: Alice Ryhl Cc: Stehen Rothwell Signed-off-by: Andrew Morton --- include/linux/rbtree.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 484554900f7d..4091e978aef2 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -58,7 +58,21 @@ static inline struct rb_node *rb_first(const struct rb_root *root) n = n->rb_left; return n; } -extern struct rb_node *rb_last(const struct rb_root *); + +/* + * This function returns the last node (in sort order) of the tree. + */ +static inline struct rb_node *rb_last(const struct rb_root *root) +{ + struct rb_node *n; + + n = root->rb_node; + if (!n) + return NULL; + while (n->rb_right) + n = n->rb_right; + return n; +} /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); -- cgit v1.2.3 From 70f9133096c833922c3b63461480248cefa7bb0f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Sat, 1 Nov 2025 10:23:18 -0400 Subject: kho: drop notifiers The KHO framework uses a notifier chain as the mechanism for clients to participate in the finalization process. While this works for a single, central state machine, it is too restrictive for kernel-internal components like pstore/reserve_mem or IMA. 
These components need a simpler, direct way to register their state for preservation (e.g., during their initcall) without being part of a complex, shutdown-time notifier sequence. The notifier model forces all participants into a single finalization flow and makes direct preservation from an arbitrary context difficult. This patch refactors the client participation model by removing the notifier chain and introducing a direct API for managing FDT subtrees. The core kho_finalize() and kho_abort() state machine remains, but clients now register their data with KHO beforehand. Link: https://lkml.kernel.org/r/20251101142325.1326536-3-pasha.tatashin@soleen.com Signed-off-by: Mike Rapoport (Microsoft) Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 25042c1d8d54..0d860d793b66 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -10,14 +10,7 @@ struct kho_scratch { phys_addr_t size; }; -/* KHO Notifier index */ -enum kho_event { - KEXEC_KHO_FINALIZE = 0, - KEXEC_KHO_ABORT = 1, -}; - struct folio; -struct notifier_block; struct page; #define DECLARE_KHOSER_PTR(name, type) \ @@ -37,8 +30,6 @@ struct page; (typeof((s).ptr))((s).phys ? 
phys_to_virt((s).phys) : NULL); \ }) -struct kho_serialization; - struct kho_vmalloc_chunk; struct kho_vmalloc { DECLARE_KHOSER_PTR(first, struct kho_vmalloc_chunk *); @@ -57,12 +48,10 @@ int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); -int kho_add_subtree(struct kho_serialization *ser, const char *name, void *fdt); +int kho_add_subtree(const char *name, void *fdt); +void kho_remove_subtree(void *fdt); int kho_retrieve_subtree(const char *name, phys_addr_t *phys); -int register_kho_notifier(struct notifier_block *nb); -int unregister_kho_notifier(struct notifier_block *nb); - void kho_memory_init(void); void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, @@ -110,23 +99,16 @@ static inline void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) return NULL; } -static inline int kho_add_subtree(struct kho_serialization *ser, - const char *name, void *fdt) +static inline int kho_add_subtree(const char *name, void *fdt) { return -EOPNOTSUPP; } -static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) +static inline void kho_remove_subtree(void *fdt) { - return -EOPNOTSUPP; } -static inline int register_kho_notifier(struct notifier_block *nb) -{ - return -EOPNOTSUPP; -} - -static inline int unregister_kho_notifier(struct notifier_block *nb) +static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return -EOPNOTSUPP; } -- cgit v1.2.3 From 36f8f7ef7fd2f238922e9d217e86c69838319d8c Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 1 Nov 2025 10:23:19 -0400 Subject: kho: add interfaces to unpreserve folios, page ranges, and vmalloc Allow users of KHO to cancel the previous preservation by adding the necessary interfaces to unpreserve folio, pages, and vmallocs. 
Link: https://lkml.kernel.org/r/20251101142325.1326536-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Changyuan Lyu Cc: Christian Brauner Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jonathan Corbet Cc: Masahiro Yamada Cc: Miguel Ojeda Cc: Randy Dunlap Cc: Simon Horman Cc: Tejun Heo Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 0d860d793b66..80ece4232617 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -43,8 +43,11 @@ bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); +int kho_unpreserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); +int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); +int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); @@ -72,17 +75,32 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; } +static inline int kho_unpreserve_folio(struct folio *folio) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return -EOPNOTSUPP; } +static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) +{ + return -EOPNOTSUPP; +} + static inline int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) { return -EOPNOTSUPP; } +static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) +{ + return -EOPNOTSUPP; +} + static inline struct 
folio *kho_restore_folio(phys_addr_t phys) { return NULL; -- cgit v1.2.3 From 4c205677af2726bd3b51c02ab6a5a2b411efed09 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 13:59:52 -0500 Subject: kho: introduce high-level memory allocation API Currently, clients of KHO must manually allocate memory (e.g., via alloc_pages), calculate the page order, and explicitly call kho_preserve_folio(). Similarly, cleanup requires separate calls to unpreserve and free the memory. Introduce a high-level API to streamline this common pattern: - kho_alloc_preserve(size): Allocates physically contiguous, zeroed memory and immediately marks it for preservation. - kho_unpreserve_free(ptr): Unpreserves and frees the memory in the current kernel. - kho_restore_free(ptr): Restores the struct page state of preserved memory in the new kernel and immediately frees it to the page allocator. [pasha.tatashin@soleen.com: build fixes] Link: https://lkml.kernel.org/r/CA+CK2bBgXDhrHwTVgxrw7YTQ-0=LgW0t66CwPCgG=C85ftz4zw@mail.gmail.com Link: https://lkml.kernel.org/r/20251114190002.3311679-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index 80ece4232617..dde952227b88 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -2,8 +2,9 @@ #ifndef LINUX_KEXEC_HANDOVER_H #define LINUX_KEXEC_HANDOVER_H -#include +#include #include +#include struct kho_scratch { phys_addr_t addr; @@ -48,6 +49,9 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages); int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int 
kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); +void *kho_alloc_preserve(size_t size); +void kho_unpreserve_free(void *mem); +void kho_restore_free(void *mem); struct folio *kho_restore_folio(phys_addr_t phys); struct page *kho_restore_pages(phys_addr_t phys, unsigned int nr_pages); void *kho_restore_vmalloc(const struct kho_vmalloc *preservation); @@ -101,6 +105,14 @@ static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) return -EOPNOTSUPP; } +static inline void *kho_alloc_preserve(size_t size) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void kho_unpreserve_free(void *mem) { } +static inline void kho_restore_free(void *mem) { } + static inline struct folio *kho_restore_folio(phys_addr_t phys) { return NULL; @@ -122,18 +134,14 @@ static inline int kho_add_subtree(const char *name, void *fdt) return -EOPNOTSUPP; } -static inline void kho_remove_subtree(void *fdt) -{ -} +static inline void kho_remove_subtree(void *fdt) { } static inline int kho_retrieve_subtree(const char *name, phys_addr_t *phys) { return -EOPNOTSUPP; } -static inline void kho_memory_init(void) -{ -} +static inline void kho_memory_init(void) { } static inline void kho_populate(phys_addr_t fdt_phys, u64 fdt_len, phys_addr_t scratch_phys, u64 scratch_len) -- cgit v1.2.3 From de51999e687c70a41997124b43291f84324c7924 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Nov 2025 14:00:01 -0500 Subject: kho: allow memory preservation state updates after finalization Currently, kho_preserve_* and kho_unpreserve_* return -EBUSY if KHO is finalized. This enforces a rigid "freeze" on the KHO memory state. With the introduction of re-entrant finalization, this restriction is no longer necessary. Users should be allowed to modify the preservation set (e.g., adding new pages or freeing old ones) even after an initial finalization. The intended workflow for updates is now: 1. 
Modify state (preserve/unpreserve). 2. Call kho_finalize() again to refresh the serialized metadata. Remove the kho_out.finalized checks to enable this dynamic behavior. This also allows to convert kho_unpreserve_* functions to void, as they do not return any error anymore. Link: https://lkml.kernel.org/r/20251114190002.3311679-13-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Arnd Bergmann Cc: Baoquan He Cc: Coiby Xu Cc: Dave Vasilevsky Cc: Eric Biggers Cc: Kees Cook Signed-off-by: Andrew Morton --- include/linux/kexec_handover.h | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h index dde952227b88..5f7b9de97e8d 100644 --- a/include/linux/kexec_handover.h +++ b/include/linux/kexec_handover.h @@ -44,11 +44,11 @@ bool kho_is_enabled(void); bool is_kho_boot(void); int kho_preserve_folio(struct folio *folio); -int kho_unpreserve_folio(struct folio *folio); +void kho_unpreserve_folio(struct folio *folio); int kho_preserve_pages(struct page *page, unsigned int nr_pages); -int kho_unpreserve_pages(struct page *page, unsigned int nr_pages); +void kho_unpreserve_pages(struct page *page, unsigned int nr_pages); int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation); -int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); +void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation); void *kho_alloc_preserve(size_t size); void kho_unpreserve_free(void *mem); void kho_restore_free(void *mem); @@ -79,20 +79,14 @@ static inline int kho_preserve_folio(struct folio *folio) return -EOPNOTSUPP; } -static inline int kho_unpreserve_folio(struct folio *folio) -{ - return -EOPNOTSUPP; -} +static inline void kho_unpreserve_folio(struct folio *folio) { } static inline int kho_preserve_pages(struct page *page, unsigned int nr_pages) { return 
-EOPNOTSUPP; } -static inline int kho_unpreserve_pages(struct page *page, unsigned int nr_pages) -{ - return -EOPNOTSUPP; -} +static inline void kho_unpreserve_pages(struct page *page, unsigned int nr_pages) { } static inline int kho_preserve_vmalloc(void *ptr, struct kho_vmalloc *preservation) @@ -100,10 +94,7 @@ static inline int kho_preserve_vmalloc(void *ptr, return -EOPNOTSUPP; } -static inline int kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) -{ - return -EOPNOTSUPP; -} +static inline void kho_unpreserve_vmalloc(struct kho_vmalloc *preservation) { } static inline void *kho_alloc_preserve(size_t size) { -- cgit v1.2.3 From 9e2fd062fa1713a33380cc97ef324d086dd45ba5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:31 -0500 Subject: liveupdate: luo_core: Live Update Orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Live Update Orchestrator", v8. This series introduces the Live Update Orchestrator, a kernel subsystem designed to facilitate live kernel updates using a kexec-based reboot. This capability is critical for cloud environments, allowing hypervisors to be updated with minimal downtime for running virtual machines. LUO achieves this by preserving the state of selected resources, such as memory, devices and their dependencies, across the kernel transition. As a key feature, this series includes support for preserving memfd file descriptors, which allows critical in-memory data, such as guest RAM or any other large memory region, to be maintained in RAM across the kexec reboot. The other series that use LUO, are VFIO [1], IOMMU [2], and PCI [3] preservations. Github repo of this series [4]. The core of LUO is a framework for managing the lifecycle of preserved resources through a userspace-driven interface. Key features include: - Session Management Userspace agent (i.e. 
luod [5]) creates named sessions, each represented by a file descriptor (via centralized agent that controls /dev/liveupdate). The lifecycle of all preserved resources within a session is tied to this FD, ensuring automatic kernel cleanup if the controlling userspace agent crashes or exits unexpectedly. - File Preservation A handler-based framework allows specific file types (demonstrated here with memfd) to be preserved. Handlers manage the serialization, restoration, and lifecycle of their specific file types. - File-Lifecycle-Bound State A new mechanism for managing shared global state whose lifecycle is tied to the preservation of one or more files. This is crucial for subsystems like IOMMU or HugeTLB, where multiple file descriptors may depend on a single, shared underlying resource that must be preserved only once. - KHO Integration LUO drives the Kexec Handover framework programmatically to pass its serialized metadata to the next kernel. The LUO state is finalized and added to the kexec image just before the reboot is triggered. In the future this step will also be removed once stateless KHO is merged [6]. - Userspace Interface Control is provided via ioctl commands on /dev/liveupdate for creating and retrieving sessions, as well as on session file descriptors for managing individual files. - Testing The series includes a set of selftests, including userspace API validation, kexec-based lifecycle tests for various session and file scenarios, and a new in-kernel test module to validate the FLB logic. Introduce LUO, a mechanism intended to facilitate kernel updates while keeping designated devices operational across the transition (e.g., via kexec). The primary use case is updating hypervisors with minimal disruption to running virtual machines. For userspace side of hypervisor update we have copyless migration. LUO is for updating the kernel. This initial patch lays the groundwork for the LUO subsystem. 
Further functionality, including the implementation of state transition logic, integration with KHO, and hooks for subsystems and file descriptors, will be added in subsequent patches. Create a character device at /dev/liveupdate. A new uAPI header, <linux/liveupdate.h>, will define the necessary structures. The magic number for IOCTL is registered in Documentation/userspace-api/ioctl/ioctl-number.rst. Link: https://lkml.kernel.org/r/20251125165850.3389713-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20251125165850.3389713-2-pasha.tatashin@soleen.com Link: https://lore.kernel.org/all/20251018000713.677779-1-vipinsh@google.com/ [1] Link: https://lore.kernel.org/linux-iommu/20250928190624.3735830-1-skhawaja@google.com [2] Link: https://lore.kernel.org/linux-pci/20250916-luo-pci-v2-0-c494053c3c08@kernel.org [3] Link: https://github.com/googleprodkernel/linux-liveupdate/tree/luo/v8 [4] Link: https://tinyurl.com/luoddesign [5] Link: https://lore.kernel.org/all/20251020100306.2709352-1-jasonmiu@google.com [6] Link: https://lore.kernel.org/all/20251115233409.768044-1-pasha.tatashin@soleen.com [7] Link: https://github.com/soleen/linux/blob/luo/v8b03/diff.v7.v8 [8] Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H.
Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zijun Hu Cc: Pratyush Yadav Cc: Zhu Yanjun Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 35 +++++++++++++++++++++++++++++++ include/uapi/linux/liveupdate.h | 46 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 include/linux/liveupdate.h create mode 100644 include/uapi/linux/liveupdate.h (limited to 'include') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h new file mode 100644 index 000000000000..c6a1d6bd90cb --- /dev/null +++ b/include/linux/liveupdate.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. 
+ * Pasha Tatashin + */ +#ifndef _LINUX_LIVEUPDATE_H +#define _LINUX_LIVEUPDATE_H + +#include +#include +#include + +#ifdef CONFIG_LIVEUPDATE + +/* Return true if live update orchestrator is enabled */ +bool liveupdate_enabled(void); + +/* Called during kexec to tell LUO that entered into reboot */ +int liveupdate_reboot(void); + +#else /* CONFIG_LIVEUPDATE */ + +static inline bool liveupdate_enabled(void) +{ + return false; +} + +static inline int liveupdate_reboot(void) +{ + return 0; +} + +#endif /* CONFIG_LIVEUPDATE */ +#endif /* _LINUX_LIVEUPDATE_H */ diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h new file mode 100644 index 000000000000..df34c1642c4d --- /dev/null +++ b/include/uapi/linux/liveupdate.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +/* + * Userspace interface for /dev/liveupdate + * Live Update Orchestrator + * + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + */ + +#ifndef _UAPI_LIVEUPDATE_H +#define _UAPI_LIVEUPDATE_H + +#include +#include + +/** + * DOC: General ioctl format + * + * The ioctl interface follows a general format to allow for extensibility. Each + * ioctl is passed in a structure pointer as the argument providing the size of + * the structure in the first u32. The kernel checks that any structure space + * beyond what it understands is 0. This allows userspace to use the backward + * compatible portion while consistently using the newer, larger, structures. + * + * ioctls use a standard meaning for common errnos: + * + * - ENOTTY: The IOCTL number itself is not supported at all + * - E2BIG: The IOCTL number is supported, but the provided structure has + * non-zero in a part the kernel does not understand. + * - EOPNOTSUPP: The IOCTL number is supported, and the structure is + * understood, however a known field has a value the kernel does not + * understand or support. 
+ * - EINVAL: Everything about the IOCTL was understood, but a field is not + * correct. + * - ENOENT: A provided token does not exist. + * - ENOMEM: Out of memory. + * - EOVERFLOW: Mathematics overflowed. + * + * As well as additional errnos, within specific ioctls. + */ + +/* The ioctl type, documented in ioctl-number.rst */ +#define LIVEUPDATE_IOCTL_TYPE 0xBA + +#endif /* _UAPI_LIVEUPDATE_H */ -- cgit v1.2.3 From 1aece821004f67f46ef4db7199bbeca87cf22bdd Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:32 -0500 Subject: liveupdate: luo_core: integrate with KHO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integrate the LUO with the KHO framework to enable passing LUO state across a kexec reboot. This patch implements the lifecycle integration with KHO: 1. Incoming State: During early boot (`early_initcall`), LUO checks if KHO is active. If so, it retrieves the "LUO" subtree, verifies the "luo-v1" compatibility string, and reads the `liveupdate-number` to track the update count. 2. Outgoing State: During late initialization (`late_initcall`), LUO allocates a new FDT for the next kernel, populates it with the basic header (compatible string and incremented update number), and registers it with KHO (`kho_add_subtree`). 3. Finalization: The `liveupdate_reboot()` notifier is updated to invoke `kho_finalize()`. This ensures that all memory segments marked for preservation are properly serialized before the kexec jump. 
Link: https://lkml.kernel.org/r/20251125165850.3389713-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Tested-by: David Matlack Reviewed-by: Mike Rapoport (Microsoft) Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 58 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 include/linux/kho/abi/luo.h (limited to 'include') diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h new file mode 100644 index 000000000000..2099b51929e5 --- /dev/null +++ b/include/linux/kho/abi/luo.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. 
+ * Pasha Tatashin + */ + +/** + * DOC: Live Update Orchestrator ABI + * + * This header defines the stable Application Binary Interface used by the + * Live Update Orchestrator to pass state from a pre-update kernel to a + * post-update kernel. The ABI is built upon the Kexec HandOver framework + * and uses a Flattened Device Tree to describe the preserved data. + * + * This interface is a contract. Any modification to the FDT structure, node + * properties, compatible strings, or the layout of the `__packed` serialization + * structures defined here constitutes a breaking change. Such changes require + * incrementing the version number in the relevant `_COMPATIBLE` string to + * prevent a new kernel from misinterpreting data from an old kernel. + * + * Changes are allowed provided the compatibility version is incremented; + * however, backward/forward compatibility is only guaranteed for kernels + * supporting the same ABI version. + * + * FDT Structure Overview: + * The entire LUO state is encapsulated within a single KHO entry named "LUO". + * This entry contains an FDT with the following layout: + * + * .. code-block:: none + * + * / { + * compatible = "luo-v1"; + * liveupdate-number = <...>; + * }; + * + * Main LUO Node (/): + * + * - compatible: "luo-v1" + * Identifies the overall LUO ABI version. + * - liveupdate-number: u64 + * A counter tracking the number of successful live updates performed. + */ + +#ifndef _LINUX_KHO_ABI_LUO_H +#define _LINUX_KHO_ABI_LUO_H + +/* + * The LUO FDT hooks all LUO state for sessions, fds, etc. + * In the root it also carries "liveupdate-number" 64-bit property that + * corresponds to the number of live-updates performed on this machine. 
+ */ +#define LUO_FDT_SIZE PAGE_SIZE +#define LUO_FDT_KHO_ENTRY_NAME "LUO" +#define LUO_FDT_COMPATIBLE "luo-v1" +#define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number" + +#endif /* _LINUX_KHO_ABI_LUO_H */ -- cgit v1.2.3 From 0153094d03df5a2e834a19c59b255649a258ae46 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:34 -0500 Subject: liveupdate: luo_session: add sessions support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce concept of "Live Update Sessions" within the LUO framework. LUO sessions provide a mechanism to group and manage `struct file *` instances (representing file descriptors) that need to be preserved across a kexec-based live update. Each session is identified by a unique name and acts as a container for file objects whose state is critical to a userspace workload, such as a virtual machine or a high-performance database, aiming to maintain their functionality across a kernel transition. This groundwork establishes the framework for preserving file-backed state across kernel updates, with the actual file data preservation mechanisms to be implemented in subsequent patches. 
[dan.carpenter@linaro.org: fix use after free in luo_session_deserialize()] Link: https://lkml.kernel.org/r/c5dd637d7eed3a3be48c5e9fedb881596a3b1f5a.1764163896.git.dan.carpenter@linaro.org Link: https://lkml.kernel.org/r/20251125165850.3389713-5-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Signed-off-by: Dan Carpenter Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 71 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/liveupdate.h | 3 ++ 2 files changed, 74 insertions(+) (limited to 'include') diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index 2099b51929e5..bf1ab2910959 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -32,6 +32,11 @@ * / { * compatible = "luo-v1"; * liveupdate-number = <...>; + * + * luo-session { + * compatible 
= "luo-session-v1"; + * luo-session-header = <phys_addr_of_session_header_ser>; + * }; * }; * * Main LUO Node (/): @@ -40,11 +45,37 @@ * Identifies the overall LUO ABI version. * - liveupdate-number: u64 * A counter tracking the number of successful live updates performed. + * + * Session Node (luo-session): + * This node describes all preserved user-space sessions. + * + * - compatible: "luo-session-v1" + * Identifies the session ABI version. + * - luo-session-header: u64 + * The physical address of a `struct luo_session_header_ser`. This structure + * is the header for a contiguous block of memory containing an array of + * `struct luo_session_ser`, one for each preserved session. + * + * Serialization Structures: + * The FDT properties point to memory regions containing arrays of simple, + * `__packed` structures. These structures contain the actual preserved state. + * + * - struct luo_session_header_ser: + * Header for the session array. Contains the total page count of the + * preserved memory block and the number of `struct luo_session_ser` + * entries that follow. + * + * - struct luo_session_ser: + * Metadata for a single session, including its name and a physical pointer + * to another preserved memory block containing an array of + * `struct luo_file_ser` for all files in that session. */ #ifndef _LINUX_KHO_ABI_LUO_H #define _LINUX_KHO_ABI_LUO_H +#include + /* * The LUO FDT hooks all LUO state for sessions, fds, etc. * In the root it also carries "liveupdate-number" 64-bit property that @@ -55,4 +86,44 @@ #define LUO_FDT_COMPATIBLE "luo-v1" #define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number" +/* + * LUO FDT session node + * LUO_FDT_SESSION_HEADER: is a u64 physical address of struct + * luo_session_header_ser + */ +#define LUO_FDT_SESSION_NODE_NAME "luo-session" +#define LUO_FDT_SESSION_COMPATIBLE "luo-session-v1" +#define LUO_FDT_SESSION_HEADER "luo-session-header" + +/** + * struct luo_session_header_ser - Header for the serialized session data block.
+ * @count: The number of `struct luo_session_ser` entries that immediately + * follow this header in the memory block. + * + * This structure is located at the beginning of a contiguous block of + * physical memory preserved across the kexec. It provides the necessary + * metadata to interpret the array of session entries that follow. + * + * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated. + */ +struct luo_session_header_ser { + u64 count; +} __packed; + +/** + * struct luo_session_ser - Represents the serialized metadata for a LUO session. + * @name: The unique name of the session, provided by the userspace at + * the time of session creation. + * + * This structure is used to package session-specific metadata for transfer + * between kernels via Kexec Handover. An array of these structures (one per + * session) is created and passed to the new kernel, allowing it to reconstruct + * the session context. + * + * If this structure is modified, `LUO_FDT_SESSION_COMPATIBLE` must be updated. 
+ */ +struct luo_session_ser { + char name[LIVEUPDATE_SESSION_NAME_LENGTH]; +} __packed; + #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h index df34c1642c4d..40578ae19668 100644 --- a/include/uapi/linux/liveupdate.h +++ b/include/uapi/linux/liveupdate.h @@ -43,4 +43,7 @@ /* The ioctl type, documented in ioctl-number.rst */ #define LIVEUPDATE_IOCTL_TYPE 0xBA +/* The maximum length of session name including null termination */ +#define LIVEUPDATE_SESSION_NAME_LENGTH 64 + #endif /* _UAPI_LIVEUPDATE_H */ -- cgit v1.2.3 From 81cd25d263a182b3dcdc8af3b92e4b8e4db336de Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:35 -0500 Subject: liveupdate: luo_core: add user interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce the user-space interface for the Live Update Orchestrator via ioctl commands, enabling external control over the live update process and management of preserved resources. The idea is that there is going to be a single userspace agent driving the live update, therefore, only a single process can ever hold this device opened at a time. The following ioctl commands are introduced: LIVEUPDATE_IOCTL_CREATE_SESSION Provides a way for userspace to create a named session for grouping file descriptors that need to be preserved. It returns a new file descriptor representing the session. LIVEUPDATE_IOCTL_RETRIEVE_SESSION Allows the userspace agent in the new kernel to reclaim a preserved session by its name, receiving a new file descriptor to manage the restored resources. 
Link: https://lkml.kernel.org/r/20251125165850.3389713-6-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/uapi/linux/liveupdate.h | 64 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h index 40578ae19668..1183cf984b5f 100644 --- a/include/uapi/linux/liveupdate.h +++ b/include/uapi/linux/liveupdate.h @@ -46,4 +46,68 @@ /* The maximum length of session name including null termination */ #define LIVEUPDATE_SESSION_NAME_LENGTH 64 +/* The /dev/liveupdate ioctl commands */ +enum { + LIVEUPDATE_CMD_BASE = 0x00, + LIVEUPDATE_CMD_CREATE_SESSION = LIVEUPDATE_CMD_BASE, + LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01, +}; + +/** + * struct 
liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION) + * @size: Input; sizeof(struct liveupdate_ioctl_create_session) + * @fd: Output; The new file descriptor for the created session. + * @name: Input; A null-terminated string for the session name, max + * length %LIVEUPDATE_SESSION_NAME_LENGTH including termination + * character. + * + * Creates a new live update session for managing preserved resources. + * This ioctl can only be called on the main /dev/liveupdate device. + * + * Return: 0 on success, negative error code on failure. + */ +struct liveupdate_ioctl_create_session { + __u32 size; + __s32 fd; + __u8 name[LIVEUPDATE_SESSION_NAME_LENGTH]; +}; + +#define LIVEUPDATE_IOCTL_CREATE_SESSION \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_CREATE_SESSION) + +/** + * struct liveupdate_ioctl_retrieve_session - ioctl(LIVEUPDATE_IOCTL_RETRIEVE_SESSION) + * @size: Input; sizeof(struct liveupdate_ioctl_retrieve_session) + * @fd: Output; The new file descriptor for the retrieved session. + * @name: Input; A null-terminated string identifying the session to retrieve. + * The name must exactly match the name used when the session was + * created in the previous kernel. + * + * Retrieves a handle (a new file descriptor) for a preserved session by its + * name. This is the primary mechanism for a userspace agent to regain control + * of its preserved resources after a live update. + * + * The userspace application provides the null-terminated `name` of a session + * it created before the live update. If a preserved session with a matching + * name is found, the kernel instantiates it and returns a new file descriptor + * in the `fd` field. This new session FD can then be used for all file-specific + * operations, such as restoring individual file descriptors with + * LIVEUPDATE_SESSION_RETRIEVE_FD. + * + * It is the responsibility of the userspace application to know the names of + * the sessions it needs to retrieve. 
If no session with the given name is + * found, the ioctl will fail with -ENOENT. + * + * This ioctl can only be called on the main /dev/liveupdate device when the + * system is in the LIVEUPDATE_STATE_UPDATED state. + */ +struct liveupdate_ioctl_retrieve_session { + __u32 size; + __s32 fd; + __u8 name[LIVEUPDATE_SESSION_NAME_LENGTH]; +}; + +#define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION) + #endif /* _UAPI_LIVEUPDATE_H */ -- cgit v1.2.3 From 7c722a7f44e0c1f9714084152226bc7bd644b7e3 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:36 -0500 Subject: liveupdate: luo_file: implement file systems callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch implements the core mechanism for managing preserved files throughout the live update lifecycle. It provides the logic to invoke the file handler callbacks (preserve, unpreserve, freeze, unfreeze, retrieve, and finish) at the appropriate stages. During the reboot phase, luo_file_freeze() serializes the final metadata for each file (handler compatible string, token, and data handle) into a memory region preserved by KHO. In the new kernel, luo_file_deserialize() reconstructs the in-memory file list from this data, preparing the session for retrieval. Link: https://lkml.kernel.org/r/20251125165850.3389713-7-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/luo.h | 39 +++++++++++++++++- include/linux/liveupdate.h | 98 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kho/abi/luo.h b/include/linux/kho/abi/luo.h index bf1ab2910959..bb099c92e469 100644 --- a/include/linux/kho/abi/luo.h +++ b/include/linux/kho/abi/luo.h @@ -69,6 +69,11 @@ * Metadata for a single session, including its name and a physical pointer * to another preserved memory block containing an array of * `struct luo_file_ser` for all files in that session. + * + * - struct luo_file_ser: + * Metadata for a single preserved file. Contains the `compatible` string to + * find the correct handler in the new kernel, a user-provided `token` for + * identification, and an opaque `data` handle for the handler to use. */ #ifndef _LINUX_KHO_ABI_LUO_H @@ -86,13 +91,43 @@ #define LUO_FDT_COMPATIBLE "luo-v1" #define LUO_FDT_LIVEUPDATE_NUM "liveupdate-number" +#define LIVEUPDATE_HNDL_COMPAT_LENGTH 48 + +/** + * struct luo_file_ser - Represents the serialized preserves files. + * @compatible: File handler compatible string. 
+ * @data: Private data + * @token: User provided token for this file + * + * If this structure is modified, LUO_FDT_SESSION_COMPATIBLE must be updated. + */ +struct luo_file_ser { + char compatible[LIVEUPDATE_HNDL_COMPAT_LENGTH]; + u64 data; + u64 token; +} __packed; + +/** + * struct luo_file_set_ser - Represents the serialized metadata for a file set + * @files: The physical address of a contiguous memory block that holds + * the serialized state of files (array of luo_file_ser) in this file + * set. + * @count: The total number of files that were part of this session during + * serialization. Used for iteration and validation during + * restoration. + */ +struct luo_file_set_ser { + u64 files; + u64 count; +} __packed; + /* * LUO FDT session node * LUO_FDT_SESSION_HEADER: is a u64 physical address of struct * luo_session_header_ser */ #define LUO_FDT_SESSION_NODE_NAME "luo-session" -#define LUO_FDT_SESSION_COMPATIBLE "luo-session-v1" +#define LUO_FDT_SESSION_COMPATIBLE "luo-session-v2" #define LUO_FDT_SESSION_HEADER "luo-session-header" /** @@ -114,6 +149,7 @@ struct luo_session_header_ser { * struct luo_session_ser - Represents the serialized metadata for a LUO session. * @name: The unique name of the session, provided by the userspace at * the time of session creation. + * @file_set_ser: Serialized files belonging to this session. * * This structure is used to package session-specific metadata for transfer * between kernels via Kexec Handover.
An array of these structures (one per @@ -124,6 +160,7 @@ struct luo_session_header_ser { */ struct luo_session_ser { char name[LIVEUPDATE_SESSION_NAME_LENGTH]; + struct luo_file_set_ser file_set_ser; } __packed; #endif /* _LINUX_KHO_ABI_LUO_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index c6a1d6bd90cb..122ad8f16ff9 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -8,8 +8,93 @@ #define _LINUX_LIVEUPDATE_H #include +#include +#include #include #include +#include + +struct liveupdate_file_handler; +struct file; + +/** + * struct liveupdate_file_op_args - Arguments for file operation callbacks. + * @handler: The file handler being called. + * @retrieved: The retrieve status for the 'can_finish / finish' + * operation. + * @file: The file object. For retrieve: [OUT] The callback sets + * this to the new file. For other ops: [IN] The caller sets + * this to the file being operated on. + * @serialized_data: The opaque u64 handle, preserve/prepare/freeze may update + * this field. + * + * This structure bundles all parameters for the file operation callbacks. + * The 'data' and 'file' fields are used for both input and output. + */ +struct liveupdate_file_op_args { + struct liveupdate_file_handler *handler; + bool retrieved; + struct file *file; + u64 serialized_data; +}; + +/** + * struct liveupdate_file_ops - Callbacks for live-updatable files. + * @can_preserve: Required. Lightweight check to see if this handler is + * compatible with the given file. + * @preserve: Required. Performs state-saving for the file. + * @unpreserve: Required. Cleans up any resources allocated by @preserve. + * @freeze: Optional. Final actions just before kernel transition. + * @unfreeze: Optional. Undo freeze operations. + * @retrieve: Required. Restores the file in the new kernel. + * @can_finish: Optional. Check if this FD can finish, i.e. all restoration + * pre-requirements for this FD are satisfied. 
Called prior to + * finish, in order to do successful finish calls for all + * resources in the session. + * @finish: Required. Final cleanup in the new kernel. + * @owner: Module reference + * + * All operations (except can_preserve) receive a pointer to a + * 'struct liveupdate_file_op_args' containing the necessary context. + */ +struct liveupdate_file_ops { + bool (*can_preserve)(struct liveupdate_file_handler *handler, + struct file *file); + int (*preserve)(struct liveupdate_file_op_args *args); + void (*unpreserve)(struct liveupdate_file_op_args *args); + int (*freeze)(struct liveupdate_file_op_args *args); + void (*unfreeze)(struct liveupdate_file_op_args *args); + int (*retrieve)(struct liveupdate_file_op_args *args); + bool (*can_finish)(struct liveupdate_file_op_args *args); + void (*finish)(struct liveupdate_file_op_args *args); + struct module *owner; +}; + +/** + * struct liveupdate_file_handler - Represents a handler for a live-updatable file type. + * @ops: Callback functions + * @compatible: The compatibility string (e.g., "memfd-v1", "vfiofd-v1") + * that uniquely identifies the file type this handler + * supports. This is matched against the compatible string + * associated with individual &struct file instances. + * + * Modules that want to support live update for specific file types should + * register an instance of this structure. LUO uses this registration to + * determine if a given file can be preserved and to find the appropriate + * operations to manage its state across the update. + */ +struct liveupdate_file_handler { + const struct liveupdate_file_ops *ops; + const char compatible[LIVEUPDATE_HNDL_COMPAT_LENGTH]; + + /* private: */ + + /* + * Used for linking this handler instance into a global list of + * registered file handlers. 
+ */ + struct list_head __private list; +}; #ifdef CONFIG_LIVEUPDATE @@ -19,6 +104,9 @@ bool liveupdate_enabled(void); /* Called during kexec to tell LUO that entered into reboot */ int liveupdate_reboot(void); +int liveupdate_register_file_handler(struct liveupdate_file_handler *fh); +int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh); + #else /* CONFIG_LIVEUPDATE */ static inline bool liveupdate_enabled(void) @@ -31,5 +119,15 @@ static inline int liveupdate_reboot(void) return 0; } +static inline int liveupdate_register_file_handler(struct liveupdate_file_handler *fh) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LIVEUPDATE */ #endif /* _LINUX_LIVEUPDATE_H */ -- cgit v1.2.3 From 16cec0d265219f14a7fcebcc43aeb69205adba56 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Tue, 25 Nov 2025 11:58:37 -0500 Subject: liveupdate: luo_session: add ioctls for file preservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introducing the userspace interface and internal logic required to manage the lifecycle of file descriptors within a session. Previously, a session was merely a container; this change makes it a functional management unit. The following capabilities are added: A new set of ioctl commands are added, which operate on the file descriptor returned by CREATE_SESSION. This allows userspace to: - LIVEUPDATE_SESSION_PRESERVE_FD: Add a file descriptor to a session to be preserved across the live update. - LIVEUPDATE_SESSION_RETRIEVE_FD: Retrieve a preserved file in the new kernel using its unique token. - LIVEUPDATE_SESSION_FINISH: finish session The session's .release handler is enhanced to be state-aware. When a session's file descriptor is closed, it correctly unpreserves the session based on its current state before freeing all associated file resources. 
Link: https://lkml.kernel.org/r/20251125165850.3389713-8-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Reviewed-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/uapi/linux/liveupdate.h | 103 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/liveupdate.h b/include/uapi/linux/liveupdate.h index 1183cf984b5f..30bc66ee9436 100644 --- a/include/uapi/linux/liveupdate.h +++ b/include/uapi/linux/liveupdate.h @@ -53,6 +53,14 @@ enum { LIVEUPDATE_CMD_RETRIEVE_SESSION = 0x01, }; +/* ioctl commands for session file descriptors */ +enum { + LIVEUPDATE_CMD_SESSION_BASE = 0x40, + LIVEUPDATE_CMD_SESSION_PRESERVE_FD = LIVEUPDATE_CMD_SESSION_BASE, + LIVEUPDATE_CMD_SESSION_RETRIEVE_FD = 0x41, + LIVEUPDATE_CMD_SESSION_FINISH = 0x42, +}; + /** * struct 
liveupdate_ioctl_create_session - ioctl(LIVEUPDATE_IOCTL_CREATE_SESSION) * @size: Input; sizeof(struct liveupdate_ioctl_create_session) @@ -110,4 +118,99 @@ struct liveupdate_ioctl_retrieve_session { #define LIVEUPDATE_IOCTL_RETRIEVE_SESSION \ _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_RETRIEVE_SESSION) +/* Session specific IOCTLs */ + +/** + * struct liveupdate_session_preserve_fd - ioctl(LIVEUPDATE_SESSION_PRESERVE_FD) + * @size: Input; sizeof(struct liveupdate_session_preserve_fd) + * @fd: Input; The user-space file descriptor to be preserved. + * @token: Input; An opaque, unique token for preserved resource. + * + * Holds parameters for preserving a file descriptor. + * + * User sets the @fd field identifying the file descriptor to preserve + * (e.g., memfd, kvm, iommufd, VFIO). The kernel validates if this FD type + * and its dependencies are supported for preservation. If validation passes, + * the kernel marks the FD internally and *initiates the process* of preparing + * its state for saving. The actual snapshotting of the state typically occurs + * during the subsequent %LIVEUPDATE_IOCTL_PREPARE execution phase, though + * some finalization might occur during freeze. + * On successful validation and initiation, the kernel uses the @token + * field with an opaque identifier representing the resource being preserved. + * This token confirms the FD is targeted for preservation and is required for + * the subsequent %LIVEUPDATE_SESSION_RETRIEVE_FD call after the live update. + * + * Return: 0 on success (validation passed, preservation initiated), negative + * error code on failure (e.g., unsupported FD type, dependency issue, + * validation failed). 
+ */ +struct liveupdate_session_preserve_fd { + __u32 size; + __s32 fd; + __aligned_u64 token; +}; + +#define LIVEUPDATE_SESSION_PRESERVE_FD \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_PRESERVE_FD) + +/** + * struct liveupdate_session_retrieve_fd - ioctl(LIVEUPDATE_SESSION_RETRIEVE_FD) + * @size: Input; sizeof(struct liveupdate_session_retrieve_fd) + * @fd: Output; The new file descriptor representing the fully restored + * kernel resource. + * @token: Input; The opaque token that was used to preserve the resource. + * + * Retrieve a previously preserved file descriptor. + * + * User sets the @token field to the value passed to a successful + * %LIVEUPDATE_SESSION_PRESERVE_FD call before the live update. On success, + * the kernel restores the state (saved during the PREPARE/FREEZE phases) + * associated with the token and populates the @fd field with a new file + * descriptor referencing the restored resource in the current (new) kernel. + * This operation must be performed *before* signaling completion via + * %LIVEUPDATE_SESSION_FINISH. + * + * Return: 0 on success, negative error code on failure (e.g., invalid token). + */ +struct liveupdate_session_retrieve_fd { + __u32 size; + __s32 fd; + __aligned_u64 token; +}; + +#define LIVEUPDATE_SESSION_RETRIEVE_FD \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_RETRIEVE_FD) + +/** + * struct liveupdate_session_finish - ioctl(LIVEUPDATE_SESSION_FINISH) + * @size: Input; sizeof(struct liveupdate_session_finish) + * @reserved: Input; Must be zero. Reserved for future use. + * + * Signals the completion of the restoration process for a retrieved session. + * This is the final operation that should be performed on a session file + * descriptor after a live update. + * + * This ioctl must be called once all required file descriptors for the session + * have been successfully retrieved (using %LIVEUPDATE_SESSION_RETRIEVE_FD) and + * are fully restored from the userspace and kernel perspective.
+ * + * Upon success, the kernel releases its ownership of the preserved resources + * associated with this session. This allows internal resources to be freed, + * typically by decrementing reference counts on the underlying preserved + * objects. + * + * If this operation fails, the resources remain preserved in memory. Userspace + * may attempt to call finish again. The resources will otherwise be reset + * during the next live update cycle. + * + * Return: 0 on success, negative error code on failure. + */ +struct liveupdate_session_finish { + __u32 size; + __u32 reserved; +}; + +#define LIVEUPDATE_SESSION_FINISH \ + _IO(LIVEUPDATE_IOCTL_TYPE, LIVEUPDATE_CMD_SESSION_FINISH) + #endif /* _UAPI_LIVEUPDATE_H */ -- cgit v1.2.3 From 6ff1610ced5689c9af4c28a1798e04b74128a703 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:40 -0500 Subject: mm: shmem: use SHMEM_F_* flags instead of VM_* flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit shmem_inode_info::flags can have the VM flags VM_NORESERVE and VM_LOCKED. These are used to suppress pre-accounting or to lock the pages in the inode respectively. Using the VM flags directly makes it difficult to add shmem-specific flags that are unrelated to VM behavior since one would need to find a VM flag not used by shmem and re-purpose it. Introduce SHMEM_F_NORESERVE and SHMEM_F_LOCKED which represent the same information, but their bits are independent of the VM flags. Callers can still pass VM_NORESERVE to shmem_get_inode(), but it gets transformed to the shmem-specific flag internally. No functional changes intended. 
Link: https://lkml.kernel.org/r/20251125165850.3389713-11-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0e47465ef0fd..650874b400b5 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -10,6 +10,7 @@ #include #include #include +#include struct swap_iocb; @@ -19,6 +20,11 @@ struct swap_iocb; #define SHMEM_MAXQUOTAS 2 #endif +/* Suppress pre-accounting of the entire object size. */ +#define SHMEM_F_NORESERVE BIT(0) +/* Disallow swapping. 
*/ +#define SHMEM_F_LOCKED BIT(1) + struct shmem_inode_info { spinlock_t lock; unsigned int seals; /* shmem seals */ -- cgit v1.2.3 From e165e2a2577b048664be09c074a10304290055f0 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:41 -0500 Subject: mm: shmem: allow freezing inode mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To prepare a shmem inode for live update, its index -> folio mappings must be serialized. Once the mappings are serialized, they cannot change since it would cause the serialized data to become inconsistent. This can be done by pinning the folios to avoid migration, and by making sure no folios can be added to or removed from the inode. While mechanisms to pin folios already exist, the only way to stop folios being added or removed are the grow and shrink file seals. But file seals come with their own semantics, one of which is that they can't be removed. This doesn't work with liveupdate since it can be cancelled or error out, which would need the seals to be removed and the file's normal functionality to be restored. Introduce SHMEM_F_MAPPING_FROZEN to indicate this instead. It is internal to shmem and is not directly exposed to userspace. It functions similar to F_SEAL_GROW | F_SEAL_SHRINK, but additionally disallows hole punching, and can be removed. Link: https://lkml.kernel.org/r/20251125165850.3389713-12-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 650874b400b5..d34a64eafe60 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -24,6 +24,14 @@ struct swap_iocb; #define SHMEM_F_NORESERVE BIT(0) /* Disallow swapping. */ #define SHMEM_F_LOCKED BIT(1) +/* + * Disallow growing, shrinking, or hole punching in the inode. Combined with + * folio pinning, makes sure the inode's mapping stays fixed. + * + * In some ways similar to F_SEAL_GROW | F_SEAL_SHRINK, but can be removed and + * isn't directly visible to userspace. + */ +#define SHMEM_F_MAPPING_FROZEN BIT(2) struct shmem_inode_info { spinlock_t lock; @@ -186,6 +194,15 @@ static inline bool shmem_file(struct file *file) return shmem_mapping(file->f_mapping); } +/* Must be called with inode lock taken exclusive. 
*/ +static inline void shmem_freeze(struct inode *inode, bool freeze) +{ + if (freeze) + SHMEM_I(inode)->flags |= SHMEM_F_MAPPING_FROZEN; + else + SHMEM_I(inode)->flags &= ~SHMEM_F_MAPPING_FROZEN; +} + /* * If fallocate(FALLOC_FL_KEEP_SIZE) has been used, there may be pages * beyond i_size's notion of EOF, which fallocate has committed to reserving: -- cgit v1.2.3 From 8def18633e8df54a05cf7d323d0df24c21b320d6 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:43 -0500 Subject: liveupdate: luo_file: add private argument to store runtime state MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently file handlers only get the serialized_data field to store their state. This field has a pointer to the serialized state of the file, and it becomes a part of the LUO file's serialized state. File handlers may also need some runtime state to track information that shouldn't make it into the serialized data. One such example is a vmalloc pointer. While kho_preserve_vmalloc() preserves the memory backing a vmalloc allocation, it does not store the original vmap pointer, since that has no use being passed to the next kernel. The pointer is needed to free the memory in case the file is unpreserved. Provide a private field in struct luo_file and pass it to all the callbacks. The field can be set by preserve, and must be freed by unpreserve.
Link: https://lkml.kernel.org/r/20251125165850.3389713-14-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/liveupdate.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 122ad8f16ff9..a7f6ee5b6771 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -27,6 +27,10 @@ struct file; * this to the file being operated on. * @serialized_data: The opaque u64 handle, preserve/prepare/freeze may update * this field. + * @private_data: Private data for the file used to hold runtime state that + * is not preserved. Set by the handler's .preserve() + * callback, and must be freed in the handler's + * .unpreserve() callback. 
* * This structure bundles all parameters for the file operation callbacks. * The 'data' and 'file' fields are used for both input and output. @@ -36,6 +40,7 @@ struct liveupdate_file_op_args { bool retrieved; struct file *file; u64 serialized_data; + void *private_data; }; /** -- cgit v1.2.3 From b3749f174d686627f702234e64bad976dc432dbc Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 25 Nov 2025 11:58:44 -0500 Subject: mm: memfd_luo: allow preserving memfd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ability to preserve a memfd allows userspace to use KHO and LUO to transfer its memory contents to the next kernel. This is useful in many ways. For one, it can be used with IOMMUFD as the backing store for IOMMU page tables. Preserving IOMMUFD is essential for performing a hypervisor live update with passthrough devices. memfd support provides the first building block for making that possible. For another, for applications with a large amount of memory that takes time to reconstruct, reboots to consume kernel upgrades can be very expensive. memfd with LUO gives those applications reboot-persistent memory that they can use to quickly save and reconstruct that state. While memfd is backed by either hugetlbfs or shmem, currently only support for shmem is added. To be more precise, support for anonymous shmem files is added. The handover to the next kernel is not transparent. Not all properties of the file are preserved; only its memory contents, position, and size are. The recreated file gets the UID and GID of the task doing the restore, and the task's cgroup gets charged with the memory. Once preserved, the file cannot grow or shrink, and all its pages are pinned to avoid migrations and swapping. The file can still be read from or written to. Use vmalloc to get the buffer to hold the folios, and preserve it using kho_preserve_vmalloc(). This doesn't have the size limit.
Link: https://lkml.kernel.org/r/20251125165850.3389713-15-pasha.tatashin@soleen.com Signed-off-by: Pratyush Yadav Co-developed-by: Pasha Tatashin Signed-off-by: Pasha Tatashin Reviewed-by: Mike Rapoport (Microsoft) Tested-by: David Matlack Cc: Aleksander Lobakin Cc: Alexander Graf Cc: Alice Ryhl Cc: Andriy Shevchenko Cc: anish kumar Cc: Anna Schumaker Cc: Bartosz Golaszewski Cc: Bjorn Helgaas Cc: Borislav Betkov Cc: Chanwoo Choi Cc: Chen Ridong Cc: Chris Li Cc: Christian Brauner Cc: Daniel Wagner Cc: Danilo Krummrich Cc: Dan Williams Cc: David Hildenbrand Cc: David Jeffery Cc: David Rientjes Cc: Greg Kroah-Hartman Cc: Guixin Liu Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ilpo Järvinen Cc: Ingo Molnar Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Joanthan Cameron Cc: Joel Granados Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lennart Poettering Cc: Leon Romanovsky Cc: Leon Romanovsky Cc: Lukas Wunner Cc: Marc Rutland Cc: Masahiro Yamada Cc: Matthew Maurer Cc: Miguel Ojeda Cc: Myugnjoo Ham Cc: Parav Pandit Cc: Pratyush Yadav Cc: Randy Dunlap Cc: Roman Gushchin Cc: Saeed Mahameed Cc: Samiullah Khawaja Cc: Song Liu Cc: Steven Rostedt Cc: Stuart Hayes Cc: Tejun Heo Cc: Thomas Gleinxer Cc: Thomas Weißschuh Cc: Vincent Guittot Cc: William Tu Cc: Yoann Congal Cc: Zhu Yanjun Cc: Zijun Hu Signed-off-by: Andrew Morton --- include/linux/kho/abi/memfd.h | 77 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/linux/kho/abi/memfd.h (limited to 'include') diff --git a/include/linux/kho/abi/memfd.h b/include/linux/kho/abi/memfd.h new file mode 100644 index 000000000000..da7d063474a1 --- /dev/null +++ b/include/linux/kho/abi/memfd.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2025, Google LLC. + * Pasha Tatashin + * + * Copyright (C) 2025 Amazon.com Inc. or its affiliates. 
+ * Pratyush Yadav + */ + +#ifndef _LINUX_KHO_ABI_MEMFD_H +#define _LINUX_KHO_ABI_MEMFD_H + +#include +#include + +/** + * DOC: memfd Live Update ABI + * + * This header defines the ABI for preserving the state of a memfd across a + * kexec reboot using the LUO. + * + * The state is serialized into a packed structure `struct memfd_luo_ser` + * which is handed over to the next kernel via the KHO mechanism. + * + * This interface is a contract. Any modification to the structure layout + * constitutes a breaking change. Such changes require incrementing the + * version number in the MEMFD_LUO_FH_COMPATIBLE string. + */ + +/** + * MEMFD_LUO_FOLIO_DIRTY - The folio is dirty. + * + * This flag indicates the folio contains data from user. A non-dirty folio is + * one that was allocated (say using fallocate(2)) but not written to. + */ +#define MEMFD_LUO_FOLIO_DIRTY BIT(0) + +/** + * MEMFD_LUO_FOLIO_UPTODATE - The folio is up-to-date. + * + * An up-to-date folio has been zeroed out. shmem zeroes out folios on first + * use. This flag tracks which folios need zeroing. + */ +#define MEMFD_LUO_FOLIO_UPTODATE BIT(1) + +/** + * struct memfd_luo_folio_ser - Serialized state of a single folio. + * @pfn: The page frame number of the folio. + * @flags: Flags to describe the state of the folio. + * @index: The page offset (pgoff_t) of the folio within the original file. + */ +struct memfd_luo_folio_ser { + u64 pfn:52; + u64 flags:12; + u64 index; +} __packed; + +/** + * struct memfd_luo_ser - Main serialization structure for a memfd. + * @pos: The file's current position (f_pos). + * @size: The total size of the file in bytes (i_size). + * @nr_folios: Number of folios in the folios array. + * @folios: KHO vmalloc descriptor pointing to the array of + * struct memfd_luo_folio_ser. 
+ */ +struct memfd_luo_ser { + u64 pos; + u64 size; + u64 nr_folios; + struct kho_vmalloc folios; +} __packed; + +/* The compatibility string for memfd file handler */ +#define MEMFD_LUO_FH_COMPATIBLE "memfd-v1" + +#endif /* _LINUX_KHO_ABI_MEMFD_H */ -- cgit v1.2.3 From 3fa805c37dd4d3e72ae5c58800f3f46ab3ca1f70 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 10 Oct 2025 03:36:50 -0700 Subject: vmcoreinfo: track and log recoverable hardware errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a generic infrastructure for tracking recoverable hardware errors (HW errors that are visible to the OS but do not cause a panic) and record them for vmcore consumption. This aids post-mortem crash analysis tools by preserving a count and timestamp for the last occurrence of such errors. On the other hand, correctable errors, which the OS typically remains unaware of because the underlying hardware handles them transparently, are less relevant for crash dumps and therefore are NOT tracked in this infrastructure. Add centralized logging for sources of recoverable hardware errors based on the subsystem that reported them. hwerror_data is write-only at kernel runtime, and it is meant to be read from vmcore using tools like crash/drgn. For example, this is what it looks like when opening the crashdump from drgn. >>> prog['hwerror_data'] (struct hwerror_info[1]){ { .count = (int)844, .timestamp = (time64_t)1752852018, }, ... This helps fleet operators quickly triage whether a crash may be influenced by hardware recoverable errors (which execute an uncommon code path in the kernel), especially when recoverable errors occurred shortly before a panic, such as the bug fixed by commit ee62ce7a1d90 ("page_pool: Track DMA-mapped pages and unmap them when destroying the pool"). This is not intended to replace full hardware diagnostics but provides a fast way to correlate hardware events with kernel panics.
Rare machine check exceptions—like those indicated by mce_flags.p5 or mce_flags.winchip—are not accounted for in this method, as they fall outside the intended usage scope for this feature's user base. [leitao@debian.org: add hw-recoverable-errors to toctree] Link: https://lkml.kernel.org/r/20251127-vmcoreinfo_fix-v1-1-26f5b1c43da9@debian.org Link: https://lkml.kernel.org/r/20251010-vmcore_hw_error-v5-1-636ede3efe44@debian.org Signed-off-by: Breno Leitao Suggested-by: Tony Luck Suggested-by: Shuai Xue Reviewed-by: Shuai Xue Reviewed-by: Hanjun Guo [APEI] Cc: Bjorn Helgaas Cc: Bob Moore Cc: Borislav Betkov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Konrad Rzessutek Wilk Cc: Len Brown Cc: Mahesh Salgaonkar Cc: Mauro Carvalho Chehab Cc: "Oliver O'Halloran" Cc: Omar Sandoval Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- include/linux/vmcore_info.h | 8 ++++++++ include/uapi/linux/vmcore.h | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/vmcore_info.h b/include/linux/vmcore_info.h index 37e003ae5262..e71518caacdf 100644 --- a/include/linux/vmcore_info.h +++ b/include/linux/vmcore_info.h @@ -5,6 +5,7 @@ #include #include #include +#include #define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) #define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(NN_PRSTATUS), 4) @@ -77,4 +78,11 @@ extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len); void final_note(Elf_Word *buf); + +#ifdef CONFIG_VMCORE_INFO +void hwerr_log_error_type(enum hwerr_error_type src); +#else +static inline void hwerr_log_error_type(enum hwerr_error_type src) {}; +#endif + #endif /* LINUX_VMCORE_INFO_H */ diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h index 3e9da91866ff..2ba89fafa518 100644 --- a/include/uapi/linux/vmcore.h +++ b/include/uapi/linux/vmcore.h @@ -15,4 +15,13 @@ struct vmcoredd_header { __u8 
dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */ }; +enum hwerr_error_type { + HWERR_RECOV_CPU, + HWERR_RECOV_MEMORY, + HWERR_RECOV_PCI, + HWERR_RECOV_CXL, + HWERR_RECOV_OTHERS, + HWERR_RECOV_MAX, +}; + #endif /* _UAPI_VMCORE_H */ -- cgit v1.2.3 From 93d7a7ed07342f5e3da2d250cfd67f899d0b5318 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Oct 2025 12:32:23 +0200 Subject: netfilter: flowtable: move path discovery infrastructure to its own file This file contains the path discovery that is run from the forward chain for the packet offloading the flow into the flowtable. This consists of a series of calls to dev_fill_forward_path() for each device stack. More topologies may be supported in the future, so move this code to its own file to separate it from the nftables flow_offload expression. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index c003cd194fa2..e9f72d2558e9 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -222,6 +222,12 @@ struct nf_flow_route { struct flow_offload *flow_offload_alloc(struct nf_conn *ct); void flow_offload_free(struct flow_offload *flow); +struct nft_flowtable; +struct nft_pktinfo; +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft); + static inline int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, flow_setup_cb_t *cb, void *cb_priv) -- cgit v1.2.3 From b5964aac51e0c286a50e68225e0dfcf11fb554cb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Oct 2025 12:32:35 +0200 Subject: netfilter: flowtable: consolidate xmit path Use dev_queue_xmit() for the XMIT_NEIGH case. 
Store the interface index of the real device behind the vlan/pppoe device; this introduces an extra lookup for the real device in the xmit path because rt->dst.dev provides the vlan/pppoe device. XMIT_NEIGH now looks more similar to XMIT_DIRECT but the check for stale dst and the neighbour lookup still remain in place which is convenient to deal with network topology changes. Note that nft_flow_route() needs to relax the check for _XMIT_NEIGH so the existing basic xfrm offload (which only works in one direction) does not break. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index e9f72d2558e9..7c330caae52b 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -141,6 +141,7 @@ struct flow_offload_tuple { union { struct { struct dst_entry *dst_cache; + u32 ifidx; u32 dst_cookie; }; struct { -- cgit v1.2.3 From 030feea3097c41ed268c81240e5c334d9977b1c4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 10 Oct 2025 12:50:07 +0200 Subject: netfilter: flowtable: remove hw_ifidx hw_ifidx was originally introduced to store the real netdevice as a requirement for the hardware offload support in: 73f97025a972 ("netfilter: nft_flow_offload: use direct xmit if hardware offload is enabled") Since b5964aac51e0 ("netfilter: flowtable: consolidate xmit path"), ifidx and hw_ifidx both point to the real device in the xmit path, so remove hw_ifidx.
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 7c330caae52b..f7306276ece7 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -146,7 +146,6 @@ struct flow_offload_tuple { }; struct { u32 ifidx; - u32 hw_ifidx; u8 h_source[ETH_ALEN]; u8 h_dest[ETH_ALEN]; } out; -- cgit v1.2.3 From ab427db17885814069bae891834f20842f0ac3a4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 7 Nov 2025 12:14:46 +0100 Subject: netfilter: flowtable: Add IPIP rx sw acceleration Introduce sw acceleration for rx path of IPIP tunnels relying on the netfilter flowtable infrastructure. Subsequent patches will add sw acceleration for IPIP tunnels tx path. This series introduces basic infrastructure to accelerate other tunnel types (e.g. IP6IP6). IPIP rx sw acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device): ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2) $ip addr show 6: eth0: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever $ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope 
link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1 $nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } } chain forward { type filter hook forward priority filter; policy accept; meta l4proto { tcp, udp } flow add @ft } } Reproducing the scenario described above using veths I got the following results: - TCP stream received from the IPIP tunnel: - net-next: (baseline) ~ 71Gbps - net-next + IPIP flowtable support: ~101Gbps Signed-off-by: Lorenzo Bianconi Signed-off-by: Pablo Neira Ayuso --- include/linux/netdevice.h | 13 +++++++++++++ include/net/netfilter/nf_flow_table.h | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e808071dbb7d..bf99fe8622da 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -877,6 +877,7 @@ enum net_device_path_type { DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, + DEV_PATH_TUN, }; struct net_device_path { @@ -888,6 +889,18 @@ struct net_device_path { __be16 proto; u8 h_dest[ETH_ALEN]; } encap; + struct { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; + } tun; struct { enum { DEV_PATH_BR_VLAN_KEEP, diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index f7306276ece7..b09c11c048d5 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -107,6 +107,19 @@ enum flow_offload_xmit_type { #define NF_FLOW_TABLE_ENCAP_MAX 2 +struct flow_offload_tunnel { + union { + struct in_addr src_v4; + struct in6_addr src_v6; + }; + union { + struct in_addr dst_v4; + struct in6_addr dst_v6; + }; + + u8 l3_proto; +}; + struct flow_offload_tuple { union { struct in_addr src_v4; @@ -130,12 +143,15 @@ struct flow_offload_tuple { __be16 proto; } 
encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; + /* All members above are keys for lookups, see flow_offload_hash(). */ struct { } __hash; u8 dir:2, xmit_type:3, encap_num:2, + tun_num:2, in_vlan_ingress:2; u16 mtu; union { @@ -206,7 +222,9 @@ struct nf_flow_route { u16 id; __be16 proto; } encap[NF_FLOW_TABLE_ENCAP_MAX]; + struct flow_offload_tunnel tun; u8 num_encaps:2, + num_tuns:2, ingress_vlans:2; } in; struct { -- cgit v1.2.3 From be102eb6a0e7c03db00e50540622f4e43b2d2844 Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Fri, 21 Nov 2025 01:14:30 +0100 Subject: netfilter: nf_conncount: rework API to use sk_buff directly When using nf_conncount infrastructure for non-confirmed connections a duplicated track is possible due to an optimization introduced since commit d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC"). In order to fix this introduce a new conncount API that receives directly an sk_buff struct. It fetches the tuple and zone and the corresponding ct from it. It comes with both existing conncount variants nf_conncount_count_skb() and nf_conncount_add_skb(). In addition remove the old API and adjust all the users to use the new one. This way, for each sk_buff struct it is possible to check if there is a ct present and already confirmed. If so, skip the add operation. 
Fixes: d265929930e2 ("netfilter: nf_conncount: reduce unnecessary GC") Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_count.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_count.h b/include/net/netfilter/nf_conntrack_count.h index 1b58b5b91ff6..52a06de41aa0 100644 --- a/include/net/netfilter/nf_conntrack_count.h +++ b/include/net/netfilter/nf_conntrack_count.h @@ -18,15 +18,14 @@ struct nf_conncount_list { struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - -int nf_conncount_add(struct net *net, struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key); + +int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, + u16 l3num, struct nf_conncount_list *list); void nf_conncount_list_init(struct nf_conncount_list *list); -- cgit v1.2.3 From c4f0ab06e1e0c1331e6febd03538a7f621f15134 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 1 Nov 2025 12:20:50 -0700 Subject: netfilter: ip6t_srh: fix UAPI kernel-doc comments format Fix the kernel-doc format for struct members to be "@member" instead of "@ member" to avoid kernel-doc warnings. 
Warning: ip6t_srh.h:60 struct member 'next_hdr' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'hdr_len' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'segs_left' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'last_entry' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'tag' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'mt_flags' not described in 'ip6t_srh' Warning: ip6t_srh.h:60 struct member 'mt_invflags' not described in 'ip6t_srh' Warning: ip6t_srh.h:93 struct member 'next_hdr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'hdr_len' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'segs_left' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'last_entry' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'tag' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'psid_addr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'nsid_addr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'lsid_addr' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'psid_msk' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'nsid_msk' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'lsid_msk' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'mt_flags' not described in 'ip6t_srh1' Warning: ip6t_srh.h:93 struct member 'mt_invflags' not described in 'ip6t_srh1' Signed-off-by: Randy Dunlap Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 40 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h index 54ed83360dac..80c66c8ece82 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h +++ 
b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h @@ -41,13 +41,13 @@ /** * struct ip6t_srh - SRH match options - * @ next_hdr: Next header field of SRH - * @ hdr_len: Extension header length field of SRH - * @ segs_left: Segments left field of SRH - * @ last_entry: Last entry field of SRH - * @ tag: Tag field of SRH - * @ mt_flags: match options - * @ mt_invflags: Invert the sense of match options + * @next_hdr: Next header field of SRH + * @hdr_len: Extension header length field of SRH + * @segs_left: Segments left field of SRH + * @last_entry: Last entry field of SRH + * @tag: Tag field of SRH + * @mt_flags: match options + * @mt_invflags: Invert the sense of match options */ struct ip6t_srh { @@ -62,19 +62,19 @@ struct ip6t_srh { /** * struct ip6t_srh1 - SRH match options (revision 1) - * @ next_hdr: Next header field of SRH - * @ hdr_len: Extension header length field of SRH - * @ segs_left: Segments left field of SRH - * @ last_entry: Last entry field of SRH - * @ tag: Tag field of SRH - * @ psid_addr: Address of previous SID in SRH SID list - * @ nsid_addr: Address of NEXT SID in SRH SID list - * @ lsid_addr: Address of LAST SID in SRH SID list - * @ psid_msk: Mask of previous SID in SRH SID list - * @ nsid_msk: Mask of next SID in SRH SID list - * @ lsid_msk: MAsk of last SID in SRH SID list - * @ mt_flags: match options - * @ mt_invflags: Invert the sense of match options + * @next_hdr: Next header field of SRH + * @hdr_len: Extension header length field of SRH + * @segs_left: Segments left field of SRH + * @last_entry: Last entry field of SRH + * @tag: Tag field of SRH + * @psid_addr: Address of previous SID in SRH SID list + * @nsid_addr: Address of NEXT SID in SRH SID list + * @lsid_addr: Address of LAST SID in SRH SID list + * @psid_msk: Mask of previous SID in SRH SID list + * @nsid_msk: Mask of next SID in SRH SID list + * @lsid_msk: MAsk of last SID in SRH SID list + * @mt_flags: match options + * @mt_invflags: Invert the sense of match options */ 
struct ip6t_srh1 { -- cgit v1.2.3 From d3a439e55c193b930e0007967cf8d7a29890449b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 1 Nov 2025 12:20:38 -0700 Subject: netfilter: nf_tables: improve UAPI kernel-doc comments In include/uapi/linux/netfilter/nf_tables.h, correct the kernel-doc comments for mistyped enum names and enum values to avoid these kernel-doc warnings and improve the documentation: nf_tables.h:896: warning: Enum value 'NFT_EXTHDR_OP_TCPOPT' not described in enum 'nft_exthdr_op' nf_tables.h:896: warning: Excess enum value 'NFT_EXTHDR_OP_TCP' description in 'nft_exthdr_op' nf_tables.h:1210: warning: expecting prototype for enum nft_flow_attributes. Prototype was for enum nft_offload_attributes instead nf_tables.h:1428: warning: expecting prototype for enum nft_reject_code. Prototype was for enum nft_reject_inet_code instead (add beginning '@' to each enum value description:) nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_FAMILY' not described in enum 'nft_tproxy_attributes' nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_ADDR' not described in enum 'nft_tproxy_attributes' nf_tables.h:1493: warning: Enum value 'NFTA_TPROXY_REG_PORT' not described in enum 'nft_tproxy_attributes' nf_tables.h:1796: warning: expecting prototype for enum nft_device_attributes. 
Prototype was for enum nft_devices_attributes instead Signed-off-by: Randy Dunlap Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7c0c915f0306..45c71f7d21c2 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -881,7 +881,7 @@ enum nft_exthdr_flags { * enum nft_exthdr_op - nf_tables match options * * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers - * @NFT_EXTHDR_OP_TCP: match against tcp options + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options * @NFT_EXTHDR_OP_IPV4: match against ipv4 options * @NFT_EXTHDR_OP_SCTP: match against sctp chunks * @NFT_EXTHDR_OP_DCCP: match against dccp otions @@ -1200,7 +1200,7 @@ enum nft_ct_attributes { #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) /** - * enum nft_flow_attributes - ct offload expression attributes + * enum nft_offload_attributes - ct offload expression attributes * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) */ enum nft_offload_attributes { @@ -1410,7 +1410,7 @@ enum nft_reject_types { }; /** - * enum nft_reject_code - Generic reject codes for IPv4/IPv6 + * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6 * * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable @@ -1480,9 +1480,9 @@ enum nft_nat_attributes { /** * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes * - * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) - * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) - * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) + * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) + * @NFTA_TPROXY_REG_ADDR: Target address 
register (NLA_U32: nft_registers) + * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) */ enum nft_tproxy_attributes { NFTA_TPROXY_UNSPEC, @@ -1783,7 +1783,7 @@ enum nft_synproxy_attributes { #define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) /** - * enum nft_device_attributes - nf_tables device netlink attributes + * enum nft_devices_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING) -- cgit v1.2.3 From f6ed9c5d3190cf18382ee75e0420602101f53586 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2025 14:52:49 -0500 Subject: overflow: Introduce struct_offset() to get offset of member The trace_marker_raw file in tracefs takes a buffer from user space that contains an id as well as a raw data string which is usually a binary structure. The structure used has the following: struct raw_data_entry { struct trace_entry ent; unsigned int id; char buf[]; }; Since the passed in "cnt" variable is both the size of buf as well as the size of id, the code to allocate the location on the ring buffer had: size = struct_size(entry, buf, cnt - sizeof(entry->id)); Which is quite ugly and hard to understand. Instead, add a helper macro called struct_offset() which then changes the above to a simple and easy to understand: size = struct_offset(entry, id) + cnt; This will likely come in handy for other use cases too. Link: https://lore.kernel.org/all/CAHk-=whYZVoEdfO1PmtbirPdBMTV9Nxt9f09CK0k6S+HJD3Zmg@mail.gmail.com/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: "Gustavo A. R. 
Silva" Link: https://patch.msgid.link/20251126145249.05b1770a@gandalf.local.home Suggested-by: Linus Torvalds Reviewed-by: Kees Cook Signed-off-by: Steven Rostedt (Google) --- include/linux/overflow.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/overflow.h b/include/linux/overflow.h index 725f95f7e416..736f633b2d5f 100644 --- a/include/linux/overflow.h +++ b/include/linux/overflow.h @@ -458,6 +458,18 @@ static inline size_t __must_check size_sub(size_t minuend, size_t subtrahend) #define struct_size_t(type, member, count) \ struct_size((type *)NULL, member, count) +/** + * struct_offset() - Calculate the offset of a member within a struct + * @p: Pointer to the struct + * @member: Name of the member to get the offset of + * + * Calculates the offset of a particular @member of the structure pointed + * to by @p. + * + * Return: number of bytes to the location of @member. + */ +#define struct_offset(p, member) (offsetof(typeof(*(p)), member)) + /** * __DEFINE_FLEX() - helper macro for DEFINE_FLEX() family. * Enables caller macro to pass arbitrary trailing expressions -- cgit v1.2.3 From df59bb5b9af3fc24d957261e9f80f0c0dec151a4 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 26 Nov 2025 13:36:46 +0900 Subject: netmem, devmem, tcp: access pp fields through @desc in net_iov Convert all the legacy code directly accessing the pp fields in net_iov to access them through @desc in net_iov. 
Signed-off-by: Byungchul Park Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff90281ddf90..86737076101d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3778,8 +3778,8 @@ static inline dma_addr_t __skb_frag_dma_map(struct device *dev, enum dma_data_direction dir) { if (skb_frag_is_net_iov(frag)) { - return netmem_to_net_iov(frag->netmem)->dma_addr + offset + - frag->offset; + return netmem_to_net_iov(frag->netmem)->desc.dma_addr + + offset + frag->offset; } return dma_map_page(dev, skb_frag_page(frag), skb_frag_off(frag) + offset, size, dir); -- cgit v1.2.3 From 6557cae0a2a1952645e5df50e1d6eb7267ea2131 Mon Sep 17 00:00:00 2001 From: Peter Enderborg Date: Wed, 26 Nov 2025 14:54:06 +0100 Subject: if_ether.h: Clarify ethertype validity for gsw1xx dsa This 0x88C3 is registered to Infineon Technologies Corporate Research ST and is used by MaxLinear. Infineon made a spin-off called Lantiq. Lantiq was acquired by Intel. MaxLinear acquired Intel's Connected Home division. The product FAQ from MaxLinear describes its history from the F24S. The driver for the gsw1xx is based on Lantiq showing its similarities. Ref https://standards-oui.ieee.org/ethertype/eth.txt Signed-off-by: Peter Enderborg Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_ether.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 2c93b7b731c8..df9d44a11540 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -92,7 +92,9 @@ #define ETH_P_ETHERCAT 0x88A4 /* EtherCAT */ #define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ #define ETH_P_802_EX1 0x88B5 /* 802.1 Local Experimental 1. 
*/ -#define ETH_P_MXLGSW 0x88C3 /* MaxLinear GSW DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_MXLGSW 0x88C3 /* Infineon Technologies Corporate Research ST + * Used by MaxLinear GSW DSA + */ #define ETH_P_PREAUTH 0x88C7 /* 802.11 Preauthentication */ #define ETH_P_TIPC 0x88CA /* TIPC */ #define ETH_P_LLDP 0x88CC /* Link Layer Discovery Protocol */ -- cgit v1.2.3 From d856f9d27885c499d96ab7fe506083346ccf145d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 27 Nov 2025 19:54:07 -0400 Subject: iommupt/vtd: Allow VT-d to have a larger table top than the vasz requires VT-d second stage HW specifies both the maximum IOVA and the supported table walk starting points. Weirdly there is HW that only supports a 4 level walk but has a maximum IOVA that only needs 3. The current code miscalculates this and creates a wrongly sized page table which ultimately fails the compatibility check for number of levels. This is fixed by allowing the page table to be created with both a vasz and top_level input. The vasz will set the aperture for the domain while the top_level will set the page table geometry. Add top_level to vtdss and correct the logic in VT-d to generate the right top_level and vasz from mgaw and sagaw. 
Fixes: d373449d8e97 ("iommu/vt-d: Use the generic iommu page table") Reported-by: Calvin Owens Closes: https://lore.kernel.org/r/8f257d2651eb8a4358fcbd47b0145002e5f1d638.1764237717.git.calvin@wbinvd.org Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Tested-by: Calvin Owens Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index cfe05a77f86b..c134132ed10f 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -264,6 +264,8 @@ IOMMU_PROTOTYPES(amdv1_mock); struct pt_iommu_vtdss_cfg { struct pt_iommu_cfg common; + /* 4 is a 57 bit 5 level table */ + unsigned int top_level; }; struct pt_iommu_vtdss_hw_info { -- cgit v1.2.3 From 1eb0ae6fbd544619c50b4a4d96ccb4676cac03cb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 27 Nov 2025 19:54:08 -0400 Subject: iommupt/vtd: Support mgaw's less than a 4 level walk for first stage If the IOVA is limited to less than 48 the page table will be constructed with a 3 level configuration which is unsupported by hardware. Like the second stage the caller needs to pass in both the top_level and the vasz to specify a table that has more levels than required to hold the IOVA range. 
Fixes: 6cbc09b7719e ("iommu/vt-d: Restore previous domain::aperture_end calculation") Reported-by: Calvin Owens Closes: https://lore.kernel.org/r/8f257d2651eb8a4358fcbd47b0145002e5f1d638.1764237717.git.calvin@wbinvd.org Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Tested-by: Calvin Owens Signed-off-by: Joerg Roedel --- include/linux/generic_pt/iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index c134132ed10f..9eefbb74efd0 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -277,6 +277,8 @@ IOMMU_FORMAT(vtdss, vtdss_pt); struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; + /* 4 is a 57 bit 5 level table */ + unsigned int top_level; }; struct pt_iommu_x86_64_hw_info { -- cgit v1.2.3 From 4be9e04ebf75a5c4478c1c6295e2122e5dc98f5f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 28 Nov 2025 10:55:09 +0100 Subject: vfs: add needed headers for new struct delegation definition The definition of struct delegation uses stdint.h integer types. Add the necessary headers to ensure that always works. Fixes: 1602bad16d7d ("vfs: expose delegation support to userland") Signed-off-by: Jeff Layton Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 008fac15e573..5e277fd955aa 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -4,6 +4,11 @@ #include #include +#ifdef __KERNEL__ +#include +#else +#include +#endif #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) -- cgit v1.2.3 From 5aefbf5b68794870ccec126cd68bbfd1ee09283a Mon Sep 17 00:00:00 2001 From: "Derek J. 
Clark" Date: Thu, 27 Nov 2025 07:16:03 -0800 Subject: acpi: platform_profile - Add max-power profile option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some devices, namely Lenovo Legion devices, have an "extreme" mode where power draw is at the maximum limit of the cooling hardware. Add a new "max-power" platform profile to properly reflect this operating mode. Reviewed-by: Mario Limonciello (AMD) Acked-by: Rafael J. Wysocki (Intel) Signed-off-by: Derek J. Clark Reviewed-by: Armin Wolf Reviewed-by: Mark Pearson Link: https://patch.msgid.link/20251127151605.1018026-2-derekjohn.clark@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_profile.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index a299225ab92e..855b28340e95 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -24,6 +24,7 @@ enum platform_profile_option { PLATFORM_PROFILE_BALANCED, PLATFORM_PROFILE_BALANCED_PERFORMANCE, PLATFORM_PROFILE_PERFORMANCE, + PLATFORM_PROFILE_MAX_POWER, PLATFORM_PROFILE_CUSTOM, PLATFORM_PROFILE_LAST, /*must always be last */ }; -- cgit v1.2.3 From 011703a9acd76edc7c85d80dbccb6e50dba53aad Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 23 Nov 2025 17:33:19 +0100 Subject: file: add FD_{ADD,PREPARE}() I've been playing with this to allow for moderately flexible usage of the get_unused_fd_flags() + create file + fd_install() pattern that's used quite extensively. How callers allocate files is really heterogeneous so it's not really convenient to fold them into a single class. It's possible to split them into subclasses like for anon inodes. I think that's not necessarily nice either. 
My take is to add two primitives: (1) FD_ADD() for the simple cases where a file is installed: fd = FD_ADD(O_CLOEXEC, open_file(some, args)); if (fd >= 0) kvm_get_kvm(vcpu->kvm); return fd; (2) FD_PREPARE() that captures all the cases where access to fd or file or additional work before publishing the fd is needed: FD_PREPARE(fdf, open_flag, file_open_handle(&path, open_flag)); if (fdf.err) return fdf.err; if (copy_to_user(/* something something */)) return -EFAULT; return fd_publish(fdf); I've converted all of the easy cases over to it and it gets rid of an awful lot of convoluted cleanup logic. It's centered around struct fd_prepare. FD_PREPARE() encapsulates all of the allocation and cleanup logic and must be followed by a call to fd_publish() which associates the fd with the file and installs it into the caller's fdtable. If fd_publish() isn't called both are deallocated. It mandates a specific order, namely that first we allocate the fd and then instantiate the file. But that shouldn't be a problem; nearly everyone I've converted uses this exact pattern anyway. There's a bunch of additional cases where it would be easy to convert them to this pattern. For example, the whole sync file stuff in dma currently retains the containing structure of the file instead of the file itself even though it's only used to allocate files. Changing that would make it fall into the FD_PREPARE() pattern easily. I've not done that work yet. There's room for extending this in a way that we'd have subclasses for some particularly often used patterns but as I said I'm not even sure that's worth it. 
Link: https://patch.msgid.link/20251123-work-fd-prepare-v4-1-b6efa1706cfd@kernel.org Signed-off-by: Christian Brauner --- include/linux/cleanup.h | 7 +++ include/linux/file.h | 126 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 2573585b7f06..361104bcfe92 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -261,6 +261,10 @@ const volatile void * __must_check_fn(const volatile void *val) * CLASS(name, var)(args...): * declare the variable @var as an instance of the named class * + * CLASS_INIT(name, var, init_expr): + * declare the variable @var as an instance of the named class with + * custom initialization expression. + * * Ex. * * DEFINE_CLASS(fdget, struct fd, fdput(_T), fdget(fd), int fd) @@ -290,6 +294,9 @@ static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ class_##_name##_t var __cleanup(class_##_name##_destructor) = \ class_##_name##_constructor +#define CLASS_INIT(_name, _var, _init_expr) \ + class_##_name##_t _var __cleanup(class_##_name##_destructor) = (_init_expr) + #define scoped_class(_name, var, args) \ for (CLASS(_name, var)(args); \ __guard_ptr(_name)(&var) || !__is_cond_ptr(_name); \ diff --git a/include/linux/file.h b/include/linux/file.h index af1768d934a0..cf389fde9bc2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -127,4 +127,130 @@ extern void __fput_sync(struct file *); extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max; +/* + * fd_prepare: Combined fd + file allocation cleanup class. + * @err: Error code to indicate if allocation succeeded. + * @__fd: Allocated fd (may not be accessed directly) + * @__file: Allocated struct file pointer (may not be accessed directly) + * + * Allocates an fd and a file together. On error paths, automatically cleans + * up whichever resource was successfully allocated. 
Allows flexible file + * allocation with different functions per usage. + * + * Do not use directly. + */ +struct fd_prepare { + s32 err; + s32 __fd; /* do not access directly */ + struct file *__file; /* do not access directly */ +}; + +/* Typedef for fd_prepare cleanup guards. */ +typedef struct fd_prepare class_fd_prepare_t; + +/* + * Accessors for fd_prepare class members. + * _Generic() is used for zero-cost type safety. + */ +#define fd_prepare_fd(_fdf) \ + (_Generic((_fdf), struct fd_prepare: (_fdf).__fd)) + +#define fd_prepare_file(_fdf) \ + (_Generic((_fdf), struct fd_prepare: (_fdf).__file)) + +/* Do not use directly. */ +static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf) +{ + if (unlikely(fdf->err)) { + if (likely(fdf->__fd >= 0)) + put_unused_fd(fdf->__fd); + if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) + fput(fdf->__file); + } +} + +/* Do not use directly. */ +static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf) +{ + if (unlikely(fdf->err)) + return fdf->err; + if (unlikely(fdf->__fd < 0)) + return fdf->__fd; + if (unlikely(IS_ERR(fdf->__file))) + return PTR_ERR(fdf->__file); + if (unlikely(!fdf->__file)) + return -ENOMEM; + return 0; +} + +/* + * __FD_PREPARE_INIT - Helper to initialize fd_prepare class. + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: expression that returns struct file * + * + * Returns a struct fd_prepare with fd, file, and err set. + * If fd allocation fails, fd will be negative and err will be set. If + * fd succeeds but file_init_expr fails, file will be ERR_PTR and err + * will be set. The err field is the single source of truth for error + * checking. 
+ */ +#define __FD_PREPARE_INIT(_fd_flags, _file_owned) \ + ({ \ + struct fd_prepare fdf = { \ + .__fd = get_unused_fd_flags((_fd_flags)), \ + }; \ + if (likely(fdf.__fd >= 0)) \ + fdf.__file = (_file_owned); \ + fdf.err = ACQUIRE_ERR(fd_prepare, &fdf); \ + fdf; \ + }) + +/* + * FD_PREPARE - Macro to declare and initialize an fd_prepare variable. + * + * Declares and initializes an fd_prepare variable with automatic + * cleanup. No separate scope required - cleanup happens when variable + * goes out of scope. + * + * @_fdf: name of struct fd_prepare variable to define + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: struct file to take ownership of (can be expression) + */ +#define FD_PREPARE(_fdf, _fd_flags, _file_owned) \ + CLASS_INIT(fd_prepare, _fdf, __FD_PREPARE_INIT(_fd_flags, _file_owned)) + +/* + * fd_publish - Publish prepared fd and file to the fd table. + * @_fdf: struct fd_prepare variable + */ +#define fd_publish(_fdf) \ + ({ \ + struct fd_prepare *fdp = &(_fdf); \ + VFS_WARN_ON_ONCE(fdp->err); \ + VFS_WARN_ON_ONCE(fdp->__fd < 0); \ + VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \ + fd_install(fdp->__fd, fdp->__file); \ + fdp->__fd; \ + }) + +/* Do not use directly. */ +#define __FD_ADD(_fdf, _fd_flags, _file_owned) \ + ({ \ + FD_PREPARE(_fdf, _fd_flags, _file_owned); \ + s32 ret = _fdf.err; \ + if (likely(!ret)) \ + ret = fd_publish(_fdf); \ + ret; \ + }) + +/* + * FD_ADD - Allocate and install an fd and file in one step. + * @_fd_flags: flags for get_unused_fd_flags() + * @_file_owned: struct file to take ownership of + * + * Returns the allocated fd number, or negative error code on failure. 
+ */ +#define FD_ADD(_fd_flags, _file_owned) \ + __FD_ADD(__UNIQUE_ID(fd_prepare), _fd_flags, _file_owned) + #endif /* __LINUX_FILE_H */ -- cgit v1.2.3 From 816c9cac35185aff33da1eb73cc974349623eb3a Mon Sep 17 00:00:00 2001 From: Richard Fitzgerald Date: Fri, 28 Nov 2025 11:25:20 +0000 Subject: ASoC: cs35l56: Log a message if firmware is missing If the amp is still reporting FIRMWARE_MISSING after cs35l56_patch() has completed it is helpful to log a warning. After a complete firmware download the FIRMWARE_MISSING flag will be clear. If this isn't the case, the driver should log a message to report this. The amp can produce basic audio output without firmware, as a fallback, so this wasn't originally logged as a warning condition because the amp is still in an operational state - just not with full functionality. However, it was not at all obvious to an end user that anything is unusual. Signed-off-by: Richard Fitzgerald Link: https://patch.msgid.link/20251128112520.40067-1-rf@opensource.cirrus.com Signed-off-by: Mark Brown --- include/sound/cs35l56.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/sound/cs35l56.h b/include/sound/cs35l56.h index 883f6a7e50aa..5928af539c46 100644 --- a/include/sound/cs35l56.h +++ b/include/sound/cs35l56.h @@ -409,6 +409,7 @@ int cs35l56_cal_set_status_get(struct cs35l56_base *cs35l56_base, struct snd_ctl_elem_value *uvalue); int cs35l56_read_prot_status(struct cs35l56_base *cs35l56_base, bool *fw_missing, unsigned int *fw_version); +void cs35l56_warn_if_firmware_missing(struct cs35l56_base *cs35l56_base); void cs35l56_log_tuning(struct cs35l56_base *cs35l56_base, struct cs_dsp *cs_dsp); int cs35l56_hw_init(struct cs35l56_base *cs35l56_base); int cs35l56_get_speaker_id(struct cs35l56_base *cs35l56_base); -- cgit v1.2.3 From f01c0f7ee59fce16e5bae92a2d388a8a6fdf3f0f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Nov 2025 22:27:39 -0800 Subject: gpio: regmap: fix kernel-doc notation Add a ':' to 
the end of struct member names to prevent kernel-doc warnings: Warning: include/linux/gpio/regmap.h:108 struct member 'regmap_irq_line' not described in 'gpio_regmap_config' Warning: include/linux/gpio/regmap.h:108 struct member 'regmap_irq_flags' not described in 'gpio_regmap_config' Fixes: 553b75d4bfe9 ("gpio: regmap: Allow to allocate regmap-irq device") Signed-off-by: Randy Dunlap Reviewed-by: Michael Walle Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20251128062739.845403-1-rdunlap@infradead.org Signed-off-by: Bartosz Golaszewski --- include/linux/gpio/regmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gpio/regmap.h b/include/linux/gpio/regmap.h index 87983a5f3681..12d154732ca9 100644 --- a/include/linux/gpio/regmap.h +++ b/include/linux/gpio/regmap.h @@ -50,8 +50,8 @@ struct regmap; * @regmap_irq_chip: (Optional) Pointer on an regmap_irq_chip structure. If * set, a regmap-irq device will be created and the IRQ * domain will be set accordingly. - * @regmap_irq_line (Optional) The IRQ the device uses to signal interrupts. - * @regmap_irq_flags (Optional) The IRQF_ flags to use for the interrupt. + * @regmap_irq_line: (Optional) The IRQ the device uses to signal interrupts. + * @regmap_irq_flags: (Optional) The IRQF_ flags to use for the interrupt. * * The ->reg_mask_xlate translates a given base address and GPIO offset to * register and mask pair. The base address is one of the given register -- cgit v1.2.3 From a195c7ccfb7a21b8118139835e25936ec8722596 Mon Sep 17 00:00:00 2001 From: Jason-JH Lin Date: Thu, 23 Oct 2025 01:16:30 +0800 Subject: mailbox: mtk-cmdq: Refine DMA address handling for the command buffer GCE can only fetch the command buffer address from a 32-bit register. Some SoCs support a 35-bit command buffer address for GCE, which requires a right shift of 3 bits before setting the address into the 32-bit register. 
A comment has been added to the header of cmdq_get_shift_pa() to explain this requirement. To prevent the GCE command buffer address from being DMA mapped beyond its supported bit range, the DMA bit mask for the device is set during initialization. Additionally, to ensure the correct shift is applied when setting or reading the register that stores the GCE command buffer address, new APIs, cmdq_convert_gce_addr() and cmdq_revert_gce_addr(), have been introduced for consistent operations on this register. The variable type for the command buffer address has been standardized to dma_addr_t to prevent handling issues caused by type mismatches. Fixes: 0858fde496f8 ("mailbox: cmdq: variablize address shift in platform") Signed-off-by: Jason-JH Lin Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Jassi Brar --- include/linux/mailbox/mtk-cmdq-mailbox.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/mailbox/mtk-cmdq-mailbox.h b/include/linux/mailbox/mtk-cmdq-mailbox.h index 4c1a91b07de3..e1555e06e7e5 100644 --- a/include/linux/mailbox/mtk-cmdq-mailbox.h +++ b/include/linux/mailbox/mtk-cmdq-mailbox.h @@ -77,6 +77,16 @@ struct cmdq_pkt { size_t buf_size; /* real buffer size */ }; +/** + * cmdq_get_shift_pa() - get the shift bits of physical address + * @chan: mailbox channel + * + * GCE can only fetch the command buffer address from a 32-bit register. + * Some SOCs support more than 32-bit command buffer address for GCE, which + * requires some shift bits to make the address fit into the 32-bit register. 
+ * + * Return: the shift bits of physical address + */ u8 cmdq_get_shift_pa(struct mbox_chan *chan); #endif /* __MTK_CMDQ_MAILBOX_H__ */ -- cgit v1.2.3 From d0c98769ee7d5db8d699a270690639cde1766cd4 Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 28 Nov 2025 16:53:13 +0800 Subject: blk-mq: use array manage hctx map instead of xarray After commit 4e5cc99e1e48 ("blk-mq: manage hctx map via xarray"), we use an xarray instead of array to store hctx, but in poll mode, each time in blk_mq_poll, we need use xa_load to find corresponding hctx, this introduce some costs. In my test, xa_load may cost 3.8% cpu. This patch revert previous change, eliminates the overhead of xa_load and can result in a 3% performance improvement. Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 3 ++- include/linux/blkdev.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b54506b3b76d..9208ff90ae16 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1016,7 +1016,8 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) } #define queue_for_each_hw_ctx(q, hctx, i) \ - xa_for_each(&(q)->hctx_table, (i), (hctx)) + for ((i) = 0; (i) < (q)->nr_hw_queues && \ + ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cb4ba09959ee..6195f89648db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -503,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct xarray hctx_table; + struct blk_mq_hw_ctx **queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From 89e1fb7ceffd898505ad7fa57acec0585bfaa2cc Mon Sep 17 00:00:00 2001 From: Fengnan Chang Date: Fri, 28 Nov 2025 16:53:14 +0800 Subject: blk-mq: fix potential 
uaf for 'queue_hw_ctx' This just applies Kuai's patch in [1] with minor changes. blk_mq_realloc_hw_ctxs() will free the 'queue_hw_ctx' (e.g. update submit_queues through configfs for null_blk), while it might still be used from another context (e.g. switch elevator to none): t1 t2 elevator_switch blk_mq_unquiesce_queue blk_mq_run_hw_queues queue_for_each_hw_ctx // assembly code for hctx = (q)->queue_hw_ctx[i] mov 0x48(%rbp),%rdx -> read old queue_hw_ctx __blk_mq_update_nr_hw_queues blk_mq_realloc_hw_ctxs hctxs = q->queue_hw_ctx q->queue_hw_ctx = new_hctxs kfree(hctxs) movslq %ebx,%rax mov (%rdx,%rax,8),%rdi ->uaf This problem was found by code review, and I confirmed that the concurrent scenario does exist (specifically 'q->queue_hw_ctx' can be changed during blk_mq_run_hw_queues()), however, the uaf problem hasn't been reproduced yet without hacking the kernel. Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the problem by protecting 'queue_hw_ctx' through rcu where it can be accessed without grabbing 'q_usage_counter'. 
[1] https://lore.kernel.org/all/20220225072053.2472431-1-yukuai3@huawei.com/ Signed-off-by: Yu Kuai Signed-off-by: Fengnan Chang Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 13 ++++++++++++- include/linux/blkdev.h | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9208ff90ae16..eb7254b3dddd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1015,9 +1015,20 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq) return rq + 1; } +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id) +{ + struct blk_mq_hw_ctx *hctx; + + rcu_read_lock(); + hctx = rcu_dereference(q->queue_hw_ctx)[id]; + rcu_read_unlock(); + + return hctx; +} + #define queue_for_each_hw_ctx(q, hctx, i) \ for ((i) = 0; (i) < (q)->nr_hw_queues && \ - ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++) + ({ hctx = queue_hctx((q), i); 1; }); (i)++) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6195f89648db..72e34acd439c 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -503,7 +503,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx **queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; -- cgit v1.2.3 From 9574b21e952256d4fa3c8797c94482a240992d18 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 21 Nov 2025 09:58:23 +0800 Subject: kfifo: add kfifo_alloc_node() helper for NUMA awareness Add __kfifo_alloc_node() by refactoring and reusing __kfifo_alloc(), and define kfifo_alloc_node() macro to support NUMA-aware memory allocation. The new __kfifo_alloc_node() function accepts a NUMA node parameter and uses kmalloc_array_node() instead of kmalloc_array() for node-specific allocation. 
The existing __kfifo_alloc() now calls __kfifo_alloc_node() with NUMA_NO_NODE to maintain backward compatibility. This enables users to allocate kfifo buffers on specific NUMA nodes, which is important for performance in NUMA systems where the kfifo will be primarily accessed by threads running on specific nodes. Cc: Stefani Seibold Cc: Andrew Morton Cc: linux-kernel@vger.kernel.org Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/kfifo.h | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index fd743d4c4b4b..8b81ac74829c 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -369,6 +369,30 @@ __kfifo_int_must_check_helper( \ }) \ ) +/** + * kfifo_alloc_node - dynamically allocates a new fifo buffer on a NUMA node + * @fifo: pointer to the fifo + * @size: the number of elements in the fifo, this must be a power of 2 + * @gfp_mask: get_free_pages mask, passed to kmalloc() + * @node: NUMA node to allocate memory on + * + * This macro dynamically allocates a new fifo buffer with NUMA node awareness. + * + * The number of elements will be rounded-up to a power of 2. + * The fifo will be release with kfifo_free(). + * Return 0 if no error, otherwise an error code. + */ +#define kfifo_alloc_node(fifo, size, gfp_mask, node) \ +__kfifo_int_must_check_helper( \ +({ \ + typeof((fifo) + 1) __tmp = (fifo); \ + struct __kfifo *__kfifo = &__tmp->kfifo; \ + __is_kfifo_ptr(__tmp) ? 
\ + __kfifo_alloc_node(__kfifo, size, sizeof(*__tmp->type), gfp_mask, node) : \ + -EINVAL; \ +}) \ +) + /** * kfifo_free - frees the fifo * @fifo: the fifo to be freed @@ -899,8 +923,14 @@ __kfifo_uint_must_check_helper( \ ) -extern int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, - size_t esize, gfp_t gfp_mask); +extern int __kfifo_alloc_node(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask, int node); + +static inline int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) +{ + return __kfifo_alloc_node(fifo, size, esize, gfp_mask, NUMA_NO_NODE); +} extern void __kfifo_free(struct __kfifo *fifo); -- cgit v1.2.3 From 418de94e7593081c29066555bf9059f1f7dd9d79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Nov 2025 22:57:54 -0800 Subject: sbitmap: fix all kernel-doc warnings Modify kernel-doc comments in sbitmap.h to prevent warnings: Warning: include/linux/sbitmap.h:84 struct member 'alloc_hint' not described in 'sbitmap' Warning: include/linux/sbitmap.h:151 struct member 'ws_active' not described in 'sbitmap_queue' Warning: include/linux/sbitmap.h:552 No description found for return value of 'sbq_wait_ptr' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index ffb9907c7070..cc7ad189caa5 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -75,7 +75,7 @@ struct sbitmap { */ struct sbitmap_word *map; - /* + /** * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different @@ -128,7 +128,7 @@ struct sbitmap_queue { */ struct sbq_wait_state *ws; - /* + /** * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; @@ -547,6 +547,8 @@ static inline void sbq_index_atomic_inc(atomic_t *index) * sbitmap_queue. 
* @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. + * + * Return: Next wait queue to be used */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) -- cgit v1.2.3 From 98693e0897f754e3f51ce6626ed5f785f625ba2b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 24 Nov 2025 15:36:22 -0700 Subject: vfio/pci: Use RCU for error/request triggers to avoid circular locking Thanks to a device generating an ACS violation during bus reset, lockdep reported the following circular locking issue: CPU0: SET_IRQS (MSI/X): holds igate, acquires memory_lock CPU1: HOT_RESET: holds memory_lock, acquires pci_bus_sem CPU2: AER: holds pci_bus_sem, acquires igate This results in a potential 3-way deadlock. Remove the pci_bus_sem->igate leg of the triangle by using RCU to peek at the eventfd rather than locking it with igate. Fixes: 3be3a074cf5b ("vfio-pci: Don't use device_lock around AER interrupt setup") Signed-off-by: Alex Williamson Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20251124223623.2770706-1-alex@shazbot.org Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 88fd2fd895d0..a1eddd55dab8 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,11 @@ struct vfio_pci_region; struct p2pdma_provider; struct dma_buf_phys_vec; +struct vfio_pci_eventfd { + struct eventfd_ctx *ctx; + struct rcu_head rcu; +}; + struct vfio_pci_regops { ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); @@ -124,8 +130,8 @@ struct vfio_pci_core_device { struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; - struct 
eventfd_ctx *err_trigger; - struct eventfd_ctx *req_trigger; + struct vfio_pci_eventfd __rcu *err_trigger; + struct vfio_pci_eventfd __rcu *req_trigger; struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; -- cgit v1.2.3 From 9b92bc7554b543dc00a0a0b62904a9ef2ad5c4b0 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:27 +0000 Subject: vfio: refactor vfio_pci_mmap_huge_fault function Refactor vfio_pci_mmap_huge_fault to take out the implementation to map the VMA to the PTE/PMD/PUD as a separate function. Export the new function to be used by nvgrace-gpu module. Move the alignment check code to verify that pfn and VMA VA is aligned to the page order to the header file and make it inline. No functional change is intended. Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Reviewed-by: Shameer Kolothum Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-2-ankita@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a1eddd55dab8..5569488ec4dc 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -170,6 +170,9 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos); ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos); +vm_fault_t vfio_pci_vmf_insert_pfn(struct vfio_pci_core_device *vdev, + struct vm_fault *vmf, unsigned long pfn, + unsigned int order); int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma); void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count); int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf); @@ -212,4 +215,14 @@ VFIO_IOREAD_DECLARATION(32) 
VFIO_IOREAD_DECLARATION(64) #endif +static inline bool is_aligned_for_order(struct vm_area_struct *vma, + unsigned long addr, + unsigned long pfn, + unsigned int order) +{ + return !(order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + !IS_ALIGNED(pfn, 1 << order))); +} + #endif /* VFIO_PCI_CORE_H */ -- cgit v1.2.3 From a23b10608d420346e5af7eda6c46726a61572469 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 27 Nov 2025 17:06:32 +0000 Subject: vfio/nvgrace-gpu: wait for the GPU mem to be ready Speculative prefetches from CPU to GPU memory until the GPU is ready after reset can cause harmless corrected RAS events to be logged on Grace systems. It is thus preferred that the mapping not be re-established until the GPU is ready post reset. The GPU readiness can be checked through BAR0 registers similar to the checking at the time of device probe. It can take several seconds for the GPU to be ready. So it is desirable that the time overlaps as much of the VM startup as possible to reduce impact on the VM bootup time. The GPU readiness state is thus checked on the first fault/huge_fault request or read/write access which amortizes the GPU readiness time. The first fault and read/write checks the GPU state when the reset_done flag - which denotes whether the GPU has just been reset. The memory_lock is taken across map/access to avoid races with GPU reset. Also check if the memory is enabled, before waiting for GPU to be ready. Otherwise the readiness check would block for 30s. Lastly added PM handling wrapping on read/write access. 
Cc: Shameer Kolothum Cc: Alex Williamson Cc: Jason Gunthorpe Cc: Vikram Sethi Reviewed-by: Shameer Kolothum Suggested-by: Alex Williamson Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20251127170632.3477-7-ankita@nvidia.com Signed-off-by: Alex Williamson --- include/linux/vfio_pci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5569488ec4dc..336a0e58b443 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -188,6 +188,7 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite); +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, loff_t reg_start, size_t reg_cnt, loff_t *buf_offset, -- cgit v1.2.3 From 256a21743d911f94ce92fe28f793cd586f3860b2 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 6 Nov 2025 12:36:00 -0500 Subject: i3c: Add HDR API support Rename struct i3c_priv_xfer to struct i3c_xfer, since private xfer in the I3C spec refers only to SDR transfers. Ref: i3c spec ver1.2, section 3, Technical Overview. i3c_xfer will be used for both SDR and HDR. Rename enum i3c_hdr_mode to i3c_xfer_mode. Previous definition need match CCC GET_CAP1 bit position. Use 31 as SDR transfer mode. Add i3c_device_do_xfers() with an xfer mode argument, while keeping i3c_device_do_priv_xfers() as a wrapper that calls i3c_device_do_xfers() with I3C_SDR for backward compatibility. Introduce a 'cmd' field in struct i3c_xfer as an anonymous union with 'rnw', since HDR mode uses read/write commands instead of the SDR address bit. Add .i3c_xfers() callback for master controllers. If not implemented, fall back to SDR with .priv_xfers(). The .priv_xfers() API can be removed once all controllers switch to .i3c_xfers(). 
Add 'mode_mask' bitmask to advertise controller capability. Signed-off-by: Frank Li Link: https://patch.msgid.link/20251106-i3c_ddr-v11-1-33a6a66ed095@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 40 +++++++++++++++++++++++++++++----------- include/linux/i3c/master.h | 4 ++++ 2 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 7f136de4b73e..7f7738041f38 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -39,20 +39,25 @@ enum i3c_error_code { }; /** - * enum i3c_hdr_mode - HDR mode ids + * enum i3c_xfer_mode - I3C xfer mode ids * @I3C_HDR_DDR: DDR mode * @I3C_HDR_TSP: TSP mode * @I3C_HDR_TSL: TSL mode + * @I3C_SDR: SDR mode (NOT HDR mode) */ -enum i3c_hdr_mode { - I3C_HDR_DDR, - I3C_HDR_TSP, - I3C_HDR_TSL, +enum i3c_xfer_mode { + /* The below 3 value (I3C_HDR*) must match GETCAP1 Byte bit position */ + I3C_HDR_DDR = 0, + I3C_HDR_TSP = 1, + I3C_HDR_TSL = 2, + /* Use for default SDR transfer mode */ + I3C_SDR = 0x31, }; /** - * struct i3c_priv_xfer - I3C SDR private transfer + * struct i3c_xfer - I3C data transfer * @rnw: encodes the transfer direction. true for a read, false for a write + * @cmd: Read/Write command in HDR mode, read: 0x80 - 0xff, write: 0x00 - 0x7f * @len: transfer length in bytes of the transfer * @actual_len: actual length in bytes are transferred by the controller * @data: input/output buffer @@ -60,8 +65,11 @@ enum i3c_hdr_mode { * @data.out: output buffer. 
Must point to a DMA-able buffer * @err: I3C error code */ -struct i3c_priv_xfer { - u8 rnw; +struct i3c_xfer { + union { + u8 rnw; + u8 cmd; + }; u16 len; u16 actual_len; union { @@ -71,6 +79,9 @@ struct i3c_priv_xfer { enum i3c_error_code err; }; +/* keep back compatible */ +#define i3c_priv_xfer i3c_xfer + /** * enum i3c_dcr - I3C DCR values * @I3C_DCR_GENERIC_DEVICE: generic I3C device @@ -297,9 +308,15 @@ static __always_inline void i3c_i2c_driver_unregister(struct i3c_driver *i3cdrv, i3c_i2c_driver_unregister, \ __i2cdrv) -int i3c_device_do_priv_xfers(struct i3c_device *dev, - struct i3c_priv_xfer *xfers, - int nxfers); +int i3c_device_do_xfers(struct i3c_device *dev, struct i3c_xfer *xfers, + int nxfers, enum i3c_xfer_mode mode); + +static inline int i3c_device_do_priv_xfers(struct i3c_device *dev, + struct i3c_priv_xfer *xfers, + int nxfers) +{ + return i3c_device_do_xfers(dev, xfers, nxfers, I3C_SDR); +} int i3c_device_do_setdasa(struct i3c_device *dev); @@ -341,5 +358,6 @@ int i3c_device_request_ibi(struct i3c_device *dev, void i3c_device_free_ibi(struct i3c_device *dev); int i3c_device_enable_ibi(struct i3c_device *dev); int i3c_device_disable_ibi(struct i3c_device *dev); +u32 i3c_device_get_supported_xfer_mode(struct i3c_device *dev); #endif /* I3C_DEV_H */ diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index c52a82dd79a6..d0d5b3a9049f 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -474,9 +474,13 @@ struct i3c_master_controller_ops { const struct i3c_ccc_cmd *cmd); int (*send_ccc_cmd)(struct i3c_master_controller *master, struct i3c_ccc_cmd *cmd); + /* Deprecated, please use i3c_xfers() */ int (*priv_xfers)(struct i3c_dev_desc *dev, struct i3c_priv_xfer *xfers, int nxfers); + int (*i3c_xfers)(struct i3c_dev_desc *dev, + struct i3c_xfer *xfers, + int nxfers, enum i3c_xfer_mode mode); int (*attach_i2c_dev)(struct i2c_dev_desc *dev); void (*detach_i2c_dev)(struct i2c_dev_desc *dev); int (*i2c_xfers)(struct 
i2c_dev_desc *dev, -- cgit v1.2.3 From 9280b6ebbf08e53734d34f3bb325c37cddc1422d Mon Sep 17 00:00:00 2001 From: Frank Li Date: Thu, 6 Nov 2025 12:36:01 -0500 Subject: i3c: Switch to use new i3c_xfer from i3c_priv_xfer Switch to use i3c_xfer instead of i3c_priv_xfer because framework update to support HDR mode. i3c_priv_xfer is now an alias of i3c_xfer. Signed-off-by: Frank Li Link: https://patch.msgid.link/20251106-i3c_ddr-v11-2-33a6a66ed095@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index 7f7738041f38..ae0662d9d77e 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -27,7 +27,7 @@ * These are the standard error codes as defined by the I3C specification. * When -EIO is returned by the i3c_device_do_priv_xfers() or * i3c_device_send_hdr_cmds() one can check the error code in - * &struct_i3c_priv_xfer.err or &struct i3c_hdr_cmd.err to get a better idea of + * &struct_i3c_xfer.err or &struct i3c_hdr_cmd.err to get a better idea of * what went wrong. * */ @@ -312,7 +312,7 @@ int i3c_device_do_xfers(struct i3c_device *dev, struct i3c_xfer *xfers, int nxfers, enum i3c_xfer_mode mode); static inline int i3c_device_do_priv_xfers(struct i3c_device *dev, - struct i3c_priv_xfer *xfers, + struct i3c_xfer *xfers, int nxfers) { return i3c_device_do_xfers(dev, xfers, nxfers, I3C_SDR); -- cgit v1.2.3 From cb2dc6d2869a4fb7ef8d792a81a74bc6f0958a72 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Sat, 29 Nov 2025 10:05:00 +0100 Subject: can: Kconfig: select CAN driver infrastructure by default The CAN bus support enabled with CONFIG_CAN provides a socket-based access to CAN interfaces. With the introduction of the latest CAN protocol CAN XL additional configuration status information needs to be exposed to the network layer than formerly provided by standard Linux network drivers. 
This requires the CAN driver infrastructure to be selected by default. As the CAN network layer can only operate on CAN interfaces anyway, all distributions and common default configs enable at least one CAN driver. So selecting CONFIG_CAN_DEV when CONFIG_CAN is selected by the user has no effect on established configurations but solves potential build issues when CONFIG_CAN[_XXX]=y is set together with CONFIG_CAN_DEV=m. Fixes: 1a620a723853 ("can: raw: instantly reject unsupported CAN frames") Reported-by: Vincent Mailhol Closes: https://lore.kernel.org/all/CAMZ6RqL_nGszwoLPXn1Li8op-ox4k3Hs6p=Hw6+w0W=DTtobPw@mail.gmail.com/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511280531.YnWW2Rxc-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202511280842.djCQ0N0O-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202511282325.uVQFRTkA-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202511291520.guIE1QHj-lkp@intel.com/ Suggested-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20251129090500.17484-1-socketcan@hartkopp.net Signed-off-by: Marc Kleine-Budde --- include/linux/can/dev.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/can/dev.h b/include/linux/can/dev.h index 52c8be5c160e..f6416a56e95d 100644 --- a/include/linux/can/dev.h +++ b/include/linux/can/dev.h @@ -111,7 +111,14 @@ struct net_device *alloc_candev_mqs(int sizeof_priv, unsigned int echo_skb_max, void free_candev(struct net_device *dev); /* a candev safe wrapper around netdev_priv */ +#if IS_ENABLED(CONFIG_CAN_NETLINK) struct can_priv *safe_candev_priv(struct net_device *dev); +#else +static inline struct can_priv *safe_candev_priv(struct net_device *dev) +{ + return NULL; +} +#endif int open_candev(struct net_device *dev); void close_candev(struct net_device *dev); -- cgit v1.2.3 From 414690746d2da0dc9a931f8c02d83e5834141251 Mon Sep 17 00:00:00 2001 From: Randy 
Dunlap Date: Mon, 24 Nov 2025 18:28:08 -0800 Subject: i2c: i2c.h: fix a bad kernel-doc line Change an empty line into a blank kernel-doc line to prevent a kernel-doc warning: Warning: ../include/uapi/linux/i2c.h:38 bad line: Fixes: bfb3939c51d5 ("i2c: refactor documentation of struct i2c_msg") Signed-off-by: Randy Dunlap Signed-off-by: Wolfram Sang --- include/uapi/linux/i2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/i2c.h b/include/uapi/linux/i2c.h index a2db2a56c8b0..2a226657d9f8 100644 --- a/include/uapi/linux/i2c.h +++ b/include/uapi/linux/i2c.h @@ -36,7 +36,7 @@ * * Only if I2C_FUNC_NOSTART is set: * %I2C_M_NOSTART: skip repeated start sequence - + * * Only if I2C_FUNC_PROTOCOL_MANGLING is set: * %I2C_M_NO_RD_ACK: in a read message, master ACK/NACK bit is skipped * %I2C_M_IGNORE_NAK: treat NACK from client as ACK -- cgit v1.2.3 From beb7021a6003d9c6a463fffca0d6311efb8e0e66 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 28 Nov 2025 23:27:57 +0000 Subject: rqspinlock: Enclose lock/unlock within lock entry acquisitions Ritesh reported that timeouts occurred frequently for rqspinlock despite reentrancy on the same lock on the same CPU in [0]. This patch closes one of the races leading to this behavior, and reduces the frequency of timeouts. We currently have a tiny window between the fast-path cmpxchg and the grabbing of the lock entry where an NMI could land, attempt the same lock that was just acquired, and end up timing out. This is not ideal. Instead, move the lock entry acquisition from the fast path to before the cmpxchg, and remove the grabbing of the lock entry in the slow path, assuming it was already taken by the fast path. The TAS fallback is invoked directly without being preceded by the typical fast path, therefore we must continue to grab the deadlock detection entry in that case. Case on lock leading to missed AA: cmpxchg lock A ... rqspinlock acquisition of A ... 
timeout grab_held_lock_entry(A) There is a similar case when unlocking the lock. If the NMI lands between the WRITE_ONCE and smp_store_release, it is possible that we end up in a situation where the NMI fails to diagnose the AA condition, leading to a timeout. Case on unlock leading to missed AA: WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL) ... rqspinlock acquisition of A ... timeout smp_store_release(A->locked, 0) The patch changes the order on unlock to smp_store_release() followed by WRITE_ONCE() of NULL. This avoids the missed AA detection described above, but may lead to a false positive if the NMI lands between these two statements, which is acceptable (and preferred over a timeout). The original intention of the reverse order on unlock was to prevent the following possible misdiagnosis of an ABBA scenario: grab entry A lock A grab entry B lock B unlock B smp_store_release(B->locked, 0) grab entry B lock B grab entry A lock A ! WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL) If the store release were after the WRITE_ONCE, the other CPU would not observe B in the table of the CPU unlocking the lock B. However, since the threads are obviously participating in an ABBA deadlock, it is no longer appealing to use the order above since it may lead to a 250 ms timeout due to missed AA detection. 
[0]: https://lore.kernel.org/bpf/CAH6OuBTjG+N=+GGwcpOUbeDN563oz4iVcU3rbse68egp9wj9_A@mail.gmail.com Fixes: 0d80e7f951be ("rqspinlock: Choose trylock fallback for NMI waiters") Reported-by: Ritesh Oedayrajsingh Varma Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20251128232802.1031906-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/asm-generic/rqspinlock.h | 60 +++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h index 6d4244d643df..0f2dcbbfee2f 100644 --- a/include/asm-generic/rqspinlock.h +++ b/include/asm-generic/rqspinlock.h @@ -129,8 +129,8 @@ dec: * for lock B * release_held_lock_entry * - * try_cmpxchg_acquire for lock A * grab_held_lock_entry + * try_cmpxchg_acquire for lock A * * Lack of any ordering means reordering may occur such that dec, inc * are done before entry is overwritten. This permits a remote lock @@ -139,13 +139,8 @@ dec: * CPU holds a lock it is attempting to acquire, leading to false ABBA * diagnosis). * - * In case of unlock, we will always do a release on the lock word after - * releasing the entry, ensuring that other CPUs cannot hold the lock - * (and make conclusions about deadlocks) until the entry has been - * cleared on the local CPU, preventing any anomalies. Reordering is - * still possible there, but a remote CPU cannot observe a lock in our - * table which it is already holding, since visibility entails our - * release store for the said lock has not retired. + * The case of unlock is treated differently due to NMI reentrancy, see + * comments in res_spin_unlock. 
* * In theory we don't have a problem if the dec and WRITE_ONCE above get * reordered with each other, we either notice an empty NULL entry on @@ -175,10 +170,22 @@ static __always_inline int res_spin_lock(rqspinlock_t *lock) { int val = 0; - if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) { - grab_held_lock_entry(lock); + /* + * Grab the deadlock detection entry before doing the cmpxchg, so that + * reentrancy due to NMIs between the succeeding cmpxchg and creation of + * held lock entry can correctly detect an acquisition attempt in the + * interrupted context. + * + * cmpxchg lock A + * + * res_spin_lock(A) --> missed AA, leads to timeout + * + * grab_held_lock_entry(A) + */ + grab_held_lock_entry(lock); + + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) return 0; - } return resilient_queued_spin_lock_slowpath(lock, val); } @@ -192,28 +199,25 @@ static __always_inline void res_spin_unlock(rqspinlock_t *lock) { struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks); - if (unlikely(rqh->cnt > RES_NR_HELD)) - goto unlock; - WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); -unlock: /* - * Release barrier, ensures correct ordering. See release_held_lock_entry - * for details. Perform release store instead of queued_spin_unlock, - * since we use this function for test-and-set fallback as well. When we - * have CONFIG_QUEUED_SPINLOCKS=n, we clear the full 4-byte lockword. + * Release barrier, ensures correct ordering. Perform release store + * instead of queued_spin_unlock, since we use this function for the TAS + * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear + * the full 4-byte lockword. * - * Like release_held_lock_entry, we can do the release before the dec. - * We simply care about not seeing the 'lock' in our table from a remote - * CPU once the lock has been released, which doesn't rely on the dec. 
+ * Perform the smp_store_release before clearing the lock entry so that + * NMIs landing in the unlock path can correctly detect AA issues. The + * opposite order shown below may lead to missed AA checks: * - * Unlike smp_wmb(), release is not a two way fence, hence it is - * possible for a inc to move up and reorder with our clearing of the - * entry. This isn't a problem however, as for a misdiagnosis of ABBA, - * the remote CPU needs to hold this lock, which won't be released until - * the store below is done, which would ensure the entry is overwritten - * to NULL, etc. + * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL) + * + * res_spin_lock(A) --> missed AA, leads to timeout + * + * smp_store_release(A->locked, 0) */ smp_store_release(&lock->locked, 0); + if (likely(rqh->cnt <= RES_NR_HELD)) + WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL); this_cpu_dec(rqspinlock_held_locks.cnt); } -- cgit v1.2.3 From 2b6a3f061f11372af79b862d6184d43193ae927f Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:00:59 +0000 Subject: mm: declare VMA flags by bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "initial work on making VMA flags a bitmap", v3. We are in the rather silly situation that we are running out of VMA flags as they are currently limited to a system word in size. This leads to absurd situations where we limit features to 64-bit architectures only because we simply do not have the ability to add a flag for 32-bit ones. This is very constraining and leads to hacks or, in the worst case, simply an inability to implement features we want for entirely arbitrary reasons. This also of course gives us something of a Y2K type situation in mm where we might eventually exhaust all of the VMA flags even on 64-bit systems. This series lays the groundwork for getting away from this limitation by establishing VMA flags as a bitmap whose size we can increase in future beyond 64 bits if required. 
This is necessarily a highly iterative process given the extensive use of VMA flags throughout the kernel, so we start by performing basic steps. Firstly, we declare VMA flags by bit number rather than by value, retaining the VM_xxx fields but in terms of these newly introduced VMA_xxx_BIT fields. While we are here, we use sparse annotations to ensure that, when dealing with VMA bit number parameters, we cannot be passed values which are not declared as such - providing some useful type safety. We then introduce an opaque VMA flag type, much like the opaque mm_struct flag type introduced in commit bb6525f2f8c4 ("mm: add bitmap mm->flags field"), which we establish in union with vma->vm_flags (but still set at system word size meaning there is no functional or data type size change). We update the vm_flags_xxx() helpers to use this new bitmap, introducing sensible helpers to do so. This series lays the foundation for further work to expand the use of bitmap VMA flags and eventually eliminate these arbitrary restrictions. This patch (of 4): In order to lay the groundwork for VMA flags being a bitmap rather than a system word in size, we need to be able to consistently refer to VMA flags by bit number rather than value. Take this opportunity to do so in an enum, which is additionally useful for tooling to extract metadata from. This additionally makes it very clear which bits are being used for what at a glance. We use the VMA_ prefix for the bit values as it is logical to do so since these reference VMAs. We consistently suffix with _BIT to make it clear what the values refer to. We declare bit values even when the flags that use them would not be enabled by config options as this is simply clearer and clearly defines what bit numbers are used for what, at no additional cost. 
We declare a sparse-bitwise type vma_flag_t which ensures that users can't pass around invalid VMA flags by accident and prepares for future work towards VMA flags being a bitmap where we want to ensure bit values are type safe. To make life easier, we declare some macro helpers - DECLARE_VMA_BIT() allows us to avoid duplication in the enum bit number declarations (and maintaining the sparse __bitwise attribute), and INIT_VM_FLAG() is used to assist with declaration of flags. Unfortunately we can't declare both in the enum, as we run into issues with logic in the kernel requiring that flags are preprocessor definitions, and additionally we cannot have a macro which declares another macro so we must define each flag macro directly. Additionally, update the VMA userland testing vma_internal.h header to include these changes. We also have to fix the parameters to the vma_flag_*_atomic() functions since VMA_MAYBE_GUARD_BIT is now of type vma_flag_t and sparse will complain otherwise. We have to update some rather silly if-deffery found in mm/task_mmu.c which would otherwise break. Finally, we update the rust binding helper as now it cannot auto-detect the flags at all. 
Link: https://lkml.kernel.org/r/cover.1764064556.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3a35e5a0bcfa00e84af24cbafc0653e74deda64a.1764064556.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 379 +++++++++++++++++++++++++++++------------------------ 1 file changed, 211 insertions(+), 168 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 75f894c3f521..a2f38fb68840 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -271,185 +271,239 @@ extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif -#define VM_MAYBE_GUARD_BIT 11 - /* * vm_flags in vm_area_struct, see mm_types.h. 
* When changing, update also include/trace/events/mmflags.h */ -#define VM_NONE 0x00000000 -#define VM_READ 0x00000001 /* currently active flags */ -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 +#define VM_NONE 0x00000000 -/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ -#define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_MAYSHARE 0x00000080 +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; -#define VM_GROWSDOWN 0x00000100 /* general info on the segment */ +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = (VMA_ ## aliased ## _BIT) +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
*/ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ #ifdef CONFIG_MMU -#define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ -#else /* CONFIG_MMU */ -#define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ -#define VM_UFFD_MISSING 0 + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), #endif /* CONFIG_MMU */ -#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ -#define VM_MAYBE_GUARD BIT(VM_MAYBE_GUARD_BIT) /* The VMA maybe contains guard regions. */ -#define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ - -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ - - /* Used by sys_madvise() */ -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ - -#define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ -#define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ -#define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ -#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ -#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ -#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ -#define VM_SYNC 0x00800000 /* Synchronous page faults */ -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. 
*/ -#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ - + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. 
It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ +#ifdef CONFIG_STACK_GROWSUP + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; +#undef DECLARE_VMA_BIT +#undef DECLARE_VMA_BIT_ALIAS + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) 
+#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) #ifdef CONFIG_MEM_SOFT_DIRTY -# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) #else -# define VM_SOFTDIRTY 0 +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE #endif - -#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ -#define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ -#define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ -#define VM_MERGEABLE BIT(31) /* KSM may merge identical pages */ - -#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS -#define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ -#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) -#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) -#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) -#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) -#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) -#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) -#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) -#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ - #ifdef CONFIG_ARCH_HAS_PKEYS -# define VM_PKEY_SHIFT 
VM_HIGH_ARCH_BIT_0 -# define VM_PKEY_BIT0 VM_HIGH_ARCH_0 -# define VM_PKEY_BIT1 VM_HIGH_ARCH_1 -# define VM_PKEY_BIT2 VM_HIGH_ARCH_2 +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. */ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) #if CONFIG_ARCH_PKEY_BITS > 3 -# define VM_PKEY_BIT3 VM_HIGH_ARCH_3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) #else -# define VM_PKEY_BIT3 0 -#endif +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ #if CONFIG_ARCH_PKEY_BITS > 4 -# define VM_PKEY_BIT4 VM_HIGH_ARCH_4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) #else -# define VM_PKEY_BIT4 0 -#endif +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ #endif /* CONFIG_ARCH_HAS_PKEYS */ - -#ifdef CONFIG_X86_USER_SHADOW_STACK -/* - * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of - * support core mm. - * - * These VMAs will get a single end guard page. This helps userspace protect - * itself from attacks. A single page is enough for current shadow stack archs - * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c - * for more details on the guard size. - */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_5 -#endif - -#if defined(CONFIG_ARM64_GCS) -/* - * arm64's Guarded Control Stack implements similar functionality and - * has similar constraints to shadow stacks. 
- */ -# define VM_SHADOW_STACK VM_HIGH_ARCH_6 -#endif - -#ifndef VM_SHADOW_STACK -# define VM_SHADOW_STACK VM_NONE +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE #endif - #if defined(CONFIG_PPC64) -# define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ +#define VM_SAO INIT_VM_FLAG(SAO) #elif defined(CONFIG_PARISC) -# define VM_GROWSUP VM_ARCH_1 +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) #elif defined(CONFIG_SPARC64) -# define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ -# define VM_ARCH_CLEAR VM_SPARC_ADI +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif defined(CONFIG_ARM64) -# define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ -# define VM_ARCH_CLEAR VM_ARM64_BTI +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) #elif !defined(CONFIG_MMU) -# define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ -#endif - -#if defined(CONFIG_ARM64_MTE) -# define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ -# define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ -#else -# define VM_MTE VM_NONE -# define VM_MTE_ALLOWED VM_NONE +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) #endif - #ifndef VM_GROWSUP -# define VM_GROWSUP VM_NONE +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE #endif - #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR -# define VM_UFFD_MINOR_BIT 41 -# define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ -#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ -# define VM_UFFD_MINOR VM_NONE -#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ - -/* - * This flag is used to connect VFIO to arch specific KVM 
code. It - * indicates that the memory under this VMA is safe for use with any - * non-cachable memory type inside KVM. Some VFIO devices, on some - * platforms, are thought to be unsafe and can cause machine crashes - * if KVM does not lock down the memory type. - */ -#ifdef CONFIG_64BIT -#define VM_ALLOW_ANY_UNCACHED_BIT 39 -#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else -#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_UFFD_MINOR VM_NONE #endif - #ifdef CONFIG_64BIT -#define VM_DROPPABLE_BIT 40 -#define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) -#elif defined(CONFIG_PPC32) -#define VM_DROPPABLE VM_ARCH_1 +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) #else -#define VM_DROPPABLE VM_NONE +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE #endif - -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) #else -#define VM_SEALED VM_NONE +#define VM_DROPPABLE VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ @@ -475,12 +529,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) -#ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN +#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS +#define VM_SEALED_SYSMAP VM_SEALED #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_SEALED_SYSMAP VM_NONE #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) @@ -488,7 +540,6 @@ extern unsigned int kobjsize(const void *objp); /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) - /* * Special vmas that are non-mergable, non-mlock()able. 
*/ @@ -523,7 +574,7 @@ extern unsigned int kobjsize(const void *objp); /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR -# define VM_ARCH_CLEAR VM_NONE +#define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) @@ -920,9 +971,9 @@ static inline void vm_flags_mod(struct vm_area_struct *vma, } static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, - int bit) + vma_flag_t bit) { - const vm_flags_t mask = BIT(bit); + const vm_flags_t mask = BIT((__force int)bit); /* Only specific flags are permitted */ if (WARN_ON_ONCE(!(mask & VM_ATOMIC_SET_ALLOWED))) @@ -935,14 +986,15 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, * Set VMA flag atomically. Requires only VMA/mmap read lock. Only specific * valid flags are allowed to do this. */ -static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) +static inline void vma_flag_set_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { /* mmap read lock/VMA read lock must be held. */ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); if (__vma_flag_atomic_valid(vma, bit)) - set_bit(bit, &ACCESS_PRIVATE(vma, __vm_flags)); + set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags)); } /* @@ -952,10 +1004,11 @@ static inline void vma_flag_set_atomic(struct vm_area_struct *vma, int bit) * This is necessarily racey, so callers must ensure that serialisation is * achieved through some other means, or that races are permissible. 
*/ -static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, int bit) +static inline bool vma_flag_test_atomic(struct vm_area_struct *vma, + vma_flag_t bit) { if (__vma_flag_atomic_valid(vma, bit)) - return test_bit(bit, &vma->vm_flags); + return test_bit((__force int)bit, &vma->vm_flags); return false; } @@ -4517,16 +4570,6 @@ int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *st int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); - -/* - * mseal of userspace process's system mappings. - */ -#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS -#define VM_SEALED_SYSMAP VM_SEALED -#else -#define VM_SEALED_SYSMAP VM_NONE -#endif - /* * DMA mapping IDs for page_pool * -- cgit v1.2.3 From 58eac97a8ba0bcfc5dffb347e40ea3006347ff38 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:00 +0000 Subject: mm: simplify and rename mm flags function for clarity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The __mm_flags_set_word() function is slightly ambiguous - we use 'set' to refer to setting individual bits (such as in mm_flags_set()) but here we use it to refer to overwriting the value altogether. Rename it to __mm_flags_overwrite_word() to eliminate this ambiguity. We additionally simplify the functions, eliminating unnecessary bitmap_xxx() operations (the compiler would have optimised these out but it's worth being as clear as we can be here). 
Link: https://lkml.kernel.org/r/8f0bc556e1b90eca8ea5eba41f8d5d3f9cd7c98a.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4f66a3206a63..3550672e0f9e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1314,15 +1314,13 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; -/* Set the first system word of mm flags, non-atomically. */ -static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +/* Copy value to the first system word of mm flags, non-atomically. 
*/ +static inline void __mm_flags_overwrite_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); - - bitmap_copy(bitmap, &value, BITS_PER_LONG); + *ACCESS_PRIVATE(&mm->flags, __mm_flags) = value; } -/* Obtain a read-only view of the bitmap. */ +/* Obtain a read-only view of the mm flags bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); @@ -1331,9 +1329,7 @@ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct /* Read the first system word of mm flags, non-atomically. */ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) { - const unsigned long *bitmap = __mm_flags_get_bitmap(mm); - - return bitmap_read(bitmap, 0, BITS_PER_LONG); + return *__mm_flags_get_bitmap(mm); } /* -- cgit v1.2.3 From 9ea35a25d51b13013b724943a177a7aaf4bfed71 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 25 Nov 2025 10:01:02 +0000 Subject: mm: introduce VMA flags bitmap type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is useful to transition to using a bitmap for VMA flags so we can avoid running out of flags, especially for 32-bit kernels which are constrained to 32 flags, necessitating some features to be limited to 64-bit kernels only. By doing so, we remove any constraint on the number of VMA flags moving forwards no matter the platform and can decide in future to extend beyond 64 if required. We start by declaring an opaque type, vma_flags_t (which resembles mm_struct flags of type mm_flags_t), setting it to precisely the same size as vm_flags_t, and place it in union with vm_flags in the VMA declaration. We additionally update struct vm_area_desc equivalently placing the new opaque type in union with vm_flags. 
This change therefore does not impact the size of struct vm_area_struct or struct vm_area_desc. In order for the change to be iterative and to avoid impacting performance, we designate VM_xxx declared bitmap flag values as those which must exist in the first system word of the VMA flags bitmap. We therefore declare vma_flags_clear_all(), vma_flags_overwrite_word(), vma_flags_overwrite_word_once(), vma_flags_set_word() and vma_flags_clear_word() in order to allow us to update the existing vm_flags_*() functions to utilise these helpers. This is a stepping stone towards converting users to the VMA flags bitmap and behaves precisely as before. By doing this, we can eliminate the existing private vma->__vm_flags field in the vma->vm_flags union and replace it with the newly introduced opaque type vma_flags_t, which we call flags so we refer to the new bitmap field as vma->flags. We update vma_flag_[test, set]_atomic() to account for the change also. We adapt vm_flags_reset_once() to only clear those bits above the first system word providing write-once semantics to the first system word (which it is presumed the caller requires - and in all current use cases this is so). As we currently only specify that the VMA flags bitmap size is equal to BITS_PER_LONG number of bits, this is a noop, but is defensive in preparation for a future change that increases this. We additionally update the VMA userland test declarations to implement the same changes there. Finally, we update the Rust code to reference vma->vm_flags on update rather than vma->__vm_flags which has been removed. This is safe for now, albeit it is implicitly performing a const cast. Once we introduce flag helpers we can improve this more. No functional change intended. 
Link: https://lkml.kernel.org/r/bab179d7b153ac12f221b7d65caac2759282cfe9.1764064557.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Pedro Falcato Acked-by: Alice Ryhl [rust] Cc: Alex Gaynor Cc: Alistair Popple Cc: Andreas Hindborg Cc: Axel Rasmussen Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Björn Roy Baron Cc: Boqun Feng Cc: Byungchul Park Cc: Chengming Zhou Cc: Chris Li Cc: Danilo Krummrich Cc: David Hildenbrand Cc: David Rientjes Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gary Guo Cc: Gregory Price Cc: "Huang, Ying" Cc: Ingo Molnar Cc: Jann Horn Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Joshua Hahn Cc: Juri Lelli Cc: Kairui Song Cc: Kees Cook Cc: Kemeng Shi Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Mathew Brost Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Miguel Ojeda Cc: Mike Rapoport Cc: Muchun Song Cc: Nhat Pham Cc: Nico Pache Cc: Oscar Salvador Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Rakie Kim Cc: Rik van Riel Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Valentin Schneider Cc: Vincent Guittot Cc: Wei Xu Cc: xu xin Cc: Yuanchu Xie Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 24 ++++++++++++++---- include/linux/mm_types.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 81 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2f38fb68840..2887d3b34d3e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -911,7 +911,8 @@ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); - ACCESS_PRIVATE(vma, __vm_flags) = flags; + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); } /* @@ -931,14 +932,25 @@ static inline void vm_flags_reset_once(struct 
vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); - WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + /* + * If VMA flags exist beyond the first system word, also clear these. It + * is assumed the write once behaviour is required only for the first + * system word. + */ + if (NUM_VMA_FLAG_BITS > BITS_PER_LONG) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + + bitmap_zero(&bitmap[1], NUM_VMA_FLAG_BITS - BITS_PER_LONG); + } + + vma_flags_overwrite_word_once(&vma->flags, flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) |= flags; + vma_flags_set_word(&vma->flags, flags); } static inline void vm_flags_clear(struct vm_area_struct *vma, @@ -946,7 +958,7 @@ static inline void vm_flags_clear(struct vm_area_struct *vma, { VM_WARN_ON_ONCE(!pgtable_supports_soft_dirty() && (flags & VM_SOFTDIRTY)); vma_start_write(vma); - ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + vma_flags_clear_word(&vma->flags, flags); } /* @@ -989,12 +1001,14 @@ static inline bool __vma_flag_atomic_valid(struct vm_area_struct *vma, static inline void vma_flag_set_atomic(struct vm_area_struct *vma, vma_flag_t bit) { + unsigned long *bitmap = ACCESS_PRIVATE(&vma->flags, __vma_flags); + /* mmap read lock/VMA read lock must be held. */ if (!rwsem_is_locked(&vma->vm_mm->mmap_lock)) vma_assert_locked(vma); if (__vma_flag_atomic_valid(vma, bit)) - set_bit((__force int)bit, &ACCESS_PRIVATE(vma, __vm_flags)); + set_bit((__force int)bit, bitmap); } /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3550672e0f9e..b71625378ce3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -848,6 +848,15 @@ struct mmap_action { bool hide_from_rmap_until_complete :1; }; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. 
+ */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -865,7 +874,10 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ @@ -910,10 +922,12 @@ struct vm_area_struct { /* * Flags, see mm.h. * To modify use vm_flags_{init|reset|set|clear|mod} functions. + * Preferably, use vma_flags_xxx() functions. */ union { + /* Temporary while VMA flags are being converted. */ const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -994,6 +1008,52 @@ struct vm_area_struct { #endif } __randomize_layout; +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. 
*/ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy) #else -- cgit v1.2.3 From f3b566d726357df591602f195a9379494f005225 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Wed, 26 Nov 2025 02:04:35 +0000 Subject: memcg: remove inc/dec_lruvec_kmem_state helpers The dec_lruvec_kmem_state helper is unused by any caller and can be safely removed. Meanwhile, the inc_lruvec_kmem_state helper is only referenced by shadow_lru_isolate, retaining these two helpers is unnecessary. This patch removes both helper functions to eliminate redundant code. Link: https://lkml.kernel.org/r/20251126020435.1511637-1-chenridong@huaweicloud.com Signed-off-by: Chen Ridong Acked-by: Qi Zheng Acked-by: Shakeel Butt Cc: Axel Rasmussen Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Lu Jialin Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d35390f9892a..0651865a4564 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1452,16 +1452,6 @@ struct slabobj_ext { #endif } __aligned(8); -static inline void inc_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - mod_lruvec_kmem_state(p, idx, 1); -} - -static inline void dec_lruvec_kmem_state(void *p, enum node_stat_item idx) -{ - mod_lruvec_kmem_state(p, idx, -1); -} - static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; -- 
cgit v1.2.3 From 127fa2ae9e2b1f9b9d876dfaa39fe3640cec5764 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Sat, 18 Oct 2025 20:41:36 +0300 Subject: KEYS: trusted: Replace a redundant instance of tpm2_hash_map 'trusted_tpm2' duplicates 'tpm2_hash_map' originally part of the TPM driver, which is suboptimal. Implement and export `tpm2_find_hash_alg()` in the driver, and substitute the redundant code in 'trusted_tpm2' with a call to the new function. Reviewed-by: Jonathan McDowell Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index dc0338a783f3..b15360ff78d7 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -473,6 +473,7 @@ extern int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx, extern int tpm_get_random(struct tpm_chip *chip, u8 *data, size_t max); extern struct tpm_chip *tpm_default_chip(void); void tpm2_flush_context(struct tpm_chip *chip, u32 handle); +int tpm2_find_hash_alg(unsigned int crypto_id); static inline void tpm_buf_append_empty_auth(struct tpm_buf *buf, u32 handle) { -- cgit v1.2.3 From 2b092175f5e301cdaa935093edfef2be9defb6df Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Nov 2025 16:06:41 -0500 Subject: NFS: Fix inheritance of the block sizes when automounting Only inherit the block sizes that were actually specified as mount parameters for the parent mount. 
Fixes: 62a55d088cd8 ("NFS: Additional refactoring for fs_context conversion") Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4ba04de6b1ca..c58b870f31ee 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -172,6 +172,11 @@ struct nfs_server { #define NFS_MOUNT_FORCE_RDIRPLUS 0x20000000 #define NFS_MOUNT_NETUNREACH_FATAL 0x40000000 + unsigned int automount_inherit; /* Properties inherited by automount */ +#define NFS_AUTOMOUNT_INHERIT_BSIZE 0x0001 +#define NFS_AUTOMOUNT_INHERIT_RSIZE 0x0002 +#define NFS_AUTOMOUNT_INHERIT_WSIZE 0x0004 + unsigned int caps; /* server capabilities */ __u64 fattr_valid; /* Valid attributes */ unsigned int rsize; /* read size */ -- cgit v1.2.3 From 205dd7a5d6ad6f4c8e8fcd3c3b95a7c0e7067fee Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 18 Nov 2025 18:06:31 -0500 Subject: virtio_pci: drop kernel.h virtio UAPI headers really have no business pulling in kernel.h Replace it with const.h which seems to be what's needed for __KERNEL_DIV_ROUND_UP. Fixes: 7c1ae151e812 ("virtio_pci: Introduce device parts access commands") Cc: Yishai Hadas Cc: Alex Williamson Message-ID: <7a73b6c6af67e13b86633cd7bf11ad56b5d9809b.1763535341.git.mst@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- include/uapi/linux/virtio_pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index c691ac210ce2..e732e3456e27 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -40,7 +40,7 @@ #define _LINUX_VIRTIO_PCI_H #include <linux/types.h> -#include <linux/kernel.h> +#include <linux/const.h> #ifndef VIRTIO_PCI_NO_LEGACY -- cgit v1.2.3 From 51d7a054521de7085783a9a1ba15c3530863409a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 5 Nov 2025 15:23:50 +0100 Subject: locking/mutex: Redo __mutex_init() to reduce generated code size mutex_init() invokes __mutex_init() providing the name of the lock and a pointer to the lock class. With LOCKDEP enabled this information is useful but without LOCKDEP it is not used at all. Passing the pointer information of the lock class might be considered negligible but the name of the lock is passed as well and the string is stored. This information is wasting storage. Split __mutex_init() into a _generic() variant doing the initialisation of the lock and a _lockdep() version which does _generic() plus the lockdep bits. Restrict the lockdep version to lockdep enabled builds allowing the compiler to remove the unused parameter. 
This results in the following size reduction: text data bss dec filename | 30237599 8161430 1176624 39575653 vmlinux.defconfig | 30233269 8149142 1176560 39558971 vmlinux.defconfig.patched -4.2KiB -12KiB | 32455099 8471098 12934684 53860881 vmlinux.defconfig.lockdep | 32455100 8471098 12934684 53860882 vmlinux.defconfig.patched.lockdep | 27152407 7191822 2068040 36412269 vmlinux.defconfig.preempt_rt | 27145937 7183630 2067976 36397543 vmlinux.defconfig.patched.preempt_rt -6.3KiB -8KiB | 29382020 7505742 13784608 50672370 vmlinux.defconfig.preempt_rt.lockdep | 29376229 7505742 13784544 50666515 vmlinux.defconfig.patched.preempt_rt.lockdep -5.6KiB [peterz: folded fix from boqun] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Waiman Long Link: https://lkml.kernel.org/r/20251125145425.68319-1-boqun.feng@gmail.com Link: https://patch.msgid.link/20251105142350.Tfeevs2N@linutronix.de --- include/linux/mutex.h | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 847b81ca6436..bf535f0118bb 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -86,8 +86,23 @@ do { \ #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void __mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_init_lockep(lock, name, key); +} +#else +extern void mutex_init_generic(struct mutex *lock); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_init_generic(lock); +} +#endif /* 
!CONFIG_DEBUG_LOCK_ALLOC */ /** * mutex_is_locked - is the mutex locked @@ -111,17 +126,27 @@ extern bool mutex_is_locked(struct mutex *lock); #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) -extern void __mutex_rt_init(struct mutex *lock, const char *name, - struct lock_class_key *key); - #define mutex_is_locked(l) rt_mutex_base_is_locked(&(l)->rtmutex) -#define __mutex_init(mutex, name, key) \ -do { \ - rt_mutex_base_init(&(mutex)->rtmutex); \ - __mutex_rt_init((mutex), name, key); \ -} while (0) +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, + struct lock_class_key *key); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_rt_init_lockdep(lock, name, key); +} +#else +extern void mutex_rt_init_generic(struct mutex *mutex); + +static inline void __mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ + mutex_rt_init_generic(lock); +} +#endif /* !CONFIG_LOCKDEP */ #endif /* CONFIG_PREEMPT_RT */ #ifdef CONFIG_DEBUG_MUTEXES -- cgit v1.2.3 From 719e357fc09c63238956eb7cd546627f9e050640 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 27 Nov 2025 15:41:40 +0100 Subject: locking/local_lock: s/l/__l/ and s/tl/__tl/ to reduce the risk of shadowing The Linux kernel coding style advises to avoid common variable names in function-like macros to reduce the risk of namespace collisions. Throughout local_lock_internal.h, several macros use the rather common variable names 'l' and 'tl'. This already resulted in an actual collision: the __local_lock_acquire() function like macro is currently shadowing the parameter 'l' of the: class_##_name##_t class_##_name##_constructor(_type *l) function factory from . Rename the variable 'l' to '__l' and the variable 'tl' to '__tl' throughout the file to fix the current namespace collision and to prevent future ones. 
[ bigeasy: Rebase, update all l and tl instances in macros ] Signed-off-by: Vincent Mailhol Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Waiman Long Link: https://patch.msgid.link/20251127144140.215722-3-bigeasy@linutronix.de --- include/linux/local_lock_internal.h | 62 ++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index a4dc479157b5..8f82b4eb542f 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -99,18 +99,18 @@ do { \ #define __local_lock_acquire(lock) \ do { \ - local_trylock_t *tl; \ - local_lock_t *l; \ + local_trylock_t *__tl; \ + local_lock_t *__l; \ \ - l = (local_lock_t *)(lock); \ - tl = (local_trylock_t *)l; \ + __l = (local_lock_t *)(lock); \ + __tl = (local_trylock_t *)__l; \ _Generic((lock), \ local_trylock_t *: ({ \ - lockdep_assert(tl->acquired == 0); \ - WRITE_ONCE(tl->acquired, 1); \ + lockdep_assert(__tl->acquired == 0); \ + WRITE_ONCE(__tl->acquired, 1); \ }), \ local_lock_t *: (void)0); \ - local_lock_acquire(l); \ + local_lock_acquire(__l); \ } while (0) #define __local_lock(lock) \ @@ -133,36 +133,36 @@ do { \ #define __local_trylock(lock) \ ({ \ - local_trylock_t *tl; \ + local_trylock_t *__tl; \ \ preempt_disable(); \ - tl = (lock); \ - if (READ_ONCE(tl->acquired)) { \ + __tl = (lock); \ + if (READ_ONCE(__tl->acquired)) { \ preempt_enable(); \ - tl = NULL; \ + __tl = NULL; \ } else { \ - WRITE_ONCE(tl->acquired, 1); \ + WRITE_ONCE(__tl->acquired, 1); \ local_trylock_acquire( \ - (local_lock_t *)tl); \ + (local_lock_t *)__tl); \ } \ - !!tl; \ + !!__tl; \ }) #define __local_trylock_irqsave(lock, flags) \ ({ \ - local_trylock_t *tl; \ + local_trylock_t *__tl; \ \ local_irq_save(flags); \ - tl = (lock); \ - if (READ_ONCE(tl->acquired)) { \ + __tl = (lock); \ + if 
(READ_ONCE(__tl->acquired)) { \ local_irq_restore(flags); \ - tl = NULL; \ + __tl = NULL; \ } else { \ - WRITE_ONCE(tl->acquired, 1); \ + WRITE_ONCE(__tl->acquired, 1); \ local_trylock_acquire( \ - (local_lock_t *)tl); \ + (local_lock_t *)__tl); \ } \ - !!tl; \ + !!__tl; \ }) /* preemption or migration must be disabled before calling __local_lock_is_locked */ @@ -170,16 +170,16 @@ do { \ #define __local_lock_release(lock) \ do { \ - local_trylock_t *tl; \ - local_lock_t *l; \ + local_trylock_t *__tl; \ + local_lock_t *__l; \ \ - l = (local_lock_t *)(lock); \ - tl = (local_trylock_t *)l; \ - local_lock_release(l); \ + __l = (local_lock_t *)(lock); \ + __tl = (local_trylock_t *)__l; \ + local_lock_release(__l); \ _Generic((lock), \ local_trylock_t *: ({ \ - lockdep_assert(tl->acquired == 1); \ - WRITE_ONCE(tl->acquired, 0); \ + lockdep_assert(__tl->acquired == 1); \ + WRITE_ONCE(__tl->acquired, 0); \ }), \ local_lock_t *: (void)0); \ } while (0) @@ -223,12 +223,12 @@ typedef spinlock_t local_trylock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) #define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) -#define __local_lock_init(l) \ +#define __local_lock_init(__l) \ do { \ - local_spin_lock_init((l)); \ + local_spin_lock_init((__l)); \ } while (0) -#define __local_trylock_init(l) __local_lock_init(l) +#define __local_trylock_init(__l) __local_lock_init(__l) #define __local_lock(__lock) \ do { \ -- cgit v1.2.3 From 43decb6b628eb033a1b6188e5018773c0d38be1d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 27 Nov 2025 22:59:25 -0800 Subject: locking/local_lock: Fix all kernel-doc warnings Modify kernel-doc comments in local_lock.h to prevent warnings: Warning: include/linux/local_lock.h:9 function parameter 'lock' not described in 'local_lock_init' Warning: include/linux/local_lock.h:56 function parameter 'lock' not described in 'local_trylock_init' Warning: include/linux/local_lock.h:56 expecting prototype for 
local_lock_init(). Prototype was for local_trylock_init() instead Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/20251128065925.917917-1-rdunlap@infradead.org --- include/linux/local_lock.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 0d91d060e3e9..b0e6ab329b00 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -6,6 +6,7 @@ /** * local_lock_init - Runtime initialize a lock instance + * @lock: The lock variable */ #define local_lock_init(lock) __local_lock_init(lock) @@ -52,7 +53,8 @@ __local_unlock_irqrestore(this_cpu_ptr(lock), flags) /** - * local_lock_init - Runtime initialize a lock instance + * local_trylock_init - Runtime initialize a lock instance + * @lock: The lock variable */ #define local_trylock_init(lock) __local_trylock_init(lock) -- cgit v1.2.3 From bd45d46ffc8fa96e8ee9fa078cef53e0c1221ff4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Winiarski?= Date: Thu, 27 Nov 2025 10:39:33 +0100 Subject: drm/xe/pf: Export helpers for VFIO MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device specific VFIO driver variant for Xe will implement VF migration. Export everything that's needed for migration ops. 
Reviewed-by: Michal Wajdeczko Link: https://patch.msgid.link/20251127093934.1462188-4-michal.winiarski@intel.com Signed-off-by: Michał Winiarski (cherry picked from commit 17f22465c5a5573724c942ca7147b4024631ef87) Signed-off-by: Thomas Hellström --- include/drm/intel/xe_sriov_vfio.h | 143 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 143 insertions(+) create mode 100644 include/drm/intel/xe_sriov_vfio.h (limited to 'include') diff --git a/include/drm/intel/xe_sriov_vfio.h b/include/drm/intel/xe_sriov_vfio.h new file mode 100644 index 000000000000..e9814e8149fd --- /dev/null +++ b/include/drm/intel/xe_sriov_vfio.h @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2025 Intel Corporation + */ + +#ifndef _XE_SRIOV_VFIO_H_ +#define _XE_SRIOV_VFIO_H_ + +#include + +struct pci_dev; +struct xe_device; + +/** + * xe_sriov_vfio_get_pf() - Get PF &xe_device. + * @pdev: the VF &pci_dev device + * + * Return: pointer to PF &xe_device, NULL otherwise. + */ +struct xe_device *xe_sriov_vfio_get_pf(struct pci_dev *pdev); + +/** + * xe_sriov_vfio_migration_supported() - Check if migration is supported. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * + * Return: true if migration is supported, false otherwise. + */ +bool xe_sriov_vfio_migration_supported(struct xe_device *xe); + +/** + * xe_sriov_vfio_wait_flr_done() - Wait for VF FLR completion. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will wait until VF FLR is processed by PF on all tiles (or + * until timeout occurs). + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_wait_flr_done(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_suspend_device() - Suspend VF. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will pause VF on all tiles/GTs. 
+ * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_suspend_device(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_device() - Resume VF. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * This function will resume VF on all tiles. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_device(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_stop_copy_enter() - Initiate a VF device migration data save. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_stop_copy_enter(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_stop_copy_exit() - Finish a VF device migration data save. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_stop_copy_exit(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_data_enter() - Initiate a VF device migration data restore. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_data_enter(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_resume_data_exit() - Finish a VF device migration data restore. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_resume_data_exit(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_error() - Move VF device to error state. 
+ * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Reset is needed to move it out of error state. + * + * Return: 0 on success or a negative error code on failure. + */ +int xe_sriov_vfio_error(struct xe_device *xe, unsigned int vfid); + +/** + * xe_sriov_vfio_data_read() - Read migration data from the VF device. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * @buf: start address of userspace buffer + * @len: requested read size from userspace + * + * Return: number of bytes that has been successfully read, + * 0 if no more migration data is available, -errno on failure. + */ +ssize_t xe_sriov_vfio_data_read(struct xe_device *xe, unsigned int vfid, + char __user *buf, size_t len); +/** + * xe_sriov_vfio_data_write() - Write migration data to the VF device. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * @buf: start address of userspace buffer + * @len: requested write size from userspace + * + * Return: number of bytes that has been successfully written, -errno on failure. + */ +ssize_t xe_sriov_vfio_data_write(struct xe_device *xe, unsigned int vfid, + const char __user *buf, size_t len); +/** + * xe_sriov_vfio_stop_copy_size() - Get a size estimate of VF device migration data. + * @xe: the PF &xe_device obtained by calling xe_sriov_vfio_get_pf() + * @vfid: the VF identifier (can't be 0) + * + * Return: migration data size in bytes or a negative error code on failure. 
+ */ +ssize_t xe_sriov_vfio_stop_copy_size(struct xe_device *xe, unsigned int vfid); + +#endif -- cgit v1.2.3 From 611cf41ef6ac8301d23daadd8e78b013db0c5071 Mon Sep 17 00:00:00 2001 From: Yongxin Liu Date: Fri, 28 Nov 2025 18:24:38 +0800 Subject: platform/x86: intel_pmc_ipc: fix ACPI buffer memory leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intel_pmc_ipc() function uses ACPI_ALLOCATE_BUFFER to allocate memory for the ACPI evaluation result but never frees it, causing a 192-byte memory leak on each call. This leak is triggered during network interface initialization when the stmmac driver calls intel_mac_finish() -> intel_pmc_ipc(). unreferenced object 0xffff96a848d6ea80 (size 192): comm "dhcpcd", pid 541, jiffies 4294684345 hex dump (first 32 bytes): 04 00 00 00 05 00 00 00 98 ea d6 48 a8 96 ff ff ...........H.... 00 00 00 00 00 00 00 00 01 00 00 00 00 00 00 00 ................ backtrace (crc b1564374): kmemleak_alloc+0x2d/0x40 __kmalloc_noprof+0x2fa/0x730 acpi_ut_initialize_buffer+0x83/0xc0 acpi_evaluate_object+0x29a/0x2f0 intel_pmc_ipc+0xfd/0x170 intel_mac_finish+0x168/0x230 stmmac_mac_finish+0x3d/0x50 phylink_major_config+0x22b/0x5b0 phylink_mac_initial_config.constprop.0+0xf1/0x1b0 phylink_start+0x8e/0x210 __stmmac_open+0x12c/0x2b0 stmmac_open+0x23c/0x380 __dev_open+0x11d/0x2c0 __dev_change_flags+0x1d2/0x250 netif_change_flags+0x2b/0x70 dev_change_flags+0x40/0xb0 Add __free(kfree) for ACPI object to properly release the allocated buffer. 
Cc: stable@vger.kernel.org Fixes: 7e2f7e25f6ff ("arch: x86: add IPC mailbox accessor function and add SoC register access") Signed-off-by: Yongxin Liu Link: https://patch.msgid.link/20251128102437.3412891-2-yongxin.liu@windriver.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen --- include/linux/platform_data/x86/intel_pmc_ipc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/platform_data/x86/intel_pmc_ipc.h b/include/linux/platform_data/x86/intel_pmc_ipc.h index 1d34435b7001..85ea381e4a27 100644 --- a/include/linux/platform_data/x86/intel_pmc_ipc.h +++ b/include/linux/platform_data/x86/intel_pmc_ipc.h @@ -9,6 +9,7 @@ #ifndef INTEL_PMC_IPC_H #define INTEL_PMC_IPC_H #include +#include #define IPC_SOC_REGISTER_ACCESS 0xAA #define IPC_SOC_SUB_CMD_READ 0x00 @@ -48,7 +49,6 @@ static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf {.type = ACPI_TYPE_INTEGER,}, }; struct acpi_object_list arg_list = { PMC_IPCS_PARAM_COUNT, params }; - union acpi_object *obj; int status; if (!ipc_cmd || !rbuf) @@ -72,7 +72,7 @@ static inline int intel_pmc_ipc(struct pmc_ipc_cmd *ipc_cmd, struct pmc_ipc_rbuf if (ACPI_FAILURE(status)) return -ENODEV; - obj = buffer.pointer; + union acpi_object *obj __free(kfree) = buffer.pointer; if (obj && obj->type == ACPI_TYPE_PACKAGE && obj->package.count == VALID_IPC_RESPONSE) { -- cgit v1.2.3 From 33b2835f0b7e2a458473b0e3a23b54b92108b6b0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 2 Sep 2025 11:11:40 -0400 Subject: Bluetooth: HCI: Add initial support for PAST This adds PAST related commands (HCI_OP_LE_PAST, HCI_OP_LE_PAST_SET_INFO and HCI_OP_LE_PAST_PARAMS) and events (HCI_EV_LE_PAST_RECEIVED) along with handling of PAST sender and receiver feature bits including new MGMT settings (MGMT_SETTING_PAST_SENDER and MGMT_SETTING_PAST_RECEIVER) which userspace can use to determine if PAST is supported by the controller. 
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 54 ++++++++++++++++++++++++++++++++++++++++ include/net/bluetooth/hci_core.h | 12 +++++++++ include/net/bluetooth/mgmt.h | 2 ++ 3 files changed, 68 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index cb4c02d00759..d883ad233ebc 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -647,6 +647,8 @@ enum { #define HCI_LE_EXT_ADV 0x10 #define HCI_LE_PERIODIC_ADV 0x20 #define HCI_LE_CHAN_SEL_ALG2 0x40 +#define HCI_LE_PAST_SENDER 0x01 +#define HCI_LE_PAST_RECEIVER 0x02 #define HCI_LE_CIS_CENTRAL 0x10 #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 @@ -2068,6 +2070,44 @@ struct hci_cp_le_set_privacy_mode { __u8 mode; } __packed; +#define HCI_OP_LE_PAST 0x205a +struct hci_cp_le_past { + __le16 handle; + __le16 service_data; + __le16 sync_handle; +} __packed; + +struct hci_rp_le_past { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_PAST_SET_INFO 0x205b +struct hci_cp_le_past_set_info { + __le16 handle; + __le16 service_data; + __u8 adv_handle; +} __packed; + +struct hci_rp_le_past_set_info { + __u8 status; + __le16 handle; +} __packed; + +#define HCI_OP_LE_PAST_PARAMS 0x205c +struct hci_cp_le_past_params { + __le16 handle; + __u8 mode; + __le16 skip; + __le16 sync_timeout; + __u8 cte_type; +} __packed; + +struct hci_rp_le_past_params { + __u8 status; + __le16 handle; +} __packed; + #define HCI_OP_LE_READ_BUFFER_SIZE_V2 0x2060 struct hci_rp_le_read_buffer_size_v2 { __u8 status; @@ -2800,6 +2840,20 @@ struct hci_evt_le_ext_adv_set_term { __u8 num_evts; } __packed; +#define HCI_EV_LE_PAST_RECEIVED 0x18 +struct hci_ev_le_past_received { + __u8 status; + __le16 handle; + __le16 service_data; + __le16 sync_handle; + __u8 sid; + __u8 bdaddr_type; + bdaddr_t bdaddr; + __u8 phy; + __le16 interval; + __u8 clock_accuracy; +} __packed; + #define HCI_EVT_LE_CIS_ESTABLISHED 0x19 struct 
hci_evt_le_cis_established { __u8 status; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0cb87687837f..1bd12c303e25 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2053,6 +2053,18 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define sync_recv_capable(dev) \ ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define sync_recv_enabled(dev) (le_enabled(dev) && sync_recv_capable(dev)) +#define past_sender_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_PAST_SENDER) +#define past_receiver_capable(dev) \ + ((dev)->le_features[3] & HCI_LE_PAST_RECEIVER) +#define past_capable(dev) \ + (past_sender_capable(dev) || past_receiver_capable(dev)) +#define past_sender_enabled(dev) \ + (le_enabled(dev) && past_sender_capable(dev)) +#define past_receiver_enabled(dev) \ + (le_enabled(dev) && past_receiver_capable(dev)) +#define past_enabled(dev) \ + (past_sender_enabled(dev) || past_receiver_enabled(dev)) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) diff --git a/include/net/bluetooth/mgmt.h b/include/net/bluetooth/mgmt.h index f5be96f08b9d..8234915854b6 100644 --- a/include/net/bluetooth/mgmt.h +++ b/include/net/bluetooth/mgmt.h @@ -119,6 +119,8 @@ struct mgmt_rp_read_index_list { #define MGMT_SETTING_ISO_BROADCASTER BIT(20) #define MGMT_SETTING_ISO_SYNC_RECEIVER BIT(21) #define MGMT_SETTING_LL_PRIVACY BIT(22) +#define MGMT_SETTING_PAST_SENDER BIT(23) +#define MGMT_SETTING_PAST_RECEIVER BIT(24) #define MGMT_OP_READ_INFO 0x0004 #define MGMT_READ_INFO_SIZE 0 -- cgit v1.2.3 From c530569adc19b5f0c62955de41f067bad34e3fe0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 2 Sep 2025 11:14:28 -0400 Subject: Bluetooth: hci_core: Introduce HCI_CONN_FLAG_PAST This introduces a new device flag so userspace can indicate if it wants to enable PAST Receiver for a specific device. 
Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1bd12c303e25..8c2235444808 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -166,6 +166,7 @@ enum hci_conn_flags { HCI_CONN_FLAG_REMOTE_WAKEUP = BIT(0), HCI_CONN_FLAG_DEVICE_PRIVACY = BIT(1), HCI_CONN_FLAG_ADDRESS_RESOLUTION = BIT(2), + HCI_CONN_FLAG_PAST = BIT(3), }; typedef u8 hci_conn_flags_t; -- cgit v1.2.3 From d3413703d5f8b7d1e6f514f9440ed5da1bc30796 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Sep 2025 11:34:44 -0400 Subject: Bluetooth: ISO: Add support to bind to trigger PAST This makes it possible to bind to a different destination address after being connected (BT_CONNECTED, BT_CONNECT2) which then triggers PAST Sender procedure to transfer the PA Sync to the destination address. Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 + include/net/bluetooth/hci_sync.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 8c2235444808..1f74722f3f4d 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1602,6 +1602,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base, u16 timeout); +int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, u16 timeout); diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index e352a4e0ef8d..3133f40fa9f9 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -188,3 +188,4
@@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); +int hci_past_sync(struct hci_conn *conn, struct hci_conn *le); -- cgit v1.2.3 From 14b06c3a88f7031d64fbce197fad1d400e507663 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 26 Sep 2025 15:56:43 -0400 Subject: Bluetooth: HCI: Always use the identity address when initializing a connection This makes sure hci_conn is initialized with the identity address if a matching IRK exists which avoids the trouble of having to do it at multiple places which seems to be missing (e.g. CIS, BIS and PA). Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1f74722f3f4d..858b58206e80 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1571,9 +1571,9 @@ int hci_le_create_cis_pending(struct hci_dev *hdev); int hci_conn_check_create_cis(struct hci_conn *conn); struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role, u16 handle); + u8 dst_type, u8 role, u16 handle); struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, - bdaddr_t *dst, u8 role); + bdaddr_t *dst, u8 dst_type, u8 role); void hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); -- cgit v1.2.3 From a106e50be74b0896583f4d010a69f9806e4194f4 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 14 Nov 2025 09:29:28 -0500 Subject: Bluetooth: HCI: Add support for LL Extended Feature Set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds support for emulating LL Extended Feature Set introduced in 6.0 that adds the following: Commands: - 
HCI_LE_Read_All_Local_Supported_­Features(0x2087)(Feature:47,1) - HCI_LE_Read_All_Remote_Features(0x2088)(Feature:47,2) Events: - HCI_LE_Read_All_Remote_Features_Complete(0x2b)(Mask bit:42) Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 23 +++++++++++++++++++++++ include/net/bluetooth/hci_core.h | 5 ++++- include/net/bluetooth/hci_sync.h | 2 ++ 3 files changed, 29 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index d883ad233ebc..a27cd3626b87 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -653,6 +653,7 @@ enum { #define HCI_LE_CIS_PERIPHERAL 0x20 #define HCI_LE_ISO_BROADCASTER 0x40 #define HCI_LE_ISO_SYNC_RECEIVER 0x80 +#define HCI_LE_LL_EXT_FEATURE 0x80 /* Connection modes */ #define HCI_CM_ACTIVE 0x0000 @@ -2255,6 +2256,19 @@ struct hci_cp_le_set_host_feature { __u8 bit_value; } __packed; +#define HCI_OP_LE_READ_ALL_LOCAL_FEATURES 0x2087 +struct hci_rp_le_read_all_local_features { + __u8 status; + __u8 page; + __u8 features[248]; +} __packed; + +#define HCI_OP_LE_READ_ALL_REMOTE_FEATURES 0x2088 +struct hci_cp_le_read_all_remote_features { + __le16 handle; + __u8 pages; +} __packed; + /* ---- HCI Events ---- */ struct hci_ev_status { __u8 status; @@ -2937,6 +2951,15 @@ struct hci_evt_le_big_info_adv_report { __u8 encryption; } __packed; +#define HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE 0x2b +struct hci_evt_le_read_all_remote_features_complete { + __u8 status; + __le16 handle; + __u8 max_pages; + __u8 valid_pages; + __u8 features[248]; +} __packed; + #define HCI_EV_VENDOR 0xff /* Internal events generated by Bluetooth stack */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 858b58206e80..4263e71a23ef 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -378,7 +378,7 @@ struct hci_dev { __u8 minor_class; __u8 max_page; __u8 features[HCI_MAX_PAGES][8]; - __u8 
le_features[8]; + __u8 le_features[248]; __u8 le_accept_list_size; __u8 le_resolv_list_size; __u8 le_num_of_adv_sets; @@ -702,6 +702,7 @@ struct hci_conn { __u8 attempt; __u8 dev_class[3]; __u8 features[HCI_MAX_PAGES][8]; + __u8 le_features[248]; __u16 pkt_type; __u16 link_policy; __u8 key_type; @@ -2067,6 +2068,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); (le_enabled(dev) && past_receiver_capable(dev)) #define past_enabled(dev) \ (past_sender_enabled(dev) || past_receiver_enabled(dev)) +#define ll_ext_feature_capable(dev) \ + ((dev)->le_features[7] & HCI_LE_LL_EXT_FEATURE) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 3133f40fa9f9..56076bbc981d 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -189,3 +189,5 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn); int hci_past_sync(struct hci_conn *conn, struct hci_conn *le); + +int hci_le_read_remote_features(struct hci_conn *conn); -- cgit v1.2.3 From 9bf66036d686b9a67000ba22bd94be13a4ea79ac Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 26 Nov 2025 13:45:52 -0800 Subject: net: mana: Handle hardware recovery events when probing the device When MANA is being probed, it's possible that hardware is in recovery mode and the device may get GDMA_EQE_HWC_RESET_REQUEST over HWC in the middle of the probe. Detect such condition and go through the recovery service procedure. 
Signed-off-by: Long Li Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/1764193552-9712-1-git-send-email-longli@linux.microsoft.com Signed-off-by: Jakub Kicinski --- include/net/mana/gdma.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index a4cf307859f8..eaa27483f99b 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -382,6 +382,10 @@ struct gdma_irq_context { char name[MANA_IRQ_NAME_SZ]; }; +enum gdma_context_flags { + GC_PROBE_SUCCEEDED = 0, +}; + struct gdma_context { struct device *dev; struct dentry *mana_pci_debugfs; @@ -430,6 +434,8 @@ struct gdma_context { u64 pf_cap_flags1; struct workqueue_struct *service_wq; + + unsigned long flags; }; static inline bool mana_gd_is_mana(struct gdma_dev *gd) @@ -600,6 +606,9 @@ enum { /* Driver can send HWC periodically to query stats */ #define GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY BIT(21) +/* Driver can handle hardware recovery events during probe */ +#define GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY BIT(22) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ @@ -611,7 +620,8 @@ enum { GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE | \ GDMA_DRV_CAP_FLAG_1_HW_VPORT_LINK_AWARE | \ GDMA_DRV_CAP_FLAG_1_PERIODIC_STATS_QUERY | \ - GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE) + GDMA_DRV_CAP_FLAG_1_SKB_LINEARIZE | \ + GDMA_DRV_CAP_FLAG_1_PROBE_RECOVERY) #define GDMA_DRV_CAP_FLAGS2 0 -- cgit v1.2.3 From 5e1bf5ae5e3ba3588b474669ba05f5d202003d84 Mon Sep 17 00:00:00 2001 From: Parthiban Veerasooran Date: Mon, 1 Dec 2025 08:53:45 +0530 Subject: net: phy: phy-c45: add SQI and SQI+ support for OATC14 10Base-T1S PHYs Add support for reading Signal Quality Indicator (SQI) and enhanced SQI+ from OATC14 10Base-T1S PHYs. - Introduce MDIO register definitions for DCQ_SQI and DCQ_SQIPLUS. 
- Add `genphy_c45_oatc14_get_sqi_max()` to return the maximum supported SQI/SQI+ level. - Add `genphy_c45_oatc14_get_sqi()` to return the current SQI or SQI+ value. - Update `include/linux/phy.h` to expose the new APIs. SQI+ capability is read from the Advanced Diagnostic Features Capability register (ADFCAP). If SQI+ is supported, the driver calculates the value from the MSBs of the DCQ_SQIPLUS register; otherwise, it falls back to basic SQI (0-7 levels). This enables ethtool to report the SQI value for OATC14 10Base-T1S PHYs. Open Alliance TC14 10BASE-T1S Advanced Diagnostic PHY Features Specification ref: https://opensig.org/wp-content/uploads/2025/06/OPEN_Alliance_10BASE-T1S_Advanced_PHY_features_for-automotive_Ethernet_V2.1b.pdf Signed-off-by: Parthiban Veerasooran Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20251201032346.6699-2-parthiban.veerasooran@microchip.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/phy.h b/include/linux/phy.h index 059a104223c4..fbbe028cc4b7 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -530,6 +530,30 @@ struct phy_c45_device_ids { struct macsec_context; struct macsec_ops; +/** + * struct phy_oatc14_sqi_capability - SQI capability information for OATC14 + * 10Base-T1S PHY + * @updated: Indicates whether the SQI capability fields have been updated. + * @sqi_max: Maximum supported Signal Quality Indicator (SQI) level reported by + * the PHY. + * @sqiplus_bits: Bits for SQI+ levels supported by the PHY. 
+ * 0 - SQI+ is not supported + * 3 - SQI+ is supported, using 3 bits (8 levels) + * 4 - SQI+ is supported, using 4 bits (16 levels) + * 5 - SQI+ is supported, using 5 bits (32 levels) + * 6 - SQI+ is supported, using 6 bits (64 levels) + * 7 - SQI+ is supported, using 7 bits (128 levels) + * 8 - SQI+ is supported, using 8 bits (256 levels) + * + * This structure is used by the OATC14 10Base-T1S PHY driver to store the SQI + * and SQI+ capability information retrieved from the PHY. + */ +struct phy_oatc14_sqi_capability { + bool updated; + int sqi_max; + u8 sqiplus_bits; +}; + /** * struct phy_device - An instance of a PHY * @@ -626,6 +650,7 @@ struct macsec_ops; * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data + * @oatc14_sqi_capability: SQI capability information for OATC14 10Base-T1S PHY * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling @@ -772,6 +797,8 @@ struct phy_device { /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif + + struct phy_oatc14_sqi_capability oatc14_sqi_capability; }; /* Generic phy_device::dev_flags */ @@ -2257,6 +2284,8 @@ int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); int genphy_c45_oatc14_cable_test_start(struct phy_device *phydev); int genphy_c45_oatc14_cable_test_get_status(struct phy_device *phydev, bool *finished); +int genphy_c45_oatc14_get_sqi_max(struct phy_device *phydev); +int genphy_c45_oatc14_get_sqi(struct phy_device *phydev); /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); -- cgit v1.2.3 From a0244e76213980f3b9bb5d40b0b6705fcf24230d Mon Sep 17 00:00:00 2001 From: Xiaoliang Yang Date: Sun, 30 Nov 2025 15:16:44 +0200 Subject: net: hsr: create an API to get hsr port type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Since the introduction of HSR_PT_INTERLINK in commit 5055cccfc2d1 ("net: hsr: Provide RedBox support (HSR-SAN)"), we see that different port types require different settings for hardware offload, which was not the case before when we only had HSR_PT_SLAVE_A and HSR_PT_SLAVE_B. But there is currently no way to know which port is which type, so create the hsr_get_port_type() API function and export it. When hsr_get_port_type() is called from the device driver, the port must be found in the HSR port list. An important use case is for this function to work from offloading drivers' NETDEV_CHANGEUPPER handler, which is triggered by hsr_portdev_setup() -> netdev_master_upper_dev_link(). Therefore, we need to move the addition of the hsr_port to the HSR port list prior to calling hsr_portdev_setup(). This makes the error restoration path also more similar to hsr_del_port(), where kfree_rcu(port) is already used. Cc: Sebastian Andrzej Siewior Cc: Lukasz Majewski Signed-off-by: Xiaoliang Yang Signed-off-by: Vladimir Oltean Reviewed-by: Łukasz Majewski Link: https://patch.msgid.link/20251130131657.65080-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/linux/if_hsr.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/if_hsr.h b/include/linux/if_hsr.h index d7941fd88032..f4cf2dd36d19 100644 --- a/include/linux/if_hsr.h +++ b/include/linux/if_hsr.h @@ -43,6 +43,8 @@ extern bool is_hsr_master(struct net_device *dev); extern int hsr_get_version(struct net_device *dev, enum hsr_version *ver); struct net_device *hsr_get_port_ndev(struct net_device *ndev, enum hsr_port_type pt); +int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev, + enum hsr_port_type *type); #else static inline bool is_hsr_master(struct net_device *dev) { @@ -59,6 +61,13 @@ static inline struct net_device *hsr_get_port_ndev(struct net_device *ndev, { return ERR_PTR(-EINVAL); } + +static inline int hsr_get_port_type(struct
net_device *hsr_dev, + struct net_device *dev, + enum hsr_port_type *type) +{ + return -EINVAL; +} #endif /* CONFIG_HSR */ #endif /*_LINUX_IF_HSR_H_*/ -- cgit v1.2.3 From 0e75bfe340bf05d1586eaf02942438573bda69e3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sun, 30 Nov 2025 15:16:47 +0200 Subject: net: dsa: add simple HSR offload helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turns out that HSR offloads are so fine-grained that many DSA switches can do a small part even though they weren't specifically designed for the protocols supported by that driver (HSR and PRP). Specifically NETIF_F_HW_HSR_DUP - it is simple packet duplication on transmit, towards all (aka 2) ports members of the HSR device. For many DSA switches, we know how to duplicate a packet, even though we never typically use that feature. The transmit port mask from the tagging protocol can have multiple bits set, and the switch should send the packet once to every port with a bit set from that mask. Nonetheless, not all tagging protocols are like this, and sometimes the port is a single numeric value rather than a bit mask. For that reason, and also because switches can sometimes change tagging protocols for different ones, we need to make HSR offload helpers opt-in. For devices that can do nothing else HSR-specific, we introduce dsa_port_simple_hsr_join() and dsa_port_simple_hsr_leave(). These functions monitor when two user ports of the same switch are part of the same HSR device, and when that condition is true, they toggle the NETIF_F_HW_HSR_DUP feature flag of both net devices. Normally only dsa_port_simple_hsr_join() and dsa_port_simple_hsr_leave() are needed. The dsa_port_simple_hsr_validate() helper is just to see what kind of configuration could be offloadable using the generic helpers. 
This is used by switch drivers which are not currently using the right tagging protocol to offload this HSR ring, but could in principle offload it after changing the tagger. Suggested-by: David Yang Cc: "Alvin Šipraga" Cc: "Chester A. Unal" Cc: "Clément Léger" Cc: Daniel Golle Cc: DENG Qingfang Cc: Florian Fainelli Cc: George McCollister Cc: Hauke Mehrtens Cc: Jonas Gorski Cc: Kurt Kanzenbach Cc: Linus Walleij Cc: Sean Wang Cc: UNGLinuxDriver@microchip.com Cc: Woojung Huh Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20251130131657.65080-6-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/net/dsa.h b/include/net/dsa.h index e40cdc12f7f3..cced1a866757 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1322,6 +1322,15 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct dsa_db db); +int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); +int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack); +int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, + struct net_device *hsr); + /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(const struct net_device *dev) { -- cgit v1.2.3 From 6b0f4ca079dbe6ae4aa57e529d67c7dc00d63577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Wed, 26 Nov 2025 17:35:37 +0000 Subject: wireguard: netlink: add YNL specification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a near[1] complete YNL specification for WireGuard, documenting the protocol in a machine-readable format, rather than comments in wireguard.h, and eases usage from C and non-C programming languages alike.
The generated C library will be featured in a later patch, so in this patch I will use the in-kernel python client for examples. This makes the documentation in the UAPI header redundant, it is therefore removed. The in-line documentation in the spec is based on the existing comment in wireguard.h, and once released it will be available in the kernel documentation at: https://docs.kernel.org/netlink/specs/wireguard.html (until then run: make htmldocs) Generate wireguard.rst from this spec: $ make -C tools/net/ynl/generated/ wireguard.rst Query wireguard interface through pyynl: $ sudo ./tools/net/ynl/pyynl/cli.py --family wireguard \ --dump get-device \ --json '{"ifindex":3}' [{'fwmark': 0, 'ifindex': 3, 'ifname': 'wg-test', 'listen-port': 54318, 'peers': [{0: {'allowedips': [{0: {'cidr-mask': 0, 'family': 2, 'ipaddr': '0.0.0.0'}}, {0: {'cidr-mask': 0, 'family': 10, 'ipaddr': '::'}}], 'endpoint': b'[...]', 'last-handshake-time': {'nsec': 42, 'sec': 42}, 'persistent-keepalive-interval': 42, 'preshared-key': '[...]', 'protocol-version': 1, 'public-key': '[...]', 'rx-bytes': 42, 'tx-bytes': 42}}], 'private-key': '[...]', 'public-key': '[...]'}] Add another allowed IP prefix: $ sudo ./tools/net/ynl/pyynl/cli.py --family wireguard \ --do set-device --json '{"ifindex":3,"peers":[ {"public-key":"6a df b1 83 a4 ..","allowedips":[ {"cidr-mask":0,"family":10,"ipaddr":"::"}]}]}' [1] As can be seen above, the "endpoint" is only dumped as binary data, as it can't be fully described in YNL. It's either a struct sockaddr_in or struct sockaddr_in6 depending on the attribute length. Signed-off-by: Asbjørn Sloth Tønnesen Signed-off-by: Jason A. 
Donenfeld --- include/uapi/linux/wireguard.h | 129 ----------------------------------------- 1 file changed, 129 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h index 8c26391196d5..dee4401e0b5d 100644 --- a/include/uapi/linux/wireguard.h +++ b/include/uapi/linux/wireguard.h @@ -1,135 +1,6 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ /* * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved. - * - * Documentation - * ============= - * - * The below enums and macros are for interfacing with WireGuard, using generic - * netlink, with family WG_GENL_NAME and version WG_GENL_VERSION. It defines two - * methods: get and set. Note that while they share many common attributes, - * these two functions actually accept a slightly different set of inputs and - * outputs. - * - * WG_CMD_GET_DEVICE - * ----------------- - * - * May only be called via NLM_F_REQUEST | NLM_F_DUMP. The command should contain - * one but not both of: - * - * WGDEVICE_A_IFINDEX: NLA_U32 - * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1 - * - * The kernel will then return several messages (NLM_F_MULTI) containing the - * following tree of nested items: - * - * WGDEVICE_A_IFINDEX: NLA_U32 - * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1 - * WGDEVICE_A_PRIVATE_KEY: NLA_EXACT_LEN, len WG_KEY_LEN - * WGDEVICE_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN - * WGDEVICE_A_LISTEN_PORT: NLA_U16 - * WGDEVICE_A_FWMARK: NLA_U32 - * WGDEVICE_A_PEERS: NLA_NESTED - * 0: NLA_NESTED - * WGPEER_A_PUBLIC_KEY: NLA_EXACT_LEN, len WG_KEY_LEN - * WGPEER_A_PRESHARED_KEY: NLA_EXACT_LEN, len WG_KEY_LEN - * WGPEER_A_ENDPOINT: NLA_MIN_LEN(struct sockaddr), struct sockaddr_in or struct sockaddr_in6 - * WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16 - * WGPEER_A_LAST_HANDSHAKE_TIME: NLA_EXACT_LEN, struct __kernel_timespec - * WGPEER_A_RX_BYTES: NLA_U64 - * WGPEER_A_TX_BYTES: NLA_U64 - * WGPEER_A_ALLOWEDIPS: 
NLA_NESTED - * 0: NLA_NESTED - * WGALLOWEDIP_A_FAMILY: NLA_U16 - * WGALLOWEDIP_A_IPADDR: NLA_MIN_LEN(struct in_addr), struct in_addr or struct in6_addr - * WGALLOWEDIP_A_CIDR_MASK: NLA_U8 - * 0: NLA_NESTED - * ... - * 0: NLA_NESTED - * ... - * ... - * WGPEER_A_PROTOCOL_VERSION: NLA_U32 - * 0: NLA_NESTED - * ... - * ... - * - * It is possible that all of the allowed IPs of a single peer will not - * fit within a single netlink message. In that case, the same peer will - * be written in the following message, except it will only contain - * WGPEER_A_PUBLIC_KEY and WGPEER_A_ALLOWEDIPS. This may occur several - * times in a row for the same peer. It is then up to the receiver to - * coalesce adjacent peers. Likewise, it is possible that all peers will - * not fit within a single message. So, subsequent peers will be sent - * in following messages, except those will only contain WGDEVICE_A_IFNAME - * and WGDEVICE_A_PEERS. It is then up to the receiver to coalesce these - * messages to form the complete list of peers. - * - * Since this is an NLA_F_DUMP command, the final message will always be - * NLMSG_DONE, even if an error occurs. However, this NLMSG_DONE message - * contains an integer error code. It is either zero or a negative error - * code corresponding to the errno. - * - * WG_CMD_SET_DEVICE - * ----------------- - * - * May only be called via NLM_F_REQUEST. The command should contain the - * following tree of nested items, containing one but not both of - * WGDEVICE_A_IFINDEX and WGDEVICE_A_IFNAME: - * - * WGDEVICE_A_IFINDEX: NLA_U32 - * WGDEVICE_A_IFNAME: NLA_NUL_STRING, maxlen IFNAMSIZ - 1 - * WGDEVICE_A_FLAGS: NLA_U32, 0 or WGDEVICE_F_REPLACE_PEERS if all current - * peers should be removed prior to adding the list below. 
- * WGDEVICE_A_PRIVATE_KEY: len WG_KEY_LEN, all zeros to remove - * WGDEVICE_A_LISTEN_PORT: NLA_U16, 0 to choose randomly - * WGDEVICE_A_FWMARK: NLA_U32, 0 to disable - * WGDEVICE_A_PEERS: NLA_NESTED - * 0: NLA_NESTED - * WGPEER_A_PUBLIC_KEY: len WG_KEY_LEN - * WGPEER_A_FLAGS: NLA_U32, 0 and/or WGPEER_F_REMOVE_ME if the - * specified peer should not exist at the end of the - * operation, rather than added/updated and/or - * WGPEER_F_REPLACE_ALLOWEDIPS if all current allowed - * IPs of this peer should be removed prior to adding - * the list below and/or WGPEER_F_UPDATE_ONLY if the - * peer should only be set if it already exists. - * WGPEER_A_PRESHARED_KEY: len WG_KEY_LEN, all zeros to remove - * WGPEER_A_ENDPOINT: struct sockaddr_in or struct sockaddr_in6 - * WGPEER_A_PERSISTENT_KEEPALIVE_INTERVAL: NLA_U16, 0 to disable - * WGPEER_A_ALLOWEDIPS: NLA_NESTED - * 0: NLA_NESTED - * WGALLOWEDIP_A_FAMILY: NLA_U16 - * WGALLOWEDIP_A_IPADDR: struct in_addr or struct in6_addr - * WGALLOWEDIP_A_CIDR_MASK: NLA_U8 - * WGALLOWEDIP_A_FLAGS: NLA_U32, WGALLOWEDIP_F_REMOVE_ME if - * the specified IP should be removed; - * otherwise, this IP will be added if - * it is not already present. - * 0: NLA_NESTED - * ... - * 0: NLA_NESTED - * ... - * ... - * WGPEER_A_PROTOCOL_VERSION: NLA_U32, should not be set or used at - * all by most users of this API, as the - * most recent protocol will be used when - * this is unset. Otherwise, must be set - * to 1. - * 0: NLA_NESTED - * ... - * ... - * - * It is possible that the amount of configuration data exceeds that of - * the maximum message length accepted by the kernel. In that case, several - * messages should be sent one after another, with each successive one - * filling in information not contained in the prior. Note that if - * WGDEVICE_F_REPLACE_PEERS is specified in the first message, it probably - * should not be specified in fragments that come after, so that the list - * of peers is only cleared the first time but appended after. 
Likewise for - * peers, if WGPEER_F_REPLACE_ALLOWEDIPS is specified in the first message - * of a peer, it likely should not be specified in subsequent fragments. - * - * If an error occurs, NLMSG_ERROR will reply containing an errno. */ #ifndef _WG_UAPI_WIREGUARD_H -- cgit v1.2.3 From b5c5a82bf5cb96e14a6627ef21be962052a0c6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Wed, 26 Nov 2025 17:35:38 +0000 Subject: wireguard: uapi: move enum wg_cmd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch moves enum wg_cmd to the end of the file, where ynl-gen would generate it. This is an incremental step towards adopting an UAPI header generated by ynl-gen. This is split out to keep the patches readable. This is a trivial patch with no behavioural changes intended. Signed-off-by: Asbjørn Sloth Tønnesen Signed-off-by: Jason A. Donenfeld --- include/uapi/linux/wireguard.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h index dee4401e0b5d..3ebfffd61269 100644 --- a/include/uapi/linux/wireguard.h +++ b/include/uapi/linux/wireguard.h @@ -11,13 +11,6 @@ #define WG_KEY_LEN 32 -enum wg_cmd { - WG_CMD_GET_DEVICE, - WG_CMD_SET_DEVICE, - __WG_CMD_MAX -}; -#define WG_CMD_MAX (__WG_CMD_MAX - 1) - enum wgdevice_flag { WGDEVICE_F_REPLACE_PEERS = 1U << 0, __WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS @@ -73,4 +66,12 @@ enum wgallowedip_attribute { }; #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1) +enum wg_cmd { + WG_CMD_GET_DEVICE, + WG_CMD_SET_DEVICE, + + __WG_CMD_MAX +}; +#define WG_CMD_MAX (__WG_CMD_MAX - 1) + #endif /* _WG_UAPI_WIREGUARD_H */ -- cgit v1.2.3 From 8d974872ab29eeb93a5b0b698007257d8be07968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Wed, 26 Nov 2025 17:35:39 +0000 Subject: wireguard: uapi: move flag enums MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the wg*_flag enums, so they are defined above the attribute set enums, where ynl-gen would place them. This is an incremental step towards adopting an UAPI header generated by ynl-gen. This is split out to keep the patches readable. This is a trivial patch with no behavioural changes intended. Signed-off-by: Asbjørn Sloth Tønnesen Signed-off-by: Jason A. Donenfeld --- include/uapi/linux/wireguard.h | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h index 3ebfffd61269..a2815f4f2910 100644 --- a/include/uapi/linux/wireguard.h +++ b/include/uapi/linux/wireguard.h @@ -15,6 +15,20 @@ enum wgdevice_flag { WGDEVICE_F_REPLACE_PEERS = 1U << 0, __WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS }; + +enum wgpeer_flag { + WGPEER_F_REMOVE_ME = 1U << 0, + WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1, + WGPEER_F_UPDATE_ONLY = 1U << 2, + __WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS | + WGPEER_F_UPDATE_ONLY +}; + +enum wgallowedip_flag { + WGALLOWEDIP_F_REMOVE_ME = 1U << 0, + __WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME +}; + enum wgdevice_attribute { WGDEVICE_A_UNSPEC, WGDEVICE_A_IFINDEX, @@ -29,13 +43,6 @@ enum wgdevice_attribute { }; #define WGDEVICE_A_MAX (__WGDEVICE_A_LAST - 1) -enum wgpeer_flag { - WGPEER_F_REMOVE_ME = 1U << 0, - WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1, - WGPEER_F_UPDATE_ONLY = 1U << 2, - __WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS | - WGPEER_F_UPDATE_ONLY -}; enum wgpeer_attribute { WGPEER_A_UNSPEC, WGPEER_A_PUBLIC_KEY, @@ -52,10 +59,6 @@ enum wgpeer_attribute { }; #define WGPEER_A_MAX (__WGPEER_A_LAST - 1) -enum wgallowedip_flag { - WGALLOWEDIP_F_REMOVE_ME = 1U << 0, - __WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME -}; enum wgallowedip_attribute { WGALLOWEDIP_A_UNSPEC, WGALLOWEDIP_A_FAMILY, -- cgit v1.2.3 From 
88cedad45ba14097e06d2c9f6578688097a94691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Asbj=C3=B8rn=20Sloth=20T=C3=B8nnesen?= Date: Wed, 26 Nov 2025 17:35:40 +0000 Subject: wireguard: uapi: generate header with ynl-gen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use ynl-gen to generate the UAPI header for WireGuard. The cosmetic changes in this patch confirm that the spec is aligned with the implementation. Using the generated version ensures that they stay in sync. Changes in the generated header: * Trivial header guard rename. * Trivial white space changes. * Trivial comment changes. * Precompute bitflags in ynl-gen (see [1]). * Drop __*_F_ALL constants (see [1]). [1] https://lore.kernel.org/r/20251014123201.6ecfd146@kernel.org/ No behavioural changes intended. Signed-off-by: Asbjørn Sloth Tønnesen Signed-off-by: Jason A. Donenfeld --- include/uapi/linux/wireguard.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/wireguard.h b/include/uapi/linux/wireguard.h index a2815f4f2910..a100b9715b08 100644 --- a/include/uapi/linux/wireguard.h +++ b/include/uapi/linux/wireguard.h @@ -1,32 +1,29 @@ -/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */ -/* - * Copyright (C) 2015-2019 Jason A. Donenfeld . All Rights Reserved.
- */ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/wireguard.yaml */ +/* YNL-GEN uapi header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ -#ifndef _WG_UAPI_WIREGUARD_H -#define _WG_UAPI_WIREGUARD_H +#ifndef _UAPI_LINUX_WIREGUARD_H +#define _UAPI_LINUX_WIREGUARD_H -#define WG_GENL_NAME "wireguard" -#define WG_GENL_VERSION 1 +#define WG_GENL_NAME "wireguard" +#define WG_GENL_VERSION 1 -#define WG_KEY_LEN 32 +#define WG_KEY_LEN 32 enum wgdevice_flag { - WGDEVICE_F_REPLACE_PEERS = 1U << 0, - __WGDEVICE_F_ALL = WGDEVICE_F_REPLACE_PEERS + WGDEVICE_F_REPLACE_PEERS = 1, }; enum wgpeer_flag { - WGPEER_F_REMOVE_ME = 1U << 0, - WGPEER_F_REPLACE_ALLOWEDIPS = 1U << 1, - WGPEER_F_UPDATE_ONLY = 1U << 2, - __WGPEER_F_ALL = WGPEER_F_REMOVE_ME | WGPEER_F_REPLACE_ALLOWEDIPS | - WGPEER_F_UPDATE_ONLY + WGPEER_F_REMOVE_ME = 1, + WGPEER_F_REPLACE_ALLOWEDIPS = 2, + WGPEER_F_UPDATE_ONLY = 4, }; enum wgallowedip_flag { - WGALLOWEDIP_F_REMOVE_ME = 1U << 0, - __WGALLOWEDIP_F_ALL = WGALLOWEDIP_F_REMOVE_ME + WGALLOWEDIP_F_REMOVE_ME = 1, }; enum wgdevice_attribute { @@ -39,6 +36,7 @@ enum wgdevice_attribute { WGDEVICE_A_LISTEN_PORT, WGDEVICE_A_FWMARK, WGDEVICE_A_PEERS, + __WGDEVICE_A_LAST }; #define WGDEVICE_A_MAX (__WGDEVICE_A_LAST - 1) @@ -55,6 +53,7 @@ enum wgpeer_attribute { WGPEER_A_TX_BYTES, WGPEER_A_ALLOWEDIPS, WGPEER_A_PROTOCOL_VERSION, + __WGPEER_A_LAST }; #define WGPEER_A_MAX (__WGPEER_A_LAST - 1) @@ -65,6 +64,7 @@ enum wgallowedip_attribute { WGALLOWEDIP_A_IPADDR, WGALLOWEDIP_A_CIDR_MASK, WGALLOWEDIP_A_FLAGS, + __WGALLOWEDIP_A_LAST }; #define WGALLOWEDIP_A_MAX (__WGALLOWEDIP_A_LAST - 1) @@ -77,4 +77,4 @@ enum wg_cmd { }; #define WG_CMD_MAX (__WG_CMD_MAX - 1) -#endif /* _WG_UAPI_WIREGUARD_H */ +#endif /* _UAPI_LINUX_WIREGUARD_H */ -- cgit v1.2.3 From a42b71d49945aac0b943987cbdec1d1c805caab3 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 1 Dec 2025 
13:35:03 +0100 Subject: ata: libata: Move quirk flags to their own enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The anonymous enum in include/linux/libata.h that is used to store various global constants can currently be backed by type int. (It contains both negative and positive constants.) __ATA_QUIRK_MAX is currently 31. The quirk flags in the various global constants enum are defined as "1U << quirk_flag_bit". Thus if we simply add an additional quirk, the quirk flag will be 1 << 31, which is a value that is too large to be represented by a signed int. The various global constants enum will therefore be backed by type long. This will lead to error prints like e.g.: ata_port_err(ap, "EH pending after %d tries, giving up\n", ATA_EH_MAX_TRIES); now failing to build, with build error: error: format ‘%d’ expects argument of type ‘int’, but argument 4 has type ‘long int’ [-Werror=format=] This is because all constants in the various global constants enum now have to be printed as a long, as that is now the backing type of the enum. Since the compiler will use the smallest possible backing type for an enum, it is good practice to not mix unrelated things in a single enum. Move the quirk flags to a separate enum, so that we don't need to change the printf specifier for all other constants in the "various global constants" enum when adding an additional quirk. Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Signed-off-by: Niklas Cassel --- include/linux/libata.h | 74 ++++++++++++++++++++++++++------------------------ 1 file changed, 38 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/include/linux/libata.h b/include/linux/libata.h index 21de0935775d..9aa0541dc62d 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -85,6 +85,44 @@ enum ata_quirks { __ATA_QUIRK_MAX, }; +/* + * Quirk flags: may be set by libata or controller drivers on drives.
+ * Some quirks may be drive/controller pair dependent. + */ +enum { + ATA_QUIRK_DIAGNOSTIC = (1U << __ATA_QUIRK_DIAGNOSTIC), + ATA_QUIRK_NODMA = (1U << __ATA_QUIRK_NODMA), + ATA_QUIRK_NONCQ = (1U << __ATA_QUIRK_NONCQ), + ATA_QUIRK_MAX_SEC_128 = (1U << __ATA_QUIRK_MAX_SEC_128), + ATA_QUIRK_BROKEN_HPA = (1U << __ATA_QUIRK_BROKEN_HPA), + ATA_QUIRK_DISABLE = (1U << __ATA_QUIRK_DISABLE), + ATA_QUIRK_HPA_SIZE = (1U << __ATA_QUIRK_HPA_SIZE), + ATA_QUIRK_IVB = (1U << __ATA_QUIRK_IVB), + ATA_QUIRK_STUCK_ERR = (1U << __ATA_QUIRK_STUCK_ERR), + ATA_QUIRK_BRIDGE_OK = (1U << __ATA_QUIRK_BRIDGE_OK), + ATA_QUIRK_ATAPI_MOD16_DMA = (1U << __ATA_QUIRK_ATAPI_MOD16_DMA), + ATA_QUIRK_FIRMWARE_WARN = (1U << __ATA_QUIRK_FIRMWARE_WARN), + ATA_QUIRK_1_5_GBPS = (1U << __ATA_QUIRK_1_5_GBPS), + ATA_QUIRK_NOSETXFER = (1U << __ATA_QUIRK_NOSETXFER), + ATA_QUIRK_BROKEN_FPDMA_AA = (1U << __ATA_QUIRK_BROKEN_FPDMA_AA), + ATA_QUIRK_DUMP_ID = (1U << __ATA_QUIRK_DUMP_ID), + ATA_QUIRK_MAX_SEC_LBA48 = (1U << __ATA_QUIRK_MAX_SEC_LBA48), + ATA_QUIRK_ATAPI_DMADIR = (1U << __ATA_QUIRK_ATAPI_DMADIR), + ATA_QUIRK_NO_NCQ_TRIM = (1U << __ATA_QUIRK_NO_NCQ_TRIM), + ATA_QUIRK_NOLPM = (1U << __ATA_QUIRK_NOLPM), + ATA_QUIRK_WD_BROKEN_LPM = (1U << __ATA_QUIRK_WD_BROKEN_LPM), + ATA_QUIRK_ZERO_AFTER_TRIM = (1U << __ATA_QUIRK_ZERO_AFTER_TRIM), + ATA_QUIRK_NO_DMA_LOG = (1U << __ATA_QUIRK_NO_DMA_LOG), + ATA_QUIRK_NOTRIM = (1U << __ATA_QUIRK_NOTRIM), + ATA_QUIRK_MAX_SEC_1024 = (1U << __ATA_QUIRK_MAX_SEC_1024), + ATA_QUIRK_MAX_TRIM_128M = (1U << __ATA_QUIRK_MAX_TRIM_128M), + ATA_QUIRK_NO_NCQ_ON_ATI = (1U << __ATA_QUIRK_NO_NCQ_ON_ATI), + ATA_QUIRK_NO_LPM_ON_ATI = (1U << __ATA_QUIRK_NO_LPM_ON_ATI), + ATA_QUIRK_NO_ID_DEV_LOG = (1U << __ATA_QUIRK_NO_ID_DEV_LOG), + ATA_QUIRK_NO_LOG_DIR = (1U << __ATA_QUIRK_NO_LOG_DIR), + ATA_QUIRK_NO_FUA = (1U << __ATA_QUIRK_NO_FUA), +}; + enum { /* various global constants */ LIBATA_MAX_PRD = ATA_MAX_PRD / 2, @@ -390,42 +428,6 @@ enum { */ ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 8, - /* - * Quirk flags: 
may be set by libata or controller drivers on drives. - * Some quirks may be drive/controller pair dependent. - */ - ATA_QUIRK_DIAGNOSTIC = (1U << __ATA_QUIRK_DIAGNOSTIC), - ATA_QUIRK_NODMA = (1U << __ATA_QUIRK_NODMA), - ATA_QUIRK_NONCQ = (1U << __ATA_QUIRK_NONCQ), - ATA_QUIRK_MAX_SEC_128 = (1U << __ATA_QUIRK_MAX_SEC_128), - ATA_QUIRK_BROKEN_HPA = (1U << __ATA_QUIRK_BROKEN_HPA), - ATA_QUIRK_DISABLE = (1U << __ATA_QUIRK_DISABLE), - ATA_QUIRK_HPA_SIZE = (1U << __ATA_QUIRK_HPA_SIZE), - ATA_QUIRK_IVB = (1U << __ATA_QUIRK_IVB), - ATA_QUIRK_STUCK_ERR = (1U << __ATA_QUIRK_STUCK_ERR), - ATA_QUIRK_BRIDGE_OK = (1U << __ATA_QUIRK_BRIDGE_OK), - ATA_QUIRK_ATAPI_MOD16_DMA = (1U << __ATA_QUIRK_ATAPI_MOD16_DMA), - ATA_QUIRK_FIRMWARE_WARN = (1U << __ATA_QUIRK_FIRMWARE_WARN), - ATA_QUIRK_1_5_GBPS = (1U << __ATA_QUIRK_1_5_GBPS), - ATA_QUIRK_NOSETXFER = (1U << __ATA_QUIRK_NOSETXFER), - ATA_QUIRK_BROKEN_FPDMA_AA = (1U << __ATA_QUIRK_BROKEN_FPDMA_AA), - ATA_QUIRK_DUMP_ID = (1U << __ATA_QUIRK_DUMP_ID), - ATA_QUIRK_MAX_SEC_LBA48 = (1U << __ATA_QUIRK_MAX_SEC_LBA48), - ATA_QUIRK_ATAPI_DMADIR = (1U << __ATA_QUIRK_ATAPI_DMADIR), - ATA_QUIRK_NO_NCQ_TRIM = (1U << __ATA_QUIRK_NO_NCQ_TRIM), - ATA_QUIRK_NOLPM = (1U << __ATA_QUIRK_NOLPM), - ATA_QUIRK_WD_BROKEN_LPM = (1U << __ATA_QUIRK_WD_BROKEN_LPM), - ATA_QUIRK_ZERO_AFTER_TRIM = (1U << __ATA_QUIRK_ZERO_AFTER_TRIM), - ATA_QUIRK_NO_DMA_LOG = (1U << __ATA_QUIRK_NO_DMA_LOG), - ATA_QUIRK_NOTRIM = (1U << __ATA_QUIRK_NOTRIM), - ATA_QUIRK_MAX_SEC_1024 = (1U << __ATA_QUIRK_MAX_SEC_1024), - ATA_QUIRK_MAX_TRIM_128M = (1U << __ATA_QUIRK_MAX_TRIM_128M), - ATA_QUIRK_NO_NCQ_ON_ATI = (1U << __ATA_QUIRK_NO_NCQ_ON_ATI), - ATA_QUIRK_NO_LPM_ON_ATI = (1U << __ATA_QUIRK_NO_LPM_ON_ATI), - ATA_QUIRK_NO_ID_DEV_LOG = (1U << __ATA_QUIRK_NO_ID_DEV_LOG), - ATA_QUIRK_NO_LOG_DIR = (1U << __ATA_QUIRK_NO_LOG_DIR), - ATA_QUIRK_NO_FUA = (1U << __ATA_QUIRK_NO_FUA), - /* User visible DMA mask for DMA control. DO NOT renumber. 
*/ ATA_DMA_MASK_ATA = (1 << 0), /* DMA on ATA Disk */ ATA_DMA_MASK_ATAPI = (1 << 1), /* DMA on ATAPI */ -- cgit v1.2.3 From 2e983271363108b3813b38754eb96d9b1cb252bb Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 1 Dec 2025 13:35:04 +0100 Subject: ata: libata-core: Quirk DELLBOSS VD max_sectors Commit 9b8b84879d4a ("block: Increase BLK_DEF_MAX_SECTORS_CAP") increased the default max_sectors_kb from 1280 KiB to 4096 KiB. DELLBOSS VD with FW rev MV.R00-0 times out when sending I/Os of size 4096 KiB. Enable ATA_QUIRK_MAX_SEC, with value 8191 (sectors) for this device, since any I/O with more sectors than that leads to I/O timeouts. With this, the DELLBOSS VD SATA controller is usable again. Cc: stable+noautosel@kernel.org # depends on Move quirk flags to their own enum Fixes: 9b8b84879d4a ("block: Increase BLK_DEF_MAX_SECTORS_CAP") Reviewed-by: Damien Le Moal Signed-off-by: Niklas Cassel --- include/linux/ata.h | 1 + include/linux/libata.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/ata.h b/include/linux/ata.h index 792e10a09787..1786e7b1165f 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -29,6 +29,7 @@ enum { ATA_MAX_SECTORS_128 = 128, ATA_MAX_SECTORS = 256, ATA_MAX_SECTORS_1024 = 1024, + ATA_MAX_SECTORS_8191 = 8191, ATA_MAX_SECTORS_LBA48 = 65535,/* avoid count to be 0000h */ ATA_MAX_SECTORS_TAPE = 65535, ATA_MAX_TRIM_RNUM = 64, /* 512-byte payload / (6-byte LBA + 2-byte range per entry) */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 9aa0541dc62d..abdc7b6f176c 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -75,6 +75,7 @@ enum ata_quirks { __ATA_QUIRK_NO_DMA_LOG, /* Do not use DMA for log read */ __ATA_QUIRK_NOTRIM, /* Do not use TRIM */ __ATA_QUIRK_MAX_SEC_1024, /* Limit max sects to 1024 */ + __ATA_QUIRK_MAX_SEC_8191, /* Limit max sects to 8191 */ __ATA_QUIRK_MAX_TRIM_128M, /* Limit max trim size to 128M */ __ATA_QUIRK_NO_NCQ_ON_ATI, /* Disable NCQ on ATI
chipset */ __ATA_QUIRK_NO_LPM_ON_ATI, /* Disable LPM on ATI chipset */ @@ -115,6 +116,7 @@ enum { ATA_QUIRK_NO_DMA_LOG = (1U << __ATA_QUIRK_NO_DMA_LOG), ATA_QUIRK_NOTRIM = (1U << __ATA_QUIRK_NOTRIM), ATA_QUIRK_MAX_SEC_1024 = (1U << __ATA_QUIRK_MAX_SEC_1024), + ATA_QUIRK_MAX_SEC_8191 = (1U << __ATA_QUIRK_MAX_SEC_8191), ATA_QUIRK_MAX_TRIM_128M = (1U << __ATA_QUIRK_MAX_TRIM_128M), ATA_QUIRK_NO_NCQ_ON_ATI = (1U << __ATA_QUIRK_NO_NCQ_ON_ATI), ATA_QUIRK_NO_LPM_ON_ATI = (1U << __ATA_QUIRK_NO_LPM_ON_ATI), -- cgit v1.2.3 From 4b011b538f2b90d07580ff778e28954a4a6520eb Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 2 Dec 2025 16:38:02 +0100 Subject: i3c: fix I3C_SDR bit number 0x31 is decimal 49 and doesn't fit in a 32 bit integer, switch to the intended decimal 31. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202512020956.Dnz8A2H0-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512021613.97jVprvJ-lkp@intel.com/ Closes: https://lore.kernel.org/oe-kbuild-all/202512021644.lp8ZMSx5-lkp@intel.com/ Link: https://patch.msgid.link/20251202153804.2640623-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i3c/device.h b/include/linux/i3c/device.h index ae0662d9d77e..9fcb6410a584 100644 --- a/include/linux/i3c/device.h +++ b/include/linux/i3c/device.h @@ -51,7 +51,7 @@ enum i3c_xfer_mode { I3C_HDR_TSP = 1, I3C_HDR_TSL = 2, /* Use for default SDR transfer mode */ - I3C_SDR = 0x31, + I3C_SDR = 31, }; /** -- cgit v1.2.3 From e01a8baf60af43f6f87a5850dee29cf31377ec25 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 2 Dec 2025 16:38:03 +0100 Subject: i3c: document i3c_xfers i3c_xfers was left undocumented, document it. 
Reported-by: Stephen Rothwell Link: https://patch.msgid.link/20251202153804.2640623-2-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index d0d5b3a9049f..2fd850f4678b 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -418,7 +418,11 @@ struct i3c_bus { * @send_ccc_cmd: send a CCC command * This method is mandatory. * @priv_xfers: do one or several private I3C SDR transfers - * This method is mandatory. + * This method is mandatory when i3c_xfers is not implemented. It + * is deprecated. + * @i3c_xfers: do one or several I3C SDR or HDR transfers + * This method is mandatory when priv_xfers is not implemented but + * should be implemented instead of priv_xfers. * @attach_i2c_dev: called every time an I2C device is attached to the bus. * This is a good place to attach master controller specific * data to I2C devices. -- cgit v1.2.3 From bbaacdc339d4bde2690b659dc090af7c20a1937e Mon Sep 17 00:00:00 2001 From: Gabriele Monaco Date: Thu, 13 Nov 2025 16:06:18 +0100 Subject: rv: Fix compilation if !CONFIG_RV_REACTORS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kernel test robot spotted a compilation error if reactors are disabled. Fix the warning by keeping LTL monitor variable as always static. 
Cc: Thomas Weißschuh Link: https://patch.msgid.link/20251113150618.185479-2-gmonaco@redhat.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202511131948.vxi5mdjU-lkp@intel.com/ Fixes: 4f739ed19d22 ("rv: Pass va_list to reactors") Signed-off-by: Gabriele Monaco Signed-off-by: Steven Rostedt (Google) --- include/rv/ltl_monitor.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h index 00c42b36f961..eff60cd61106 100644 --- a/include/rv/ltl_monitor.h +++ b/include/rv/ltl_monitor.h @@ -17,12 +17,7 @@ #endif #define RV_MONITOR_NAME CONCATENATE(rv_, MONITOR_NAME) - -#ifdef CONFIG_RV_REACTORS static struct rv_monitor RV_MONITOR_NAME; -#else -extern struct rv_monitor RV_MONITOR_NAME; -#endif static int ltl_monitor_slot = RV_PER_TASK_MONITOR_INIT; -- cgit v1.2.3 From b08ee4d666f216a6f9e7194a9b335147d4717f33 Mon Sep 17 00:00:00 2001 From: Neilay Kharwadkar Date: Sun, 16 Nov 2025 19:20:29 +0000 Subject: lib/fonts: Add Terminus 10x18 console font Add a compile-in option for Terminus 10x18 bitmap console font to improve readability on modern laptop displays. On modern 13-16 inch laptop displays with high pixel density, common scaled resolutions like 1280x800 and 1440x900 are widely used. At these resolutions, VGA 8x16 is too small and difficult to read for extended periods, while Terminus 16x32 is too large, providing only 25-28 rows. The existing 10x18 font has poor readability. Terminus 10x18 provides improved readability with its clean, fixed-width design while maintaining practical row counts (44-50 rows). A comfortable and readable built-in font for early boot messages, kernel panics or whenever userspace is unavailable. The font was converted from standard Terminus ter-i18b.psf using psftools and formatted to match kernel font conventions. This patch is non-intrusive, no options are enabled by default so most users won't notice a thing. 
Signed-off-by: Neilay Kharwadkar Signed-off-by: Helge Deller --- include/linux/font.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/font.h b/include/linux/font.h index 81caffd51bb4..fd8625cd76b2 100644 --- a/include/linux/font.h +++ b/include/linux/font.h @@ -35,6 +35,7 @@ struct font_desc { #define FONT6x10_IDX 10 #define TER16x32_IDX 11 #define FONT6x8_IDX 12 +#define TER10x18_IDX 13 extern const struct font_desc font_vga_8x8, font_vga_8x16, @@ -48,7 +49,8 @@ extern const struct font_desc font_vga_8x8, font_mini_4x6, font_6x10, font_ter_16x32, - font_6x8; + font_6x8, + font_ter_10x18; /* Find a font with a specific name */ -- cgit v1.2.3 From 8a5dd102e48752f8c4144f051eccc602774f1a93 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 2 Dec 2025 13:44:46 +1100 Subject: ccp: Make snp_reclaim_pages and __sev_do_cmd_locked public The snp_reclaim_pages() helper reclaims pages in the FW state. SEV-TIO and the TMPM driver (a hardware engine which smashes IOMMU PDEs among other things) will use it to reclaim memory when cleaning up. Share and export snp_reclaim_pages(). Most of the SEV-TIO code uses sev_do_cmd() which locks the sev_cmd_mutex and is already exported. But the SNP init code (which also sets up SEV-TIO) executes under the sev_cmd_mutex lock so the SEV-TIO code has to use the __sev_do_cmd_locked() helper. This one though does not need to be exported/shared globally as SEV-TIO is a part of the CCP driver still. Share __sev_do_cmd_locked() via the CCP internal header.
Signed-off-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251202024449.542361-2-aik@amd.com Acked-by: Tom Lendacky Signed-off-by: Dan Williams --- include/linux/psp-sev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index e0dbcb4b4fd9..34a25209f909 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -992,6 +992,7 @@ int sev_do_cmd(int cmd, void *data, int *psp_ret); void *psp_copy_user_blob(u64 uaddr, u32 len); void *snp_alloc_firmware_page(gfp_t mask); +int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked); void snp_free_firmware_page(void *addr); void sev_platform_shutdown(void); bool sev_is_snp_ciphertext_hiding_supported(void); @@ -1027,6 +1028,11 @@ static inline void *snp_alloc_firmware_page(gfp_t mask) return NULL; } +static inline int snp_reclaim_pages(unsigned long paddr, unsigned int npages, bool locked) +{ + return -ENODEV; +} + static inline void snp_free_firmware_page(void *addr) { } static inline void sev_platform_shutdown(void) { } -- cgit v1.2.3 From c3859de858aa7ae0d0a5ca21e8ee9792f2f256b6 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 2 Dec 2025 13:44:47 +1100 Subject: psp-sev: Assign numbers to all status codes and add new Make the definitions explicit. Add some more new codes. The following patches will be using SPDM_REQUEST and EXPAND_BUFFER_LENGTH_REQUEST, others are useful for the PSP FW diagnostics. 
Signed-off-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251202024449.542361-3-aik@amd.com Acked-by: Tom Lendacky Signed-off-by: Dan Williams --- include/uapi/linux/psp-sev.h | 66 +++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index c2fd324623c4..2b5b042eb73b 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -47,32 +47,32 @@ typedef enum { * with possible values from the specification. */ SEV_RET_NO_FW_CALL = -1, - SEV_RET_SUCCESS = 0, - SEV_RET_INVALID_PLATFORM_STATE, - SEV_RET_INVALID_GUEST_STATE, - SEV_RET_INAVLID_CONFIG, + SEV_RET_SUCCESS = 0, + SEV_RET_INVALID_PLATFORM_STATE = 0x0001, + SEV_RET_INVALID_GUEST_STATE = 0x0002, + SEV_RET_INAVLID_CONFIG = 0x0003, SEV_RET_INVALID_CONFIG = SEV_RET_INAVLID_CONFIG, - SEV_RET_INVALID_LEN, - SEV_RET_ALREADY_OWNED, - SEV_RET_INVALID_CERTIFICATE, - SEV_RET_POLICY_FAILURE, - SEV_RET_INACTIVE, - SEV_RET_INVALID_ADDRESS, - SEV_RET_BAD_SIGNATURE, - SEV_RET_BAD_MEASUREMENT, - SEV_RET_ASID_OWNED, - SEV_RET_INVALID_ASID, - SEV_RET_WBINVD_REQUIRED, - SEV_RET_DFFLUSH_REQUIRED, - SEV_RET_INVALID_GUEST, - SEV_RET_INVALID_COMMAND, - SEV_RET_ACTIVE, - SEV_RET_HWSEV_RET_PLATFORM, - SEV_RET_HWSEV_RET_UNSAFE, - SEV_RET_UNSUPPORTED, - SEV_RET_INVALID_PARAM, - SEV_RET_RESOURCE_LIMIT, - SEV_RET_SECURE_DATA_INVALID, + SEV_RET_INVALID_LEN = 0x0004, + SEV_RET_ALREADY_OWNED = 0x0005, + SEV_RET_INVALID_CERTIFICATE = 0x0006, + SEV_RET_POLICY_FAILURE = 0x0007, + SEV_RET_INACTIVE = 0x0008, + SEV_RET_INVALID_ADDRESS = 0x0009, + SEV_RET_BAD_SIGNATURE = 0x000A, + SEV_RET_BAD_MEASUREMENT = 0x000B, + SEV_RET_ASID_OWNED = 0x000C, + SEV_RET_INVALID_ASID = 0x000D, + SEV_RET_WBINVD_REQUIRED = 0x000E, + SEV_RET_DFFLUSH_REQUIRED = 0x000F, + SEV_RET_INVALID_GUEST = 0x0010, + SEV_RET_INVALID_COMMAND = 0x0011, + SEV_RET_ACTIVE = 0x0012, + SEV_RET_HWSEV_RET_PLATFORM = 0x0013, + 
SEV_RET_HWSEV_RET_UNSAFE = 0x0014, + SEV_RET_UNSUPPORTED = 0x0015, + SEV_RET_INVALID_PARAM = 0x0016, + SEV_RET_RESOURCE_LIMIT = 0x0017, + SEV_RET_SECURE_DATA_INVALID = 0x0018, SEV_RET_INVALID_PAGE_SIZE = 0x0019, SEV_RET_INVALID_PAGE_STATE = 0x001A, SEV_RET_INVALID_MDATA_ENTRY = 0x001B, @@ -87,6 +87,22 @@ typedef enum { SEV_RET_RESTORE_REQUIRED = 0x0025, SEV_RET_RMP_INITIALIZATION_FAILED = 0x0026, SEV_RET_INVALID_KEY = 0x0027, + SEV_RET_SHUTDOWN_INCOMPLETE = 0x0028, + SEV_RET_INCORRECT_BUFFER_LENGTH = 0x0030, + SEV_RET_EXPAND_BUFFER_LENGTH_REQUEST = 0x0031, + SEV_RET_SPDM_REQUEST = 0x0032, + SEV_RET_SPDM_ERROR = 0x0033, + SEV_RET_SEV_STATUS_ERR_IN_DEV_CONN = 0x0035, + SEV_RET_SEV_STATUS_INVALID_DEV_CTX = 0x0036, + SEV_RET_SEV_STATUS_INVALID_TDI_CTX = 0x0037, + SEV_RET_SEV_STATUS_INVALID_TDI = 0x0038, + SEV_RET_SEV_STATUS_RECLAIM_REQUIRED = 0x0039, + SEV_RET_IN_USE = 0x003A, + SEV_RET_SEV_STATUS_INVALID_DEV_STATE = 0x003B, + SEV_RET_SEV_STATUS_INVALID_TDI_STATE = 0x003C, + SEV_RET_SEV_STATUS_DEV_CERT_CHANGED = 0x003D, + SEV_RET_SEV_STATUS_RESYNC_REQ = 0x003E, + SEV_RET_SEV_STATUS_RESPONSE_TOO_LARGE = 0x003F, SEV_RET_MAX, } sev_ret_code; -- cgit v1.2.3 From eeb934137debfbe98be61a27756a605edf492ed3 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 2 Dec 2025 13:44:48 +1100 Subject: iommu/amd: Report SEV-TIO support The SEV-TIO switch in the AMD BIOS is reported to the OS via the IOMMU Extended Feature 2 register (EFR2), bit 1. Add helper to parse the bit and report the feature presence. 
Signed-off-by: Alexey Kardashevskiy Link: https://patch.msgid.link/20251202024449.542361-4-aik@amd.com Acked-by: Joerg Roedel Reviewed-by: Vasant Hegde Acked-by: Tom Lendacky Signed-off-by: Dan Williams --- include/linux/amd-iommu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 8cced632ecd0..0f64f09d1f34 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -18,10 +18,12 @@ struct task_struct; struct pci_dev; extern void amd_iommu_detect(void); +extern bool amd_iommu_sev_tio_supported(void); #else /* CONFIG_AMD_IOMMU */ static inline void amd_iommu_detect(void) { } +static inline bool amd_iommu_sev_tio_supported(void) { return false; } #endif /* CONFIG_AMD_IOMMU */ -- cgit v1.2.3 From 4be423572da1f4c11f45168e3fafda870ddac9f8 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Tue, 2 Dec 2025 13:44:49 +1100 Subject: crypto/ccp: Implement SEV-TIO PCIe IDE (phase1) Implement the SEV-TIO (Trusted I/O) firmware interface for PCIe TDISP (TEE Device Interface Security Protocol). This enables secure communication between trusted domains and PCIe devices through the PSP (Platform Security Processor). The implementation includes: - Device Security Manager (DSM) operations for establishing secure links - SPDM (Security Protocol and Data Model) over DOE (Data Object Exchange) - IDE (Integrity and Data Encryption) stream management for secure PCIe This module bridges the SEV firmware stack with the generic PCIe TSM framework. This is phase1 as described in Documentation/driver-api/pci/tsm.rst. On AMD SEV, the AMD PSP firmware acts as TSM (manages the security/trust). The CCP driver provides the interface to it and registers in the TSM subsystem. Detect the PSP support (reported via FEATURE_INFO + SNP_PLATFORM_STATUS) and enable SEV-TIO in the SNP_INIT_EX call if the hardware supports TIO.
Implement SEV TIO PSP command wrappers in sev-dev-tio.c and store the data in the SEV-TIO-specific structs. Implement TSM hooks and IDE setup in sev-dev-tsm.c. Signed-off-by: Alexey Kardashevskiy Link: https://patch.msgid.link/692f506bb80c9_261c11004@dwillia2-mobl4.notmuch Acked-by: Tom Lendacky Signed-off-by: Dan Williams --- include/linux/psp-sev.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index 34a25209f909..cce864dbf281 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -109,6 +109,13 @@ enum sev_cmd { SEV_CMD_SNP_VLEK_LOAD = 0x0CD, SEV_CMD_SNP_FEATURE_INFO = 0x0CE, + /* SEV-TIO commands */ + SEV_CMD_TIO_STATUS = 0x0D0, + SEV_CMD_TIO_INIT = 0x0D1, + SEV_CMD_TIO_DEV_CREATE = 0x0D2, + SEV_CMD_TIO_DEV_RECLAIM = 0x0D3, + SEV_CMD_TIO_DEV_CONNECT = 0x0D4, + SEV_CMD_TIO_DEV_DISCONNECT = 0x0D5, SEV_CMD_MAX, }; @@ -750,7 +757,8 @@ struct sev_data_snp_init_ex { u32 list_paddr_en:1; u32 rapl_dis:1; u32 ciphertext_hiding_en:1; - u32 rsvd:28; + u32 tio_en:1; + u32 rsvd:27; u32 rsvd1; u64 list_paddr; u16 max_snp_asid; @@ -850,6 +858,7 @@ struct snp_feature_info { } __packed; #define SNP_CIPHER_TEXT_HIDING_SUPPORTED BIT(3) +#define SNP_SEV_TIO_SUPPORTED BIT(1) /* EBX */ #ifdef CONFIG_CRYPTO_DEV_SP_PSP -- cgit v1.2.3 From f7231cff1f3ff8259bef02dc4999bc132abf29cf Mon Sep 17 00:00:00 2001 From: Jacopo Mondi Date: Wed, 3 Dec 2025 08:55:34 +0000 Subject: media: uapi: c3-isp: Fix documentation warning Building htmldocs generates a warning: WARNING: include/uapi/linux/media/amlogic/c3-isp-config.h:199 error: Cannot parse struct or union! Which correctly highlights that the c3_isp_params_block_header symbol is wrongly documented as a struct while it's a plain #define instead. Fix this by removing the 'struct' identifier from the documentation of the c3_isp_params_block_header symbol. 
[ribalda: Add Closes:] Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/all/20251127131425.4b5b6644@canb.auug.org.au/ Fixes: 45662082855c ("media: uapi: Convert Amlogic C3 to V4L2 extensible params") Cc: stable@vger.kernel.org Signed-off-by: Jacopo Mondi Signed-off-by: Ricardo Ribalda Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/media/amlogic/c3-isp-config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/media/amlogic/c3-isp-config.h b/include/uapi/linux/media/amlogic/c3-isp-config.h index 0a3c1cc55ccb..92db5dcdda18 100644 --- a/include/uapi/linux/media/amlogic/c3-isp-config.h +++ b/include/uapi/linux/media/amlogic/c3-isp-config.h @@ -186,7 +186,7 @@ enum c3_isp_params_block_type { #define C3_ISP_PARAMS_BLOCK_FL_ENABLE V4L2_ISP_PARAMS_FL_BLOCK_ENABLE /** - * struct c3_isp_params_block_header - C3 ISP parameter block header + * c3_isp_params_block_header - C3 ISP parameter block header * * This structure represents the common part of all the ISP configuration * blocks and is identical to :c:type:`v4l2_isp_params_block_header`. -- cgit v1.2.3 From 305c8dc477175eb29df18accc95c868acd2cdd4e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Dec 2025 09:59:38 -0800 Subject: objtool: Consolidate annotation macros Consolidate __ASM_ANNOTATE into a single macro which is used by both C and asm. This also makes the code generation a bit more palatable by putting it all on a single line. Turn this: 911: .pushsection .discard.annotate_insn,"M", @progbits, 8 .long 911b - . 
.long 1 .popsection jmp __x86_return_thunk Into: 911: .pushsection ".discard.annotate_insn", "M", @progbits, 8; .long 911b - .; .long 1; .popsection jmp __x86_return_thunk Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://patch.msgid.link/c05ff40d3383e85c3b59018ef0b3c7aaf993a60d.1764694625.git.jpoimboe@kernel.org --- include/linux/annotate.h | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/annotate.h b/include/linux/annotate.h index 7c10d34d198c..996126f5f9ec 100644 --- a/include/linux/annotate.h +++ b/include/linux/annotate.h @@ -6,41 +6,35 @@ #ifdef CONFIG_OBJTOOL -#ifndef __ASSEMBLY__ - #define __ASM_ANNOTATE(section, label, type) \ - ".pushsection " section ",\"M\", @progbits, 8\n\t" \ - ".long " __stringify(label) " - .\n\t" \ - ".long " __stringify(type) "\n\t" \ - ".popsection\n\t" + .pushsection section, "M", @progbits, 8; \ + .long label - .; \ + .long type; \ + .popsection + +#ifndef __ASSEMBLY__ #define ASM_ANNOTATE_LABEL(label, type) \ - __ASM_ANNOTATE(".discard.annotate_insn", label, type) + __stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type)) "\n\t" #define ASM_ANNOTATE(type) \ - "911:\n\t" \ - ASM_ANNOTATE_LABEL(911b, type) + "911: " \ + __stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type)) "\n\t" #define ASM_ANNOTATE_DATA(type) \ - "912:\n\t" \ - __ASM_ANNOTATE(".discard.annotate_data", 912b, type) + "912: " \ + __stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type)) "\n\t" #else /* __ASSEMBLY__ */ -.macro __ANNOTATE section, type -.Lhere_\@: - .pushsection \section, "M", @progbits, 8 - .long .Lhere_\@ - . 
- .long \type - .popsection -.endm - .macro ANNOTATE type - __ANNOTATE ".discard.annotate_insn", \type +.Lhere_\@: + __ASM_ANNOTATE(".discard.annotate_insn", .Lhere_\@, \type) .endm .macro ANNOTATE_DATA type - __ANNOTATE ".discard.annotate_data", \type +.Lhere_\@: + __ASM_ANNOTATE(".discard.annotate_data", .Lhere_\@, \type) .endm #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From ed3bf863dc9150b56233b01ec073cbbd1fc9c6a3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 2 Dec 2025 09:59:39 -0800 Subject: objtool: Remove newlines and tabs from annotation macros Remove newlines and tabs from the annotation macros so the invoking code can insert them as needed to match the style of the surrounding code. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://patch.msgid.link/66305834c2eb78f082217611b756231ae9c0b555.1764694625.git.jpoimboe@kernel.org --- include/linux/annotate.h | 6 +++--- include/linux/objtool.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/annotate.h b/include/linux/annotate.h index 996126f5f9ec..5efac5d4f9cf 100644 --- a/include/linux/annotate.h +++ b/include/linux/annotate.h @@ -15,15 +15,15 @@ #ifndef __ASSEMBLY__ #define ASM_ANNOTATE_LABEL(label, type) \ - __stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type)) "\n\t" + __stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type)) #define ASM_ANNOTATE(type) \ "911: " \ - __stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type)) "\n\t" + __stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type)) #define ASM_ANNOTATE_DATA(type) \ "912: " \ - __stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type)) "\n\t" + __stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type)) #else /* __ASSEMBLY__ */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index b18ab53561c9..9a00e701454c 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -12,7 
+12,7 @@ #define UNWIND_HINT(type, sp_reg, sp_offset, signal) \ "987: \n\t" \ ".pushsection .discard.unwind_hints\n\t" \ - ANNOTATE_DATA_SPECIAL \ + ANNOTATE_DATA_SPECIAL "\n\t" \ /* struct unwind_hint */ \ ".long 987b - .\n\t" \ ".short " __stringify(sp_offset) "\n\t" \ -- cgit v1.2.3 From 2d3451ef1ef679ae496f8e335f4b1305885e8083 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 3 Dec 2025 10:07:38 -0800 Subject: objtool: Simplify .annotate_insn code generation output some more Remove the superfluous section name quotes, and combine the longs into a single command. Before: 911: .pushsection ".discard.annotate_insn", "M", @progbits, 8; .long 911b - .; .long 2; .popsection After: 911: .pushsection .discard.annotate_insn, "M", @progbits, 8; .long 911b - ., 2; .popsection No change in functionality. Suggested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://patch.msgid.link/hpsfcihgqmhcdrg7pop7z73ptymakgjq7qlxrawrjxilosk43l@xikqif3ievj4 --- include/linux/annotate.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/annotate.h b/include/linux/annotate.h index 5efac5d4f9cf..2f1599c9e573 100644 --- a/include/linux/annotate.h +++ b/include/linux/annotate.h @@ -8,33 +8,32 @@ #define __ASM_ANNOTATE(section, label, type) \ .pushsection section, "M", @progbits, 8; \ - .long label - .; \ - .long type; \ + .long label - ., type; \ .popsection #ifndef __ASSEMBLY__ #define ASM_ANNOTATE_LABEL(label, type) \ - __stringify(__ASM_ANNOTATE(".discard.annotate_insn", label, type)) + __stringify(__ASM_ANNOTATE(.discard.annotate_insn, label, type)) #define ASM_ANNOTATE(type) \ "911: " \ - __stringify(__ASM_ANNOTATE(".discard.annotate_insn", 911b, type)) + __stringify(__ASM_ANNOTATE(.discard.annotate_insn, 911b, type)) #define ASM_ANNOTATE_DATA(type) \ "912: " \ - __stringify(__ASM_ANNOTATE(".discard.annotate_data", 912b, type)) + 
__stringify(__ASM_ANNOTATE(.discard.annotate_data, 912b, type)) #else /* __ASSEMBLY__ */ .macro ANNOTATE type .Lhere_\@: - __ASM_ANNOTATE(".discard.annotate_insn", .Lhere_\@, \type) + __ASM_ANNOTATE(.discard.annotate_insn, .Lhere_\@, \type) .endm .macro ANNOTATE_DATA type .Lhere_\@: - __ASM_ANNOTATE(".discard.annotate_data", .Lhere_\@, \type) + __ASM_ANNOTATE(.discard.annotate_data, .Lhere_\@, \type) .endm #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From faf07e611dfa464b201223a7253e9dc5ee0f3c9e Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 30 Sep 2025 15:58:02 +0300 Subject: tpm: Cap the number of PCR banks tpm2_get_pcr_allocation() does not enforce any upper limit on the number of banks. Cap the limit to eight banks so that out-of-bounds values coming from external I/O cause only limited harm. Cc: stable@vger.kernel.org # v5.10+ Fixes: bcfff8384f6c ("tpm: dynamically allocate the allocated_banks array") Tested-by: Lai Yi Reviewed-by: Jonathan McDowell Reviewed-by: Roberto Sassu Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index b15360ff78d7..53de9488c509 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -26,7 +26,9 @@ #include #define TPM_DIGEST_SIZE 20 /* Max TPM v1.2 PCR size */ -#define TPM_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE + +#define TPM2_MAX_DIGEST_SIZE SHA512_DIGEST_SIZE +#define TPM2_MAX_PCR_BANKS 8 struct tpm_chip; struct trusted_key_payload; @@ -68,7 +70,7 @@ enum tpm2_curves { struct tpm_digest { u16 alg_id; - u8 digest[TPM_MAX_DIGEST_SIZE]; + u8 digest[TPM2_MAX_DIGEST_SIZE]; } __packed; struct tpm_bank_info { @@ -189,7 +191,7 @@ struct tpm_chip { unsigned int groups_cnt; u32 nr_allocated_banks; - struct tpm_bank_info *allocated_banks; + struct tpm_bank_info allocated_banks[TPM2_MAX_PCR_BANKS]; #ifdef CONFIG_ACPI acpi_handle acpi_dev_handle; char ppi_version[TPM_PPI_VERSION_LEN + 1]; --
cgit v1.2.3 From 7fcf459ac84c42a4ef63a650dccc345602cf4da6 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 30 Sep 2025 16:02:54 +0300 Subject: tpm: Use -EPERM as fallback error code in tpm_ret_to_err Using -EFAULT as the tpm_ret_to_err() fallback error code makes it incompatible with how trusted keys transmute TPM return codes. Change the fallback to -EPERM in order to gain compatibility with trusted keys. In addition, map TPM2_RC_HASH to -EINVAL in order to be compatible with tpm2_seal_trusted() return values. Signed-off-by: Jarkko Sakkinen Reviewed-by: Stefano Garzarella Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 53de9488c509..3d8f7d1ce2b8 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -456,8 +456,10 @@ static inline ssize_t tpm_ret_to_err(ssize_t ret) return 0; case TPM2_RC_SESSION_MEMORY: return -ENOMEM; + case TPM2_RC_HASH: + return -EINVAL; default: - return -EFAULT; + return -EPERM; } } -- cgit v1.2.3 From 5b5578c3b06eba4c256bc3a2788f5a65cd9f31ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 29 Oct 2025 14:31:04 +0800 Subject: f2fs: fix to access i_size w/ i_size_read() It is recommended to use i_size_{read,write}() to access and update i_size; otherwise, we may read a torn value, because the high 32 bits and the low 32 bits of the i_size field are not updated atomically on 32-bit architectures.
Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index edbbd869078f..e1fae78d64a5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -204,7 +204,7 @@ DECLARE_EVENT_CLASS(f2fs__inode, __entry->pino = F2FS_I(inode)->i_pino; __entry->mode = inode->i_mode; __entry->nlink = inode->i_nlink; - __entry->size = inode->i_size; + __entry->size = i_size_read(inode); __entry->blocks = inode->i_blocks; __entry->advise = F2FS_I(inode)->i_advise; ), @@ -353,7 +353,7 @@ TRACE_EVENT(f2fs_unlink_enter, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->size = dir->i_size; + __entry->size = i_size_read(dir); __entry->blocks = dir->i_blocks; __assign_str(name); ), @@ -433,7 +433,7 @@ DECLARE_EVENT_CLASS(f2fs__truncate_op, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->size = inode->i_size; + __entry->size = i_size_read(inode); __entry->blocks = inode->i_blocks; __entry->from = from; ), @@ -1006,7 +1006,7 @@ TRACE_EVENT(f2fs_fallocate, __entry->mode = mode; __entry->offset = offset; __entry->len = len; - __entry->size = inode->i_size; + __entry->size = i_size_read(inode); __entry->blocks = inode->i_blocks; __entry->ret = ret; ), -- cgit v1.2.3 From 2e2e0d679a1fb88a960049496373f415b67f274f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Oct 2025 19:50:11 +0000 Subject: f2fs: add fadvise tracepoint This adds a tracepoint in the fadvise call path. 
Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e1fae78d64a5..e00611ead024 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -586,6 +586,38 @@ TRACE_EVENT(f2fs_file_write_iter, __entry->ret) ); +TRACE_EVENT(f2fs_fadvise, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int advice), + + TP_ARGS(inode, offset, len, advice), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, size) + __field(loff_t, offset) + __field(loff_t, len) + __field(int, advice) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->size = i_size_read(inode); + __entry->offset = offset; + __entry->len = len; + __entry->advice = advice; + ), + + TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld offset:%llu, len:%llu, advise:%d", + show_dev_ino(__entry), + (unsigned long long)__entry->size, + __entry->offset, + __entry->len, + __entry->advice) +); + TRACE_EVENT(f2fs_map_blocks, TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int flag, int ret), -- cgit v1.2.3 From 7ee8bc3942f20964ad730871b885688ea3a2961a Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 11 Nov 2025 09:52:46 -0800 Subject: f2fs: revert summary entry count from 2048 to 512 in 16kb block support The recent increase in the number of Segment Summary Area (SSA) entries from 512 to 2048 was an unintentional change in logic of 16kb block support. This commit corrects the issue. To better utilize the space available from the erroneous 2048-entry calculation, we are implementing a solution to share the currently unused SSA space with neighboring segments. This enhances overall SSA utilization without impacting the established 8MB segment size. 
Fixes: d7e9a9037de2 ("f2fs: Support Block Size == Page Size") Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/linux/f2fs_fs.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 6afb4a13b81d..a7880787cad3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -17,6 +17,7 @@ #define F2FS_LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) /* log number for sector/blk */ #define F2FS_BLKSIZE PAGE_SIZE /* support only block == page */ #define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */ +#define F2FS_SUM_BLKSIZE 4096 /* only support 4096 byte sum block */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ #define F2FS_EXTENSION_LEN 8 /* max size of extension */ @@ -441,7 +442,7 @@ struct f2fs_sit_block { * from node's page's beginning to get a data block address. * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node) */ -#define ENTRIES_IN_SUM (F2FS_BLKSIZE / 8) +#define ENTRIES_IN_SUM (F2FS_SUM_BLKSIZE / 8) #define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */ #define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */ #define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM) @@ -467,7 +468,7 @@ struct summary_footer { __le32 check_sum; /* summary checksum */ } __packed; -#define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ +#define SUM_JOURNAL_SIZE (F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\ SUM_ENTRY_SIZE) #define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\ sizeof(struct nat_journal_entry)) -- cgit v1.2.3 From 8d1cb17aca466b361cca17834b8bb1cf3e3d1818 Mon Sep 17 00:00:00 2001 From: YH Lin Date: Fri, 28 Nov 2025 11:23:57 +0800 Subject: f2fs: optimize trace_f2fs_write_checkpoint with enums This patch optimizes the tracepoint by replacing these hardcoded strings with a new enumeration f2fs_cp_phase. 1.Defines enum f2fs_cp_phase with values for each checkpoint phase. 
2.Updates trace_f2fs_write_checkpoint to accept a u16 phase argument instead of a string pointer. 3.Uses __print_symbolic in TP_printk to convert the enum values back to their corresponding strings for human-readable trace output. This change reduces the storage overhead for each trace event by replacing a variable-length string with a 2-byte integer, while maintaining the same readable output in ftrace. Signed-off-by: YH Lin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e00611ead024..df4017dcc701 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -50,6 +50,9 @@ TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); TRACE_DEFINE_ENUM(EX_BLOCK_AGE); +TRACE_DEFINE_ENUM(CP_PHASE_START_BLOCK_OPS); +TRACE_DEFINE_ENUM(CP_PHASE_FINISH_BLOCK_OPS); +TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); #define show_block_type(type) \ __print_symbolic(type, \ @@ -175,6 +178,12 @@ TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define S_ALL_PERM (S_ISUID | S_ISGID | S_ISVTX | \ S_IRWXU | S_IRWXG | S_IRWXO) +#define show_cp_phase(phase) \ + __print_symbolic(phase, \ + { CP_PHASE_START_BLOCK_OPS, "start block_ops" }, \ + { CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \ + { CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1573,26 +1582,26 @@ TRACE_EVENT(f2fs_readpages, TRACE_EVENT(f2fs_write_checkpoint, - TP_PROTO(struct super_block *sb, int reason, const char *msg), + TP_PROTO(struct super_block *sb, int reason, u16 phase), - TP_ARGS(sb, reason, msg), + TP_ARGS(sb, reason, phase), TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) - __string(dest_msg, msg) + __field(u16, phase) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; - 
__assign_str(dest_msg); + __entry->phase = phase; ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), - __get_str(dest_msg)) + show_cp_phase(__entry->phase)) ); DECLARE_EVENT_CLASS(f2fs_discard, -- cgit v1.2.3 From f345be751b961ce91e0b883345eaa1d0993a4949 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Tue, 2 Dec 2025 11:21:31 -0700 Subject: io_uring/trace: rename io_uring_queue_async_work event "rw" field The io_uring_queue_async_work tracepoint event stores an int rw field that represents whether the work item is hashed. Rename it to "hashed" and change its type to bool to more accurately reflect its value. Signed-off-by: Caleb Sander Mateos Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 45d15460b495..34b31a855ea4 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -133,15 +133,15 @@ TRACE_EVENT(io_uring_file_get, * io_uring_queue_async_work - called before submitting a new async work * * @req: pointer to a submitted request - * @rw: type of workqueue, hashed or normal + * @hashed: whether async work is hashed * * Allows to trace asynchronous work submission. 
*/ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(struct io_kiocb *req, int rw), + TP_PROTO(struct io_kiocb *req, bool hashed), - TP_ARGS(req, rw), + TP_ARGS(req, hashed), TP_STRUCT__entry ( __field( void *, ctx ) @@ -150,7 +150,7 @@ TRACE_EVENT(io_uring_queue_async_work, __field( u8, opcode ) __field( unsigned long long, flags ) __field( struct io_wq_work *, work ) - __field( int, rw ) + __field( bool, hashed ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), @@ -162,7 +162,7 @@ TRACE_EVENT(io_uring_queue_async_work, __entry->flags = (__force unsigned long long) req->flags; __entry->opcode = req->opcode; __entry->work = &req->work; - __entry->rw = rw; + __entry->hashed = hashed; __assign_str(op_str); ), @@ -170,7 +170,7 @@ TRACE_EVENT(io_uring_queue_async_work, TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, - __entry->rw ? "hashed" : "normal", __entry->work) + __entry->hashed ? "hashed" : "normal", __entry->work) ); /** -- cgit v1.2.3 From 22a1ffea5f805dfa21b64d1c7b5fe39c0c78c997 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 1 Dec 2025 16:43:28 -0500 Subject: block: add IOC_PR_READ_KEYS ioctl Add a Persistent Reservations ioctl to read the list of currently registered reservation keys. This calls the pr_ops->read_keys() function that was previously added in commit c787f1baa503 ("block: Add PR callouts for read keys and reservation") but was only used by the in-kernel SCSI target so far. The IOC_PR_READ_KEYS ioctl is necessary so that userspace applications that rely on Persistent Reservations ioctls have a way of inspecting the current state. Cluster managers and validation tests need this functionality. Signed-off-by: Stefan Hajnoczi Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/uapi/linux/pr.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h index d8126415966f..fcb74eab92c8 100644 --- a/include/uapi/linux/pr.h +++ b/include/uapi/linux/pr.h @@ -56,6 +56,12 @@ struct pr_clear { __u32 __pad; }; +struct pr_read_keys { + __u32 generation; + __u32 num_keys; + __u64 keys_ptr; +}; + #define PR_FL_IGNORE_KEY (1 << 0) /* ignore existing key */ #define IOC_PR_REGISTER _IOW('p', 200, struct pr_registration) @@ -64,5 +70,6 @@ struct pr_clear { #define IOC_PR_PREEMPT _IOW('p', 203, struct pr_preempt) #define IOC_PR_PREEMPT_ABORT _IOW('p', 204, struct pr_preempt) #define IOC_PR_CLEAR _IOW('p', 205, struct pr_clear) +#define IOC_PR_READ_KEYS _IOWR('p', 206, struct pr_read_keys) #endif /* _UAPI_PR_H */ -- cgit v1.2.3 From 3e2cb9ee76c27f57bfdb7b4753b909594d4fa31a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 1 Dec 2025 16:43:29 -0500 Subject: block: add IOC_PR_READ_RESERVATION ioctl Add a Persistent Reservations ioctl to read the current reservation. This calls the pr_ops->read_reservation() function that was previously added in commit c787f1baa503 ("block: Add PR callouts for read keys and reservation") but was only used by the in-kernel SCSI target so far. The IOC_PR_READ_RESERVATION ioctl is necessary so that userspace applications that rely on Persistent Reservations ioctls have a way of inspecting the current state. Cluster managers and validation tests need this functionality. Signed-off-by: Stefan Hajnoczi Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- include/uapi/linux/pr.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h index fcb74eab92c8..847f3051057a 100644 --- a/include/uapi/linux/pr.h +++ b/include/uapi/linux/pr.h @@ -62,6 +62,12 @@ struct pr_read_keys { __u64 keys_ptr; }; +struct pr_read_reservation { + __u64 key; + __u32 generation; + __u32 type; +}; + #define PR_FL_IGNORE_KEY (1 << 0) /* ignore existing key */ #define IOC_PR_REGISTER _IOW('p', 200, struct pr_registration) @@ -71,5 +77,6 @@ struct pr_read_keys { #define IOC_PR_PREEMPT_ABORT _IOW('p', 204, struct pr_preempt) #define IOC_PR_CLEAR _IOW('p', 205, struct pr_clear) #define IOC_PR_READ_KEYS _IOWR('p', 206, struct pr_read_keys) +#define IOC_PR_READ_RESERVATION _IOR('p', 207, struct pr_read_reservation) #endif /* _UAPI_PR_H */ -- cgit v1.2.3 From 71075d25ca5cae732fb57da065fbf14aeb3bcfc7 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Tue, 2 Dec 2025 19:58:09 -0800 Subject: blk-mq: add blk_rq_nr_bvec() helper Add a new helper function blk_rq_nr_bvec() that returns the number of bvecs in a request. This count represents the number of iterations rq_for_each_bvec() would perform on a request. Drivers need to pre-allocate bvec arrays before iterating through a request's bvecs. Currently, they manually count bvecs using rq_for_each_bvec() in a loop, which is repetitive. The new helper centralizes this logic. This pattern exists in loop and zloop drivers, where multi-bio requests require copying bvecs into a contiguous array before creating an iov_iter for file operations. Update loop and zloop drivers to use the new helper, eliminating duplicate code. 
This patch also provides a clear API to avoid any potential misuse of blk_rq_nr_phys_segments() for calculating the number of bvecs, since one bvec can span more than one segment and using blk_rq_nr_phys_segments() can lead to extra memory allocation: [ 6155.673749] nullb_bio: 128K bio as ONE bvec: sector=0, size=131072 [ 6155.673846] null_blk: #### null_handle_data_transfer:1375 [ 6155.673850] null_blk: nr_bvec=1 blk_rq_nr_phys_segments=2 [ 6155.674263] null_blk: #### null_handle_data_transfer:1375 [ 6155.674267] null_blk: nr_bvec=1 blk_rq_nr_phys_segments=1 Reviewed-by: Niklas Cassel Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index eb7254b3dddd..cae9e857aea4 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -1213,6 +1213,24 @@ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) return max_t(unsigned short, rq->nr_phys_segments, 1); } +/** + * blk_rq_nr_bvec - return number of bvecs in a request + * @rq: request to calculate bvecs for + * + * Returns the number of bvecs. + */ +static inline unsigned int blk_rq_nr_bvec(struct request *rq) +{ + struct req_iterator rq_iter; + struct bio_vec bv; + unsigned int nr_bvec = 0; + + rq_for_each_bvec(bv, rq, rq_iter) + nr_bvec++; + + return nr_bvec; +} + int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg); static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) -- cgit v1.2.3 From 41f7351fc47283822c4b70b0f42741f52cc1e6f6 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Tue, 2 Dec 2025 11:30:25 -0800 Subject: PM: runtime: Make pm_runtime_barrier() return void No callers check the return code, and that's a good thing. Doing so would be racy and unhelpful.
Drop the return code entirely, so we don't make anyone think about its complexities. Signed-off-by: Brian Norris Tested-by: Guenter Roeck Link: https://patch.msgid.link/20251202193129.1411419-2-briannorris@chromium.org Signed-off-by: Rafael J. Wysocki --- include/linux/pm_runtime.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 911d7a4d32c1..41037c513f06 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -76,7 +76,7 @@ extern int pm_runtime_get_if_active(struct device *dev); extern int pm_runtime_get_if_in_use(struct device *dev); extern int pm_schedule_suspend(struct device *dev, unsigned int delay); extern int __pm_runtime_set_status(struct device *dev, unsigned int status); -extern int pm_runtime_barrier(struct device *dev); +extern void pm_runtime_barrier(struct device *dev); extern bool pm_runtime_block_if_disabled(struct device *dev); extern void pm_runtime_unblock(struct device *dev); extern void pm_runtime_enable(struct device *dev); @@ -284,7 +284,7 @@ static inline int pm_runtime_get_if_active(struct device *dev) } static inline int __pm_runtime_set_status(struct device *dev, unsigned int status) { return 0; } -static inline int pm_runtime_barrier(struct device *dev) { return 0; } +static inline void pm_runtime_barrier(struct device *dev) {} static inline bool pm_runtime_block_if_disabled(struct device *dev) { return true; } static inline void pm_runtime_unblock(struct device *dev) {} static inline void pm_runtime_enable(struct device *dev) {} -- cgit v1.2.3 From 8a32282175c964eb15638e8dfe199fc13c060f67 Mon Sep 17 00:00:00 2001 From: shechenglong Date: Wed, 3 Dec 2025 23:17:49 +0800 Subject: block: fix comment for op_is_zone_mgmt() to include RESET_ALL REQ_OP_ZONE_RESET_ALL is a zone management request, and op_is_zone_mgmt() has returned true for it. 
Update the comment to remove the misleading exception note so the documentation matches the implementation. Fixes: 12a1c9353c47 ("block: fix op_is_zone_mgmt() to handle REQ_OP_ZONE_RESET_ALL") Signed-off-by: shechenglong Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cbbcb9051ec3..5dc061d318a4 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -479,10 +479,7 @@ static inline bool op_is_discard(blk_opf_t op) } /* - * Check if a bio or request operation is a zone management operation, with - * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case - * due to its different handling in the block layer and device response in - * case of command failure. + * Check if a bio or request operation is a zone management operation. */ static inline bool op_is_zone_mgmt(enum req_op op) { -- cgit v1.2.3 From 6e9722e9a7bfe1bbad649937c811076acf86e1fd Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Sun, 30 Nov 2025 21:07:12 +0200 Subject: tpm2-sessions: Fix out of range indexing in name_size 'name_size' does not have any range checks, and it just directly indexes with TPM_ALG_ID, which could lead to memory corruption at worst. Address the issue by only processing known values and returning -EINVAL for unrecognized values. Also make 'tpm_buf_append_name' and 'tpm_buf_fill_hmac_session' fallible so that errors are detected before causing any spurious TPM traffic. Also end the authorization session on failure in both functions, as the session state would then by definition be corrupted.
Cc: stable@vger.kernel.org # v6.10+ Fixes: 1085b8276bb4 ("tpm: Add the rest of the session HMAC API") Reviewed-by: Jonathan McDowell Signed-off-by: Jarkko Sakkinen --- include/linux/tpm.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index 3d8f7d1ce2b8..aa816b144ab3 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -529,8 +529,8 @@ static inline struct tpm2_auth *tpm2_chip_auth(struct tpm_chip *chip) #endif } -void tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, - u32 handle, u8 *name); +int tpm_buf_append_name(struct tpm_chip *chip, struct tpm_buf *buf, + u32 handle, u8 *name); void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, int passphraselen); @@ -563,7 +563,7 @@ static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, #ifdef CONFIG_TCG_TPM2_HMAC int tpm2_start_auth_session(struct tpm_chip *chip); -void tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf); +int tpm_buf_fill_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf); int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf, int rc); void tpm2_end_auth_session(struct tpm_chip *chip); @@ -577,10 +577,13 @@ static inline int tpm2_start_auth_session(struct tpm_chip *chip) static inline void tpm2_end_auth_session(struct tpm_chip *chip) { } -static inline void tpm_buf_fill_hmac_session(struct tpm_chip *chip, - struct tpm_buf *buf) + +static inline int tpm_buf_fill_hmac_session(struct tpm_chip *chip, + struct tpm_buf *buf) { + return 0; } + static inline int tpm_buf_check_hmac_response(struct tpm_chip *chip, struct tpm_buf *buf, int rc) -- cgit v1.2.3 From bc677a9216e1396322e42692e9c01cce04a7afc0 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 30 Sep 2025 16:07:35 +0300 Subject: tpm2-sessions: Remove 'attributes' parameter from tpm_buf_append_auth Remove 
'attributes' parameter from 'tpm_buf_append_auth', as it is not used by the function. Fixes: 27184f8905ba ("tpm: Opt-in in disable PCR integrity protection") Signed-off-by: Jarkko Sakkinen Reviewed-by: Jonathan McDowell --- include/linux/tpm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index aa816b144ab3..afa51723296a 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -535,7 +535,7 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, u8 *passphrase, int passphraselen); void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf, - u8 attributes, u8 *passphrase, int passphraselen); + u8 *passphrase, int passphraselen); static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, struct tpm_buf *buf, u8 attributes, -- cgit v1.2.3 From b7960b90486139022d2d39caad90db252c469bab Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 30 Sep 2025 23:44:19 +0300 Subject: tpm2-sessions: Open code tpm_buf_append_hmac_session() Open code 'tpm_buf_append_hmac_session_opt' to the call site, as it only masks a call sequence and does otherwise nothing particularly useful. 
Signed-off-by: Jarkko Sakkinen Reviewed-by: Jonathan McDowell --- include/linux/tpm.h | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'include') diff --git a/include/linux/tpm.h b/include/linux/tpm.h index afa51723296a..202da079d500 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -536,29 +536,6 @@ void tpm_buf_append_hmac_session(struct tpm_chip *chip, struct tpm_buf *buf, int passphraselen); void tpm_buf_append_auth(struct tpm_chip *chip, struct tpm_buf *buf, u8 *passphrase, int passphraselen); -static inline void tpm_buf_append_hmac_session_opt(struct tpm_chip *chip, - struct tpm_buf *buf, - u8 attributes, - u8 *passphrase, - int passphraselen) -{ - struct tpm_header *head; - int offset; - - if (tpm2_chip_auth(chip)) { - tpm_buf_append_hmac_session(chip, buf, attributes, passphrase, passphraselen); - } else { - offset = buf->handles * 4 + TPM_HEADER_SIZE; - head = (struct tpm_header *)buf->data; - - /* - * If the only sessions are optional, the command tag must change to - * TPM2_ST_NO_SESSIONS. - */ - if (tpm_buf_length(buf) == offset) - head->tag = cpu_to_be16(TPM2_ST_NO_SESSIONS); - } -} #ifdef CONFIG_TCG_TPM2_HMAC -- cgit v1.2.3 From fe93446b5ebdaa89a8f97b15668c077921a65140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Wed, 3 Dec 2025 14:57:57 +0100 Subject: vfs: use UAPI types for new struct delegation definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using libc types and headers from the UAPI headers is problematic as it introduces a dependency on a full C toolchain. Use the fixed-width integer types provided by the UAPI headers instead. 
Fixes: 1602bad16d7d ("vfs: expose delegation support to userland") Fixes: 4be9e04ebf75 ("vfs: add needed headers for new struct delegation definition") Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20251203-uapi-fcntl-v1-1-490c67bf3425@linutronix.de Acked-by: Arnd Bergmann Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- include/uapi/linux/fcntl.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 5e277fd955aa..aadfbf6e0cb3 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -4,11 +4,7 @@ #include #include -#ifdef __KERNEL__ #include -#else -#include -#endif #define F_SETLEASE (F_LINUX_SPECIFIC_BASE + 0) #define F_GETLEASE (F_LINUX_SPECIFIC_BASE + 1) @@ -90,9 +86,9 @@ /* Argument structure for F_GETDELEG and F_SETDELEG */ struct delegation { - uint32_t d_flags; /* Must be 0 */ - uint16_t d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */ - uint16_t __pad; /* Must be 0 */ + __u32 d_flags; /* Must be 0 */ + __u16 d_type; /* F_RDLCK, F_WRLCK, F_UNLCK */ + __u16 __pad; /* Must be 0 */ }; /* -- cgit v1.2.3 From 02e7769e38c87c92b82db59923d3b0598d153903 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Dec 2025 16:17:51 -0500 Subject: tracing: Fix enabling of tracing on file release The trace file will pause tracing if the tracing instance has the "pause-on-trace" option set. This happens when the file is opened, and it is unpaused when the file is closed. When this was first added, there was only one user that paused tracing. On open, the check to pause was: if (!iter->snapshot && (tr->trace_flags & TRACE_ITER(PAUSE_ON_TRACE))) Where if it is not the snapshot tracer and the "pause-on-trace" option is set, then it increments a "stop_count" of the trace instance. On close, the check is: if (!iter->snapshot && tr->stop_count) That is, if it is not the snapshot buffer and it was stopped, it will re-enable tracing.
Now there's more places that stop tracing. This means, if something else stops tracing the tr->stop_count will be non-zero, and that means if the trace file is closed, it will decrement the stop_count even though it never incremented it. This causes a warning because when the user that stopped tracing enables it again, the stop_count goes below zero. Instead of relying on the stop_count being set to know if the close of the trace file should enable tracing again, add a new flag to the trace iterator. The trace iterator is unique per open of the trace file, and if the open stops tracing set the trace iterator PAUSE flag. On close, if the PAUSE flag is set, then re-enable it again. Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Link: https://patch.msgid.link/20251202161751.24abaaf1@gandalf.local.home Fixes: 06e0a548bad0f ("tracing: Do not disable tracing when reading the trace file") Reported-by: syzbot+ccdec3bfe0beec58a38d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/692f44a5.a70a0220.2ea503.00c8.GAE@google.com/ Signed-off-by: Steven Rostedt (Google) --- include/linux/trace_events.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 04307a19cde3..3690221ba3d8 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -138,6 +138,7 @@ enum trace_iter_flags { TRACE_FILE_LAT_FMT = 1, TRACE_FILE_ANNOTATE = 2, TRACE_FILE_TIME_IN_NS = 4, + TRACE_FILE_PAUSE = 8, }; -- cgit v1.2.3 From 7bfe3b8ea6e30437e01fcb8e4f56ef6e4d986d0f Mon Sep 17 00:00:00 2001 From: Naman Jain Date: Thu, 13 Nov 2025 04:41:49 +0000 Subject: Drivers: hv: Introduce mshv_vtl driver Provide an interface for Virtual Machine Monitor like OpenVMM and its use as OpenHCL paravisor to control VTL0 (Virtual trust Level). 
Expose devices and support IOCTLs for features like VTL creation, VTL0 memory management, context switch, making hypercalls, mapping VTL0 address space to VTL2 userspace, getting new VMBus messages and channel events in VTL2 etc. Co-developed-by: Roman Kisel Signed-off-by: Roman Kisel Co-developed-by: Saurabh Sengar Signed-off-by: Saurabh Sengar Reviewed-by: Michael Kelley Signed-off-by: Naman Jain Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 106 ++++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/mshv.h | 80 +++++++++++++++++++++++++++++++++ 2 files changed, 186 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 7499a679e60a..1d5ce11be8b6 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -885,6 +885,48 @@ struct hv_get_vp_from_apic_id_in { u32 apic_ids[]; } __packed; +union hv_register_vsm_partition_config { + u64 as_uint64; + struct { + u64 enable_vtl_protection : 1; + u64 default_vtl_protection_mask : 4; + u64 zero_memory_on_reset : 1; + u64 deny_lower_vtl_startup : 1; + u64 intercept_acceptance : 1; + u64 intercept_enable_vtl_protection : 1; + u64 intercept_vp_startup : 1; + u64 intercept_cpuid_unimplemented : 1; + u64 intercept_unrecoverable_exception : 1; + u64 intercept_page : 1; + u64 mbz : 51; + } __packed; +}; + +union hv_register_vsm_capabilities { + u64 as_uint64; + struct { + u64 dr6_shared: 1; + u64 mbec_vtl_mask: 16; + u64 deny_lower_vtl_startup: 1; + u64 supervisor_shadow_stack: 1; + u64 hardware_hvpt_available: 1; + u64 software_hvpt_available: 1; + u64 hardware_hvpt_range_bits: 6; + u64 intercept_page_available: 1; + u64 return_action_available: 1; + u64 reserved: 35; + } __packed; +}; + +union hv_register_vsm_page_offsets { + struct { + u64 vtl_call_offset : 12; + u64 vtl_return_offset : 12; + u64 reserved_mbz : 40; + } __packed; + u64 as_uint64; +}; + struct hv_nested_enlightenments_control { struct { u32 directhypercall : 1; @@ 
-1007,6 +1049,70 @@ enum hv_register_name { /* VSM */ HV_REGISTER_VSM_VP_STATUS = 0x000D0003, + + /* Synthetic VSM registers */ + HV_REGISTER_VSM_CODE_PAGE_OFFSETS = 0x000D0002, + HV_REGISTER_VSM_CAPABILITIES = 0x000D0006, + HV_REGISTER_VSM_PARTITION_CONFIG = 0x000D0007, + +#if defined(CONFIG_X86) + /* X64 Debug Registers */ + HV_X64_REGISTER_DR0 = 0x00050000, + HV_X64_REGISTER_DR1 = 0x00050001, + HV_X64_REGISTER_DR2 = 0x00050002, + HV_X64_REGISTER_DR3 = 0x00050003, + HV_X64_REGISTER_DR6 = 0x00050004, + HV_X64_REGISTER_DR7 = 0x00050005, + + /* X64 Cache control MSRs */ + HV_X64_REGISTER_MSR_MTRR_CAP = 0x0008000D, + HV_X64_REGISTER_MSR_MTRR_DEF_TYPE = 0x0008000E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE0 = 0x00080010, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE1 = 0x00080011, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE2 = 0x00080012, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE3 = 0x00080013, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE4 = 0x00080014, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE5 = 0x00080015, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE6 = 0x00080016, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE7 = 0x00080017, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE8 = 0x00080018, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASE9 = 0x00080019, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEA = 0x0008001A, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEB = 0x0008001B, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEC = 0x0008001C, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASED = 0x0008001D, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEE = 0x0008001E, + HV_X64_REGISTER_MSR_MTRR_PHYS_BASEF = 0x0008001F, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK0 = 0x00080040, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK1 = 0x00080041, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK2 = 0x00080042, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK3 = 0x00080043, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK4 = 0x00080044, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK5 = 0x00080045, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK6 = 0x00080046, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK7 = 0x00080047, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASK8 = 0x00080048, + 
HV_X64_REGISTER_MSR_MTRR_PHYS_MASK9 = 0x00080049, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKA = 0x0008004A, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKB = 0x0008004B, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKC = 0x0008004C, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKD = 0x0008004D, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKE = 0x0008004E, + HV_X64_REGISTER_MSR_MTRR_PHYS_MASKF = 0x0008004F, + HV_X64_REGISTER_MSR_MTRR_FIX64K00000 = 0x00080070, + HV_X64_REGISTER_MSR_MTRR_FIX16K80000 = 0x00080071, + HV_X64_REGISTER_MSR_MTRR_FIX16KA0000 = 0x00080072, + HV_X64_REGISTER_MSR_MTRR_FIX4KC0000 = 0x00080073, + HV_X64_REGISTER_MSR_MTRR_FIX4KC8000 = 0x00080074, + HV_X64_REGISTER_MSR_MTRR_FIX4KD0000 = 0x00080075, + HV_X64_REGISTER_MSR_MTRR_FIX4KD8000 = 0x00080076, + HV_X64_REGISTER_MSR_MTRR_FIX4KE0000 = 0x00080077, + HV_X64_REGISTER_MSR_MTRR_FIX4KE8000 = 0x00080078, + HV_X64_REGISTER_MSR_MTRR_FIX4KF0000 = 0x00080079, + HV_X64_REGISTER_MSR_MTRR_FIX4KF8000 = 0x0008007A, + + HV_X64_REGISTER_REG_PAGE = 0x0009001C, +#endif }; /* diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index b645d17cc531..dee3ece28ce5 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -322,4 +322,84 @@ struct mshv_get_set_vp_state { * #define MSHV_ROOT_HVCALL _IOWR(MSHV_IOCTL, 0x07, struct mshv_root_hvcall) */ +/* Structure definitions, macros and IOCTLs for mshv_vtl */ + +#define MSHV_CAP_CORE_API_STABLE 0x0 +#define MSHV_CAP_REGISTER_PAGE 0x1 +#define MSHV_CAP_VTL_RETURN_ACTION 0x2 +#define MSHV_CAP_DR6_SHARED 0x3 +#define MSHV_MAX_RUN_MSG_SIZE 256 + +struct mshv_vp_registers { + __u32 count; /* supports only 1 register at a time */ + __u32 reserved; /* Reserved for alignment or future use */ + __u64 regs_ptr; /* pointer to struct hv_register_assoc */ +}; + +struct mshv_vtl_set_eventfd { + __s32 fd; + __u32 flag; +}; + +struct mshv_vtl_signal_event { + __u32 connection_id; + __u32 flag; +}; + +struct mshv_vtl_sint_post_msg { + __u64 message_type; + __u32 connection_id; + __u32 payload_size; /* Must 
not exceed HV_MESSAGE_PAYLOAD_BYTE_COUNT */ + __u64 payload_ptr; /* pointer to message payload (bytes) */ +}; + +struct mshv_vtl_ram_disposition { + __u64 start_pfn; + __u64 last_pfn; +}; + +struct mshv_vtl_set_poll_file { + __u32 cpu; + __u32 fd; +}; + +struct mshv_vtl_hvcall_setup { + __u64 bitmap_array_size; /* stores number of bytes */ + __u64 allow_bitmap_ptr; +}; + +struct mshv_vtl_hvcall { + __u64 control; /* Hypercall control code */ + __u64 input_size; /* Size of the input data */ + __u64 input_ptr; /* Pointer to the input struct */ + __u64 status; /* Status of the hypercall (output) */ + __u64 output_size; /* Size of the output data */ + __u64 output_ptr; /* Pointer to the output struct */ +}; + +struct mshv_sint_mask { + __u8 mask; + __u8 reserved[7]; +}; + +/* /dev/mshv device IOCTL */ +#define MSHV_CHECK_EXTENSION _IOW(MSHV_IOCTL, 0x00, __u32) + +/* vtl device */ +#define MSHV_CREATE_VTL _IOR(MSHV_IOCTL, 0x1D, char) +#define MSHV_ADD_VTL0_MEMORY _IOW(MSHV_IOCTL, 0x21, struct mshv_vtl_ram_disposition) +#define MSHV_SET_POLL_FILE _IOW(MSHV_IOCTL, 0x25, struct mshv_vtl_set_poll_file) +#define MSHV_RETURN_TO_LOWER_VTL _IO(MSHV_IOCTL, 0x27) +#define MSHV_GET_VP_REGISTERS _IOWR(MSHV_IOCTL, 0x05, struct mshv_vp_registers) +#define MSHV_SET_VP_REGISTERS _IOW(MSHV_IOCTL, 0x06, struct mshv_vp_registers) + +/* VMBus device IOCTLs */ +#define MSHV_SINT_SIGNAL_EVENT _IOW(MSHV_IOCTL, 0x22, struct mshv_vtl_signal_event) +#define MSHV_SINT_POST_MESSAGE _IOW(MSHV_IOCTL, 0x23, struct mshv_vtl_sint_post_msg) +#define MSHV_SINT_SET_EVENTFD _IOW(MSHV_IOCTL, 0x24, struct mshv_vtl_set_eventfd) +#define MSHV_SINT_PAUSE_MESSAGE_STREAM _IOW(MSHV_IOCTL, 0x25, struct mshv_sint_mask) + +/* hv_hvcall device */ +#define MSHV_HVCALL_SETUP _IOW(MSHV_IOCTL, 0x1E, struct mshv_vtl_hvcall_setup) +#define MSHV_HVCALL _IOWR(MSHV_IOCTL, 0x1F, struct mshv_vtl_hvcall) #endif -- cgit v1.2.3 From 9d70ef7a18e0ec1653ac63020a13a5d4dda7cc0d Mon Sep 17 00:00:00 2001 From: Jinank Jain Date: Mon, 24 
Nov 2025 14:25:59 +0000 Subject: mshv: adjust interrupt control structure for ARM64 Interrupt control structure (union hv_interrupt_control) has different fields when it comes to x86 vs ARM64. Bring in the correct structure from HyperV header files and adjust the existing interrupt routing code accordingly. Signed-off-by: Jinank Jain Signed-off-by: Anirudh Rayabharam (Microsoft) Signed-off-by: Wei Liu --- include/hyperv/hvhdk.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/hyperv/hvhdk.h b/include/hyperv/hvhdk.h index 416c0d45b793..469186df7826 100644 --- a/include/hyperv/hvhdk.h +++ b/include/hyperv/hvhdk.h @@ -579,9 +579,15 @@ union hv_interrupt_control { u64 as_uint64; struct { u32 interrupt_type; /* enum hv_interrupt_type */ +#if IS_ENABLED(CONFIG_X86) u32 level_triggered : 1; u32 logical_dest_mode : 1; u32 rsvd : 30; +#elif IS_ENABLED(CONFIG_ARM64) + u32 rsvd1 : 2; + u32 asserted : 1; + u32 rsvd2 : 29; +#endif } __packed; }; -- cgit v1.2.3 From 723c47a221ee407901055c9d9b4434e68c5d650e Mon Sep 17 00:00:00 2001 From: Praveen K Paladugu Date: Fri, 5 Dec 2025 14:17:06 -0600 Subject: mshv: Add definitions for MSHV sleep state configuration Add the definitions required to configure sleep states in mshv hypervisor.
Signed-off-by: Praveen K Paladugu Co-developed-by: Anatol Belski Signed-off-by: Anatol Belski Reviewed-by: Easwar Hariharan Reviewed-by: Nuno Das Neves Acked-by: Stanislav Kinsburskii Signed-off-by: Wei Liu --- include/hyperv/hvgdk_mini.h | 4 +++- include/hyperv/hvhdk_mini.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/hyperv/hvgdk_mini.h b/include/hyperv/hvgdk_mini.h index 1d5ce11be8b6..04b18d0e37af 100644 --- a/include/hyperv/hvgdk_mini.h +++ b/include/hyperv/hvgdk_mini.h @@ -465,19 +465,21 @@ union hv_vp_assist_msr_contents { /* HV_REGISTER_VP_ASSIST_PAGE */ #define HVCALL_RESET_DEBUG_SESSION 0x006b #define HVCALL_MAP_STATS_PAGE 0x006c #define HVCALL_UNMAP_STATS_PAGE 0x006d +#define HVCALL_SET_SYSTEM_PROPERTY 0x006f #define HVCALL_ADD_LOGICAL_PROCESSOR 0x0076 #define HVCALL_GET_SYSTEM_PROPERTY 0x007b #define HVCALL_MAP_DEVICE_INTERRUPT 0x007c #define HVCALL_UNMAP_DEVICE_INTERRUPT 0x007d #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_NOTIFY_PARTITION_EVENT 0x0087 +#define HVCALL_ENTER_SLEEP_STATE 0x0084 #define HVCALL_NOTIFY_PORT_RING_EMPTY 0x008b #define HVCALL_REGISTER_INTERCEPT_RESULT 0x0091 #define HVCALL_ASSERT_VIRTUAL_INTERRUPT 0x0094 #define HVCALL_CREATE_PORT 0x0095 #define HVCALL_CONNECT_PORT 0x0096 #define HVCALL_START_VP 0x0099 -#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a +#define HVCALL_GET_VP_INDEX_FROM_APIC_ID 0x009a #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 #define HVCALL_SIGNAL_EVENT_DIRECT 0x00c0 diff --git a/include/hyperv/hvhdk_mini.h b/include/hyperv/hvhdk_mini.h index f2d7b50de7a4..41a29bf8ec14 100644 --- a/include/hyperv/hvhdk_mini.h +++ b/include/hyperv/hvhdk_mini.h @@ -140,6 +140,7 @@ enum hv_snp_status { enum hv_system_property { /* Add more values when needed */ + HV_SYSTEM_PROPERTY_SLEEP_STATE = 3, HV_SYSTEM_PROPERTY_SCHEDULER_TYPE = 15, 
HV_DYNAMIC_PROCESSOR_FEATURE_PROPERTY = 21, HV_SYSTEM_PROPERTY_CRASHDUMPAREA = 47, @@ -155,6 +156,19 @@ union hv_pfn_range { /* HV_SPA_PAGE_RANGE */ } __packed; }; +enum hv_sleep_state { + HV_SLEEP_STATE_S1 = 1, + HV_SLEEP_STATE_S2 = 2, + HV_SLEEP_STATE_S3 = 3, + HV_SLEEP_STATE_S4 = 4, + HV_SLEEP_STATE_S5 = 5, + /* + * After hypervisor has received this, any follow up sleep + * state registration requests will be rejected. + */ + HV_SLEEP_STATE_LOCK = 6 +}; + enum hv_dynamic_processor_feature_property { /* Add more values when needed */ HV_X64_DYNAMIC_PROCESSOR_FEATURE_MAX_ENCRYPTED_PARTITIONS = 13, @@ -184,6 +198,32 @@ struct hv_output_get_system_property { }; } __packed; +struct hv_sleep_state_info { + u32 sleep_state; /* enum hv_sleep_state */ + u8 pm1a_slp_typ; + u8 pm1b_slp_typ; +} __packed; + +struct hv_input_set_system_property { + u32 property_id; /* enum hv_system_property */ + u32 reserved; + union { + /* More fields to be filled in when needed */ + struct hv_sleep_state_info set_sleep_state_info; + + /* + * Add a reserved field to ensure the union is 8-byte aligned as + * existing members may not be. This is a temporary measure + * until all remaining members are added. + */ + u64 reserved0[8]; + }; +} __packed; + +struct hv_input_enter_sleep_state { /* HV_INPUT_ENTER_SLEEP_STATE */ + u32 sleep_state; /* enum hv_sleep_state */ +} __packed; + struct hv_input_map_stats_page { u32 type; /* enum hv_stats_object_type */ u32 padding; -- cgit v1.2.3 From 90dfeef1cd38dff19f8b3a752d13bfd79f0f7694 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2025 11:43:32 +0100 Subject: seqlock: Cure some more scoped_seqlock() optimization fails Arnd reported an x86 randconfig using gcc-15 tripped over __scoped_seqlock_bug(). Turns out GCC chose not to inline the scoped_seqlock helper functions and as such was not able to optimize properly. [ mingo: Clang fails the build too in some circumstances. 
] Reported-by: Arnd Bergmann Tested-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: Oleg Nesterov Link: https://patch.msgid.link/20251204104332.GG2528459@noisy.programming.kicks-ass.net --- include/linux/seqlock.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index a8a8661839b6..221123660e71 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -1224,7 +1224,7 @@ struct ss_tmp { spinlock_t *lock_irqsave; }; -static inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) +static __always_inline void __scoped_seqlock_cleanup(struct ss_tmp *sst) { if (sst->lock) spin_unlock(sst->lock); @@ -1252,7 +1252,7 @@ static inline void __scoped_seqlock_bug(void) { } extern void __scoped_seqlock_bug(void); #endif -static inline void +static __always_inline void __scoped_seqlock_next(struct ss_tmp *sst, seqlock_t *lock, enum ss_state target) { switch (sst->state) { -- cgit v1.2.3 From 5e5ea7f61610239fca058011e7d4f342b34d1558 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 6 Dec 2025 11:13:50 -0800 Subject: iommu/amd: fix SEV-TIO support reporting Commit eeb934137deb ("iommu/amd: Report SEV-TIO support") was confused about the config options that expose amd_iommu_sev_tio_supported(), and made the declaration (and alternative dummy function) conditional on the CONFIG_AMD_IOMMU config option. But the code is actually dependent on CONFIG_KVM_AMD_SEV, resulting in ERROR: modpost: "amd_iommu_sev_tio_supported" [drivers/crypto/ccp/ccp.ko] undefined! make[2]: *** [scripts/Makefile.modpost:147: Module.symvers] Error 1 if you have the AMD iommu enabled, but don't enable KVM_AMD_SEV support. Fix it by moving the declaration into the right #ifdef section in the header file. 
Fixes: eeb934137deb ("iommu/amd: Report SEV-TIO support") Cc: Alexey Kardashevskiy Cc: Joerg Roedel Cc: Vasant Hegde Cc: Tom Lendacky Cc: Dan Williams Signed-off-by: Linus Torvalds --- include/linux/amd-iommu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/amd-iommu.h b/include/linux/amd-iommu.h index 0f64f09d1f34..edcee9f5335a 100644 --- a/include/linux/amd-iommu.h +++ b/include/linux/amd-iommu.h @@ -18,12 +18,10 @@ struct task_struct; struct pci_dev; extern void amd_iommu_detect(void); -extern bool amd_iommu_sev_tio_supported(void); #else /* CONFIG_AMD_IOMMU */ static inline void amd_iommu_detect(void) { } -static inline bool amd_iommu_sev_tio_supported(void) { return false; } #endif /* CONFIG_AMD_IOMMU */ @@ -72,8 +70,10 @@ struct amd_iommu *get_amd_iommu(unsigned int idx); #ifdef CONFIG_KVM_AMD_SEV int amd_iommu_snp_disable(void); +extern bool amd_iommu_sev_tio_supported(void); #else static inline int amd_iommu_snp_disable(void) { return 0; } +static inline bool amd_iommu_sev_tio_supported(void) { return false; } #endif #endif /* _ASM_X86_AMD_IOMMU_H */ -- cgit v1.2.3 From a4f2fa516e83f11c3792405599613c12efe6135e Mon Sep 17 00:00:00 2001 From: Joakim Zhang Date: Fri, 5 Dec 2025 23:46:20 +0800 Subject: ALSA: hda/core: add addr_offset field for bus address translation Add bus addr_offset field for dma address translation, for some SoCs such as CIX SKY1 which is ARM64 Arch, HOST and HDAC has different memory view, so need to do dma address translation between HOST and HDAC. 
Signed-off-by: Joakim Zhang Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20251205154621.3019640-3-joakim.zhang@cixtech.com --- include/sound/hdaudio.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h index 4e0c1d8af09f..f11bfc6b9f42 100644 --- a/include/sound/hdaudio.h +++ b/include/sound/hdaudio.h @@ -380,6 +380,9 @@ struct hdac_bus { /* factor used to derive STRIPE control value */ unsigned int sdo_limit; + + /* address offset between host and hadc */ + dma_addr_t addr_offset; }; int snd_hdac_bus_init(struct hdac_bus *bus, struct device *dev, -- cgit v1.2.3 From 455a65260f526cedd4680d4836ebdf2eaf1ab4c6 Mon Sep 17 00:00:00 2001 From: Tobias Schumacher Date: Thu, 4 Dec 2025 06:05:01 +0100 Subject: genirq: Change hwirq parameter to irq_hw_number_t The irqdomain implementation internally represents hardware IRQs as irq_hw_number_t, which is defined as unsigned long int. When providing an irq_hw_number_t to the generic_handle_domain() functions that expect an unsigned int hwirq, this can lead to a loss of information. Change the hwirq parameter to irq_hw_number_t to support the full range of hwirqs. Reviewed-by: Thomas Gleixner Reviewed-by: Niklas Schnelle Reviewed-by: Farhan Ali Signed-off-by: Tobias Schumacher Signed-off-by: Heiko Carstens --- include/linux/irqdesc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 37e0b5b5600a..17902861de76 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -182,9 +182,9 @@ int generic_handle_irq_safe(unsigned int irq); * and handle the result interrupt number. Return -EINVAL if * conversion failed.
*/ -int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); -int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq); -int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); +int generic_handle_domain_irq(struct irq_domain *domain, irq_hw_number_t hwirq); +int generic_handle_domain_irq_safe(struct irq_domain *domain, irq_hw_number_t hwirq); +int generic_handle_domain_nmi(struct irq_domain *domain, irq_hw_number_t hwirq); #endif /* Test to see if a driver has successfully requested an irq */ -- cgit v1.2.3 From 0f35040de59371ad542b915d7b91176c9910dadc Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Mon, 8 Dec 2025 00:41:47 +0900 Subject: mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab caches when a cache is destroyed. This is unnecessary; only the RCU sheaves belonging to the cache being destroyed need to be flushed. As suggested by Vlastimil Babka, introduce a weaker form of kvfree_rcu_barrier() that operates on a specific slab cache. Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache(). Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on cache destruction. The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen 5900X machine (1 socket), by loading slub_kunit module. Before: Total calls: 19 Average latency (us): 18127 Total time (us): 344414 After: Total calls: 19 Average latency (us): 10066 Total time (us): 191264 Two performance regression have been reported: - stress module loader test's runtime increases by 50-60% (Daniel) - internal graphics test's runtime on Tegra234 increases by 35% (Jon) They are fixed by this change. 
Suggested-by: Vlastimil Babka Fixes: ec66e0d59952 ("slab: add sheaf support for batching kfree_rcu() operations") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz Reported-and-tested-by: Daniel Gomez Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org Reported-and-tested-by: Jon Hunter Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20251207154148.117723-1-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index cf443f064a66..2482992248dc 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void) rcu_barrier(); } +static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) +{ + rcu_barrier(); +} + static inline void kfree_rcu_scheduler_running(void) { } #else void kvfree_rcu_barrier(void); +void kvfree_rcu_barrier_on_cache(struct kmem_cache *s); + void kfree_rcu_scheduler_running(void); #endif -- cgit v1.2.3 From 18223eececd66365c12275f09042e6fcb2ac5748 Mon Sep 17 00:00:00 2001 From: Srinivas Kandagatla Date: Fri, 12 Sep 2025 09:32:19 +0100 Subject: of: base: Add of_property_read_u8_index Add support for of_property_read_u8_index(), similar to the other u16 and u32 variants. Having this helper makes the code more tidy in some cases, especially when we are parsing multiple of these into data structures.
Signed-off-by: Srinivas Kandagatla Reviewed-by: Rob Herring (Arm) Tested-by: Alexey Klimov # sm8550 Link: https://patch.msgid.link/20250912083225.228778-2-srinivas.kandagatla@oss.qualcomm.com Signed-off-by: Vinod Koul --- include/linux/of.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/of.h b/include/linux/of.h index 121a288ca92d..57fb598b72d3 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -316,6 +316,9 @@ extern struct property *of_find_property(const struct device_node *np, extern bool of_property_read_bool(const struct device_node *np, const char *propname); extern int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size); +extern int of_property_read_u8_index(const struct device_node *np, + const char *propname, + u32 index, u8 *out_value); extern int of_property_read_u16_index(const struct device_node *np, const char *propname, u32 index, u16 *out_value); @@ -646,6 +649,12 @@ static inline int of_property_count_elems_of_size(const struct device_node *np, return -ENOSYS; } +static inline int of_property_read_u8_index(const struct device_node *np, + const char *propname, u32 index, u8 *out_value) +{ + return -ENOSYS; +} + static inline int of_property_read_u16_index(const struct device_node *np, const char *propname, u32 index, u16 *out_value) { -- cgit v1.2.3 From 167efc6dfd621494c6a7e47115dc829dcc0e502c Mon Sep 17 00:00:00 2001 From: Bard Liao Date: Tue, 14 Oct 2025 11:14:48 +0800 Subject: ASoC: SOF: Intel: export hda_sdw_bpt_get_buf_size_aligment The dma buffer need to be a multiple of data block size and the fifo size. Export a function to return the LCM of data block size and the fifo size. 
Signed-off-by: Bard Liao Reviewed-by: Ranjani Sridharan Link: https://patch.msgid.link/20251014031450.3781789-6-yung-chuan.liao@linux.intel.com Signed-off-by: Vinod Koul --- include/sound/hda-sdw-bpt.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include') diff --git a/include/sound/hda-sdw-bpt.h b/include/sound/hda-sdw-bpt.h index f649549b75d5..9b654c31829a 100644 --- a/include/sound/hda-sdw-bpt.h +++ b/include/sound/hda-sdw-bpt.h @@ -30,6 +30,8 @@ int hda_sdw_bpt_wait(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream *bpt_tx_stream, struct snd_dma_buffer *dmab_tx_bdl, struct hdac_ext_stream *bpt_rx_stream, struct snd_dma_buffer *dmab_rx_bdl); + +unsigned int hda_sdw_bpt_get_buf_size_alignment(unsigned int dma_bandwidth); #else static inline int hda_sdw_bpt_open(struct device *dev, int link_id, struct hdac_ext_stream **bpt_tx_stream, @@ -64,6 +66,11 @@ static inline int hda_sdw_bpt_close(struct device *dev, struct hdac_ext_stream * WARN_ONCE(1, "SoundWire BPT is disabled"); return -EOPNOTSUPP; } + +static inline unsigned int hda_sdw_bpt_get_buf_size_alignment(unsigned int dma_bandwidth) +{ + return 0; +} #endif #endif /* __HDA_SDW_BPT_H */ -- cgit v1.2.3 From 31b931bebd11a0f00967114f62c8c38952f483e5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Sun, 7 Dec 2025 19:47:56 +0100 Subject: dma-mapping: Fix DMA_BIT_MASK() macro being broken After commit a50f7456f853 ("dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope"), the DMA_BIT_MASK() macro is broken when passed non trivial statements for the value of 'n'. This is caused by the new version missing parenthesis around 'n' when evaluating 'n'. One example of this breakage is the IPU6 driver now crashing due to it getting DMA-addresses with address bit 32 set even though it has tried to set a 32 bit DMA mask. 
The IPU6 CSI2 engine has a DMA mask of either 31 or 32 bits depending on if it is in secure mode or not and it sets this mask like this: mmu_info->aperture_end = (dma_addr_t)DMA_BIT_MASK(isp->secure_mode ? IPU6_MMU_ADDR_BITS : IPU6_MMU_ADDR_BITS_NON_SECURE); So the 'n' argument here is "isp->secure_mode ? IPU6_MMU_ADDR_BITS : IPU6_MMU_ADDR_BITS_NON_SECURE" which gets expanded into: isp->secure_mode ? IPU6_MMU_ADDR_BITS : IPU6_MMU_ADDR_BITS_NON_SECURE - 1 With the -1 only being applied in the non secure case, causing the secure mode mask to be one bit too large. Fixes: a50f7456f853 ("dma-mapping: Allow use of DMA_BIT_MASK(64) in global scope") Cc: Sakari Ailus Cc: James Clark Cc: Nathan Chancellor Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Reviewed-by: Nathan Chancellor Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20251207184756.97904-1-johannes.goede@oss.qualcomm.com --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 2ceda49c609f..aa36a0d1d9df 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -90,7 +90,7 @@ */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) -#define DMA_BIT_MASK(n) GENMASK_ULL(n - 1, 0) +#define DMA_BIT_MASK(n) GENMASK_ULL((n) - 1, 0) struct dma_iova_state { dma_addr_t addr; -- cgit v1.2.3 From 9a97857db0c5655b8932f86b5d18bb959079b0ee Mon Sep 17 00:00:00 2001 From: Andres J Rosa Date: Wed, 3 Dec 2025 10:25:01 -0600 Subject: ALSA: uapi: Fix typo in asound.h comment Fix 'level-shit' to 'level-shift' in struct snd_cea_861_aud_if comment.
Fixes: 7ba1c40b536e ("ALSA: Add definitions for CEA-861 Audio InfoFrames") Signed-off-by: Andres J Rosa Link: https://patch.msgid.link/20251203162509.1822-1-andyrosa@gmail.com Signed-off-by: Takashi Iwai --- include/uapi/sound/asound.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index 5a049eeaecce..d3ce75ba938a 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -60,7 +60,7 @@ struct snd_cea_861_aud_if { unsigned char db2_sf_ss; /* sample frequency and size */ unsigned char db3; /* not used, all zeros */ unsigned char db4_ca; /* channel allocation code */ - unsigned char db5_dminh_lsv; /* downmix inhibit & level-shit values */ + unsigned char db5_dminh_lsv; /* downmix inhibit & level-shift values */ }; /**************************************************************************** -- cgit v1.2.3 From 2fb6915fa22dc5524d704afba58a13305dd9f533 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 18 Jul 2025 11:35:00 -0700 Subject: compiler_types.h: add "auto" as a macro for "__auto_type" "auto" was defined as a keyword back in the K&R days, but as a storage type specifier. No one ever used it, since it was and is the default storage type for local variables. C++11 recycled the keyword to allow a type to be declared based on the type of an initializer. This was finally adopted into standard C in C23. gcc and clang provide the "__auto_type" alias keyword as an extension for pre-C23, however, there is no reason to pollute the bulk of the source base with this temporary keyword; instead define "auto" as a macro unless the compiler is running in C23+ mode. This macro is added in <linux/compiler_types.h> because that header is included in some of the tools headers, whereas <linux/compiler.h> is not as it has a bunch of very kernel-specific things in it. [ Cc: stable to reduce potential backporting burden. ] Signed-off-by: H.
Peter Anvin (Intel) Acked-by: Miguel Ojeda Cc: --- include/linux/compiler_types.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 3eac51d68426..41172a28ce76 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -13,6 +13,19 @@ #ifndef __ASSEMBLY__ +/* + * C23 introduces "auto" as a standard way to define type-inferred + * variables, but "auto" has been a (useless) keyword even since K&R C, + * so it has always been "namespace reserved." + * + * Until at some future time we require C23 support, we need the gcc + * extension __auto_type, but there is no reason to put that elsewhere + * in the source code. + */ +#if __STDC_VERSION__ < 202311L +# define auto __auto_type +#endif + /* * Skipped when running bindgen due to a libclang issue; * see https://github.com/rust-lang/rust-bindgen/issues/2244. -- cgit v1.2.3 From b3b8767c290102a8d95b9d12585cc1e03381ce3f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 19 Jul 2025 23:36:58 -0700 Subject: include/linux: change "__auto_type" to "auto" Replace instances of "__auto_type" with "auto" in include/linux. Signed-off-by: H. 
Peter Anvin (Intel) --- include/linux/cleanup.h | 6 +++--- include/linux/compiler.h | 2 +- include/linux/minmax.h | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/cleanup.h b/include/linux/cleanup.h index 0b55a8f6c59e..8d41b917c77d 100644 --- a/include/linux/cleanup.h +++ b/include/linux/cleanup.h @@ -212,10 +212,10 @@ #define __free(_name) __cleanup(__free_##_name) -#define __get_and_null(p, nullvalue) \ +#define __get_and_null(p, nullvalue) \ ({ \ - __auto_type __ptr = &(p); \ - __auto_type __val = *__ptr; \ + auto __ptr = &(p); \ + auto __val = *__ptr; \ *__ptr = nullvalue; \ __val; \ }) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ff71bebe56f5..04487c9bd751 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -190,7 +190,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #define data_race(expr) \ ({ \ __kcsan_disable_current(); \ - __auto_type __v = (expr); \ + auto __v = (expr); \ __kcsan_enable_current(); \ __v; \ }) diff --git a/include/linux/minmax.h b/include/linux/minmax.h index eaaf5c008e4d..a0158db54a04 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -89,7 +89,7 @@ __cmp_once_unique(op, type, x, y, __UNIQUE_ID(x_), __UNIQUE_ID(y_)) #define __careful_cmp_once(op, x, y, ux, uy) ({ \ - __auto_type ux = (x); __auto_type uy = (y); \ + auto ux = (x); auto uy = (y); \ BUILD_BUG_ON_MSG(!__types_ok(ux, uy), \ #op"("#x", "#y") signedness error"); \ __cmp(op, ux, uy); }) @@ -129,7 +129,7 @@ __careful_cmp(max, (x) + 0u + 0ul + 0ull, (y) + 0u + 0ul + 0ull) #define __careful_op3(op, x, y, z, ux, uy, uz) ({ \ - __auto_type ux = (x); __auto_type uy = (y);__auto_type uz = (z);\ + auto ux = (x); auto uy = (y); auto uz = (z); \ BUILD_BUG_ON_MSG(!__types_ok3(ux, uy, uz), \ #op"3("#x", "#y", "#z") signedness error"); \ __cmp(op, ux, __cmp(op, uy, uz)); }) @@ -203,7 +203,7 @@ * This macro checks @val/@lo/@hi to make sure they have 
compatible * signedness. */ -#define clamp(val, lo, hi) __careful_clamp(__auto_type, val, lo, hi) +#define clamp(val, lo, hi) __careful_clamp(auto, val, lo, hi) /** * clamp_t - return a value clamped to a given range using a given type -- cgit v1.2.3 From 1cba2eba9b73d8dfee6b3e7465f510cace71637c Mon Sep 17 00:00:00 2001 From: Jinhui Guo Date: Thu, 27 Nov 2025 17:25:12 +0800 Subject: mm/sparse: fix sparse_vmemmap_init_nid_early definition without CONFIG_SPARSEMEM When CONFIG_SPARSEMEM is disabled, the macro sparse_vmemmap_init_nid_early(_nid, _use) passes two arguments, while the actual function accepts only nid. Drop the extra argument _use. Link: https://lkml.kernel.org/r/20251127092512.278-1-guojinhui.liam@bytedance.com Fixes: d65917c42373 ("mm/sparse: allow for alternate vmemmap section init at boot") Signed-off-by: Jinhui Guo Cc: Frank van der Linden Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: "David Hildenbrand (Red Hat)" Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4398e027f450..75ef7c9f9307 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -2289,7 +2289,7 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) -#define sparse_vmemmap_init_nid_early(_nid, _use) do {} while (0) +#define sparse_vmemmap_init_nid_early(_nid) do {} while (0) #define sparse_vmemmap_init_nid_late(_nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) -- cgit v1.2.3 From bdd0d69a32c2aa6437d23e35acc705758b835a75 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 26 Nov 2025 16:06:15 -0500 Subject: mm/huge_memory: change folio_split_supported() to folio_check_splittable() Patch series "Improve folio split 
related functions", v4. This patchset improves several folio split related functions to avoid future misuse. The changes are: 1. Consolidated folio splittable checks by moving truncated folio check, huge zero folio check, and writeback folio check into folio_split_supported(). Changed the function return type. Renamed it to folio_check_splittable() for clarification. 2. Replaced can_split_folio() with open coded folio_expected_ref_count() and folio_ref_count() and introduced folio_cache_ref_count(). 3. Changed min_order_for_split() to always return an order. 4. Fixed folio split stats counting. Motivation ========== This is based on Wei's observation[1] and solves several potential issues: 1. Dereferencing NULL folio->mapping in try_folio_split_to_order() if it is called on truncated folios. 2. Not handling of negative return value of min_order_for_split() in mm/memory-failure.c There is no bug in the current code. This patch (of 4): folio_split_supported() used in try_folio_split_to_order() requires folio->mapping to be non NULL, but current try_folio_split_to_order() does not check it. There is no issue in the current code, since try_folio_split_to_order() is only used in truncate_inode_partial_folio(), where folio->mapping is not NULL. To prevent future misuse, move folio->mapping NULL check (i.e., folio is truncated) into folio_split_supported(). Since folio->mapping NULL check returns -EBUSY and folio_split_supported() == false means -EINVAL, change folio_split_supported() return type from bool to int and return error numbers accordingly. Rename folio_split_supported() to folio_check_splittable() to match the return type change. While at it, move is_huge_zero_folio() check and folio_test_writeback() check into folio_check_splittable() and add kernel-doc. Remove all warnings inside folio_check_splittable() and give warnings in __folio_split() instead, so that bool warns parameter can be removed. 
Link: https://lkml.kernel.org/r/20251126210618.1971206-1-ziy@nvidia.com Link: https://lkml.kernel.org/r/20251126210618.1971206-2-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Wei Yang Acked-by: Balbir Singh Acked-by: David Hildenbrand (Red Hat) Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1d439de1ca2c..66105a90b4c3 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -375,8 +375,8 @@ int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list int folio_split_unmapped(struct folio *folio, unsigned int new_order); int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); -bool folio_split_supported(struct folio *folio, unsigned int new_order, - enum split_type split_type, bool warns); +int folio_check_splittable(struct folio *folio, unsigned int new_order, + enum split_type split_type); int folio_split(struct folio *folio, unsigned int new_order, struct page *page, struct list_head *list); @@ -407,7 +407,7 @@ static inline int split_huge_page_to_order(struct page *page, unsigned int new_o static inline int try_folio_split_to_order(struct folio *folio, struct page *page, unsigned int new_order) { - if (!folio_split_supported(folio, new_order, SPLIT_TYPE_NON_UNIFORM, /* warns= */ false)) + if (folio_check_splittable(folio, new_order, SPLIT_TYPE_NON_UNIFORM)) return split_huge_page_to_order(&folio->page, new_order); return folio_split(folio, new_order, page, NULL); } -- cgit v1.2.3 From 5842bcbfc316738cbfcbdb4def5a7592aa03ebf2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 26 Nov 2025 16:06:16 -0500 Subject: mm/huge_memory: replace can_split_folio() 
with direct refcount calculation can_split_folio() is just a refcount comparison, making sure only the split caller holds an extra pin. Open code it with folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins used by folio_ref_freeze(), add folio_cache_ref_count() to calculate it. Also replace folio_expected_ref_count() with folio_cache_ref_count() used by folio_ref_unfreeze(), since they are returning the same values when a folio is frozen and folio_cache_ref_count() does not have unnecessary folio_mapcount() in its implementation. Link: https://lkml.kernel.org/r/20251126210618.1971206-3-ziy@nvidia.com Signed-off-by: Zi Yan Suggested-by: David Hildenbrand (Red Hat) Reviewed-by: Wei Yang Acked-by: David Hildenbrand (Red Hat) Cc: Balbir Singh Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 66105a90b4c3..8a52e20387b0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -369,7 +369,6 @@ enum split_type { SPLIT_TYPE_NON_UNIFORM, }; -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins); int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int folio_split_unmapped(struct folio *folio, unsigned int new_order); -- cgit v1.2.3 From 2f78910659c72807b7ff03a2c0d121901bf55848 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 26 Nov 2025 16:06:17 -0500 Subject: mm/huge_memory: make min_order_for_split() always return an order min_order_for_split() returns -EBUSY when the folio is truncated and cannot be split. 
In commit 77008e1b2ef7 ("mm/huge_memory: do not change split_huge_page*() target order silently"), memory_failure() does not handle it and pass -EBUSY to try_to_split_thp_page() directly. try_to_split_thp_page() returns -EINVAL since -EBUSY becomes 0xfffffff0 as new_order is unsigned int in __folio_split() and this large new_order is rejected as an invalid input. The code does not cause a bug. soft_offline_in_use_page() also uses min_order_for_split() but it always passes 0 as new_order for split. Fix it by making min_order_for_split() always return an order. When the given folio is truncated, namely folio->mapping == NULL, return 0 and let a subsequent split function handle the situation and return -EBUSY. Add kernel-doc to min_order_for_split() to clarify its use. Link: https://lkml.kernel.org/r/20251126210618.1971206-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Wei Yang Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Cc: Balbir Singh Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Nico Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8a52e20387b0..21162493a0a0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -372,7 +372,7 @@ enum split_type { int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order); int folio_split_unmapped(struct folio *folio, unsigned int new_order); -int min_order_for_split(struct folio *folio); +unsigned int min_order_for_split(struct folio *folio); int split_folio_to_list(struct folio *folio, struct list_head *list); int folio_check_splittable(struct folio *folio, unsigned int new_order, enum split_type split_type); @@ -630,10 +630,10 @@ static inline int split_huge_page(struct page *page) return 
-EINVAL; } -static inline int min_order_for_split(struct folio *folio) +static inline unsigned int min_order_for_split(struct folio *folio) { VM_WARN_ON_ONCE_FOLIO(1, folio); - return -EINVAL; + return 0; } static inline int split_folio_to_list(struct folio *folio, struct list_head *list) -- cgit v1.2.3 From 40a4af52e0472dfc114aa78d6f3debec70b42048 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 1 Dec 2025 13:29:22 +0100 Subject: mm: fix CONFIG_STACK_GROWSUP typo in mm.h Commit 2b6a3f061f11 ("mm: declare VMA flags by bit") significantly refactors the header file include/linux/mm.h. In that step, it introduces a typo in an ifdef, referring to a non-existing config option STACK_GROWS_UP, whereas the actual config option is called STACK_GROWSUP. Fix this typo in the mm header file. Link: https://lkml.kernel.org/r/20251201122922.352480-1-lukas.bulwahn@redhat.com Fixes: 2b6a3f061f11 ("mm: declare VMA flags by bit") Signed-off-by: Lukas Bulwahn Acked-by: David Hildenbrand (Red Hat) Reviewed-by: Lorenzo Stoakes Cc: Alice Ryhl Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2887d3b34d3e..03f7f92d08c8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -438,7 +438,7 @@ enum { #define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) #define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) #define VM_STACK INIT_VM_FLAG(STACK) -#ifdef CONFIG_STACK_GROWS_UP +#ifdef CONFIG_STACK_GROWSUP #define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) #else #define VM_STACK_EARLY VM_NONE -- cgit v1.2.3 From 12eef14bcbac77bd08dc5693ad5818e69993246f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Dec 2025 09:18:57 +0100 Subject: lockref: add a __cond_lock annotation for lockref_put_or_lock Add a cond_lock annotation for lockref_put_or_lock to make sparse happy 
with using it. Note that for this the return value has to be double-inverted as the return value convention of lockref_put_or_lock is inverted compared to _trylock conventions expected by __cond_lock, as lockref_put_or_lock returns true when it did not need to take the lock. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- include/linux/lockref.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 676721ee878d..815d871fadfc 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -50,6 +50,8 @@ void lockref_get(struct lockref *lockref); int lockref_put_return(struct lockref *lockref); bool lockref_get_not_zero(struct lockref *lockref); bool lockref_put_or_lock(struct lockref *lockref); +#define lockref_put_or_lock(_lockref) \ + (!__cond_lock((_lockref)->lock, !lockref_put_or_lock(_lockref))) void lockref_mark_dead(struct lockref *lockref); bool lockref_get_not_dead(struct lockref *lockref); -- cgit v1.2.3 From 55026a9670ce8b7b3d74f7d570de1382cbfb395d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 2 Dec 2025 21:23:27 +0100 Subject: irqdomain: Delete irq_domain_add_tree() No in-tree users anymore. 
[ tglx: Remove the reference in the Chinese documentation as well ] Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20251202202327.1444693-1-andriy.shevchenko@linux.intel.com --- include/linux/irqdomain.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 952d3c8dd6b7..62f81bbeb490 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -730,22 +730,6 @@ static inline void msi_device_domain_free_wired(struct irq_domain *domain, unsig } #endif -static inline struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - const struct irq_domain_ops *ops, - void *host_data) -{ - struct irq_domain_info info = { - .fwnode = of_fwnode_handle(of_node), - .hwirq_max = ~0U, - .ops = ops, - .host_data = host_data, - }; - struct irq_domain *d; - - d = irq_domain_instantiate(&info); - return IS_ERR(d) ? NULL : d; -} - static inline struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, const struct irq_domain_ops *ops, -- cgit v1.2.3 From d927a595ab2f6de4e10b3e3962bc70ab61d8f907 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 25 Sep 2025 12:45:12 +0200 Subject: ceph: add trace points to the MDS client This patch adds trace points to the Ceph filesystem MDS client: - request submission (CEPH_MSG_CLIENT_REQUEST) and completion (CEPH_MSG_CLIENT_REPLY) - capabilities (CEPH_MSG_CLIENT_CAPS) These are the central pieces that are useful for analyzing MDS latency/performance problems from the client's perspective. In the long run, all doutc() calls should be replaced with tracepoints. This way, the Ceph filesystem can be traced at any time (without spamming the kernel log). Additionally, trace points can be used in BPF programs (which can even dereference the pointer parameters and extract more values).
Signed-off-by: Max Kellermann Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/trace/events/ceph.h | 234 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 include/trace/events/ceph.h (limited to 'include') diff --git a/include/trace/events/ceph.h b/include/trace/events/ceph.h new file mode 100644 index 000000000000..08cb0659fbfc --- /dev/null +++ b/include/trace/events/ceph.h @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Ceph filesystem support module tracepoints + * + * Copyright (C) 2025 IONOS SE. All Rights Reserved. + * Written by Max Kellermann (max.kellermann@ionos.com) + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ceph + +#if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CEPH_H + +#include <linux/tracepoint.h> + +#define ceph_mdsc_suspend_reasons \ + EM(ceph_mdsc_suspend_reason_no_mdsmap, "no-mdsmap") \ + EM(ceph_mdsc_suspend_reason_no_active_mds, "no-active-mds") \ + EM(ceph_mdsc_suspend_reason_rejected, "rejected") \ + E_(ceph_mdsc_suspend_reason_session, "session") + +#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY +#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY + +#undef EM +#undef E_ +#define EM(a, b) a, +#define E_(a, b) a + +enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte); + +#endif + +/* + * Export enum symbols via userspace. + */ +#undef EM +#undef E_ +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define E_(a, b) TRACE_DEFINE_ENUM(a); + +ceph_mdsc_suspend_reasons; + +/* + * Now redefine the EM() and E_() macros to map the enums to the strings that + * will be printed in the output.
+ */ +#undef EM +#undef E_ +#define EM(a, b) { a, b }, +#define E_(a, b) { a, b } + +TRACE_EVENT(ceph_mdsc_submit_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req), + + TP_ARGS(mdsc, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(u64, ino) + __field(u64, snap) + ), + + TP_fast_assign( + struct inode *inode; + + __entry->tid = req->r_tid; + __entry->op = req->r_op; + + inode = req->r_inode; + if (inode == NULL && req->r_dentry) + inode = d_inode(req->r_dentry); + + if (inode) { + __entry->ino = ceph_ino(inode); + __entry->snap = ceph_snap(inode); + } else { + __entry->ino = __entry->snap = 0; + } + ), + + TP_printk("R=%llu op=%s ino=%llx,%llx", + __entry->tid, + ceph_mds_op_name(__entry->op), + __entry->ino, __entry->snap) +); + +TRACE_EVENT(ceph_mdsc_suspend_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_mds_request *req, + enum ceph_mdsc_suspend_reason reason), + + TP_ARGS(mdsc, session, req, reason), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(int, mds) + __field(enum ceph_mdsc_suspend_reason, reason) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + __entry->mds = session ? 
session->s_mds : -1; + __entry->reason = reason; + ), + + TP_printk("R=%llu op=%s reason=%s", + __entry->tid, + ceph_mds_op_name(__entry->op), + __print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons)) +); + +TRACE_EVENT(ceph_mdsc_resume_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req), + + TP_ARGS(mdsc, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + ), + + TP_printk("R=%llu op=%s", + __entry->tid, + ceph_mds_op_name(__entry->op)) +); + +TRACE_EVENT(ceph_mdsc_send_request, + TP_PROTO(struct ceph_mds_session *session, + struct ceph_mds_request *req), + + TP_ARGS(session, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(int, mds) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + __entry->mds = session->s_mds; + ), + + TP_printk("R=%llu op=%s mds=%d", + __entry->tid, + ceph_mds_op_name(__entry->op), + __entry->mds) +); + +TRACE_EVENT(ceph_mdsc_complete_request, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_request *req), + + TP_ARGS(mdsc, req), + + TP_STRUCT__entry( + __field(u64, tid) + __field(int, op) + __field(int, err) + __field(unsigned long, latency_ns) + ), + + TP_fast_assign( + __entry->tid = req->r_tid; + __entry->op = req->r_op; + __entry->err = req->r_err; + __entry->latency_ns = req->r_end_latency - req->r_start_latency; + ), + + TP_printk("R=%llu op=%s err=%d latency_ns=%lu", + __entry->tid, + ceph_mds_op_name(__entry->op), + __entry->err, + __entry->latency_ns) +); + +TRACE_EVENT(ceph_handle_caps, + TP_PROTO(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + int op, + const struct ceph_vino *vino, + struct ceph_inode_info *inode, + u32 seq, u32 mseq, u32 issue_seq), + + TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq), + + TP_STRUCT__entry( + __field(int, mds) + __field(int, op) + __field(u64, ino) + __field(u64, 
snap) + __field(u32, seq) + __field(u32, mseq) + __field(u32, issue_seq) + ), + + TP_fast_assign( + __entry->mds = session->s_mds; + __entry->op = op; + __entry->ino = vino->ino; + __entry->snap = vino->snap; + __entry->seq = seq; + __entry->mseq = mseq; + __entry->issue_seq = issue_seq; + ), + + TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u", + __entry->mds, + ceph_cap_op_name(__entry->op), + __entry->ino, + __entry->snap, + __entry->seq, + __entry->issue_seq, + __entry->mseq) +); + +#undef EM +#undef E_ +#endif /* _TRACE_CEPH_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> -- cgit v1.2.3 From e6b4d264c8c883d8451c7b5f20cd96ddf94af3ef Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 1 Dec 2025 21:10:18 +0100 Subject: args: fix documentation to reflect the correct numbers The macro uses up to 15 arguments. Reflect this in the top level comment. Link: https://lkml.kernel.org/r/20251201201018.765475-1-andriy.shevchenko@linux.intel.com Fixes: d51e783c17ba ("lsm: count the LSMs enabled at compile time") Signed-off-by: Andy Shevchenko Reviewed-by: Randy Dunlap Signed-off-by: Andrew Morton --- include/linux/args.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/args.h b/include/linux/args.h index 2e8e65d975c7..0562dc51435e 100644 --- a/include/linux/args.h +++ b/include/linux/args.h @@ -6,9 +6,9 @@ /* * How do these macros work? * - * In __COUNT_ARGS() _0 to _12 are just placeholders from the start + * In __COUNT_ARGS() _0 to _15 are just placeholders from the start * in order to make sure _n is positioned over the correct number - * from 12 to 0 (depending on X, which is a variadic argument list). + * from 15 to 0 (depending on X, which is a variadic argument list). * They serve no purpose other than occupying a position. Since each * macro parameter must have a distinct identifier, those identifiers * are as good as any.
-- cgit v1.2.3 From bdae29d6512ddc589200b9ae6bda467bdbab863d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Dec 2025 10:07:53 +0000 Subject: rseq: Always inline rseq_debug_syscall_return() To get the full benefit of: eaa9088d568c ("rseq: Use static branch for syscall exit debug when GENERIC_IRQ_ENTRY=y") clang needs an __always_inline instead of a plain inline qualifier: $ for i in {1..10}; do taskset -c 4 perf5 bench syscall basic -l 100000000 | grep "ops/sec"; done Before After ops/sec 15424491 15872221 +2.9% Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Mathieu Desnoyers Link: https://patch.msgid.link/20251205100753.4073221-1-edumazet@google.com --- include/linux/rseq_entry.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index c92167ff8a7f..a36b472627de 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -596,7 +596,7 @@ static __always_inline void rseq_exit_to_user_mode_legacy(void) void __rseq_debug_syscall_return(struct pt_regs *regs); -static inline void rseq_debug_syscall_return(struct pt_regs *regs) +static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs) { if (static_branch_unlikely(&rseq_debug_enabled)) __rseq_debug_syscall_return(regs); -- cgit v1.2.3 From 41b80d43d9a00a302b5559baa7ebafc28dd54793 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 3 Dec 2025 15:45:51 -0500 Subject: i3c: master: cleanup callback .priv_xfers() Remove the .priv_xfers() callback from the framework after all master controller drivers have switched to use the new .i3c_xfers() callback. 
Signed-off-by: Frank Li Tested-by: Tommaso Merciai Link: https://patch.msgid.link/20251203-i3c_xfer_cleanup_master-v2-2-7dd94d04ee2d@nxp.com Signed-off-by: Alexandre Belloni --- include/linux/i3c/master.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/i3c/master.h b/include/linux/i3c/master.h index 2fd850f4678b..58d01ed4cce7 100644 --- a/include/linux/i3c/master.h +++ b/include/linux/i3c/master.h @@ -417,12 +417,8 @@ struct i3c_bus { * all CCC commands are supported. * @send_ccc_cmd: send a CCC command * This method is mandatory. - * @priv_xfers: do one or several private I3C SDR transfers - * This method is mandatory when i3c_xfers is not implemented. It - * is deprecated. - * @i3c_xfers: do one or several I3C SDR or HDR transfers - * This method is mandatory when priv_xfers is not implemented but - * should be implemented instead of priv_xfers. + * @i3c_xfers: do one or several I3C SDR or HDR transfers. + * This method is mandatory. * @attach_i2c_dev: called every time an I2C device is attached to the bus. * This is a good place to attach master controller specific * data to I2C devices. @@ -478,10 +474,6 @@ struct i3c_master_controller_ops { const struct i3c_ccc_cmd *cmd); int (*send_ccc_cmd)(struct i3c_master_controller *master, struct i3c_ccc_cmd *cmd); - /* Deprecated, please use i3c_xfers() */ - int (*priv_xfers)(struct i3c_dev_desc *dev, - struct i3c_priv_xfer *xfers, - int nxfers); int (*i3c_xfers)(struct i3c_dev_desc *dev, struct i3c_xfer *xfers, int nxfers, enum i3c_xfer_mode mode); -- cgit v1.2.3 From d2ea4d254d04a89e17504af0230c7268e3cac6bf Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 13 Dec 2025 08:45:23 +0100 Subject: file: ensure cleanup Brown paper bag time. This is a silly oversight where I missed to drop the error condition checking to ensure we clean up on early error returns. 
I have an internal unit testset coming up for this which will catch all such issues going forward. Reported-by: Chris Mason Reported-by: Jeff Layton Fixes: 011703a9acd7 ("file: add FD_{ADD,PREPARE}()") Signed-off-by: Christian Brauner Reviewed-by: Jeff Layton Signed-off-by: Linus Torvalds --- include/linux/file.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/file.h b/include/linux/file.h index cf389fde9bc2..27484b444d31 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -161,12 +161,10 @@ typedef struct fd_prepare class_fd_prepare_t; /* Do not use directly. */ static inline void class_fd_prepare_destructor(const struct fd_prepare *fdf) { - if (unlikely(fdf->err)) { - if (likely(fdf->__fd >= 0)) - put_unused_fd(fdf->__fd); - if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) - fput(fdf->__file); - } + if (unlikely(fdf->__fd >= 0)) + put_unused_fd(fdf->__fd); + if (unlikely(!IS_ERR_OR_NULL(fdf->__file))) + fput(fdf->__file); } /* Do not use directly. */ @@ -230,7 +228,8 @@ static inline int class_fd_prepare_lock_err(const struct fd_prepare *fdf) VFS_WARN_ON_ONCE(fdp->__fd < 0); \ VFS_WARN_ON_ONCE(IS_ERR_OR_NULL(fdp->__file)); \ fd_install(fdp->__fd, fdp->__file); \ - fdp->__fd; \ + retain_and_null_ptr(fdp->__file); \ + take_fd(fdp->__fd); \ }) /* Do not use directly. */ -- cgit v1.2.3